In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import sys
sys.path.append('..')
from helpers import *

In [2]:
vocab_dim = 20 
maxlen = 25  # Maximum length of text retention
   
embedding_weights = np.load("embeddings.npy") 
# Set a zero vector for words that do not appear in the vocabulary
embedding_weights = np.r_[np.zeros((1, vocab_dim)),embedding_weights]

f = open("vocab.pkl", 'rb') 
index_dict = pickle.load(f)    # index dictionary {'word': idx}

# Index each word + 1 because of the zero vector
for key, value in index_dict.items():  
    index_dict[key] = value + 1 

with open("../twitter-datasets/train_neg.txt", "r", encoding='UTF-8') as f:
    neg_data = f.readlines()
with open("../twitter-datasets/train_pos.txt", "r", encoding='UTF-8') as f:
    pos_data = f.readlines()
    
data = neg_data + pos_data

label_list = ([0] * len(neg_data) + [1] * len(pos_data))


data = text_to_index_array(index_dict, data)
labels = np.array(label_list) 
data = creat_wordvec_mean_tensor(embedding_weights,data)

train_x,val_x,train_y,val_y = train_test_split(data, labels, test_size=0.2)

In [3]:
from sklearn.linear_model import LogisticRegression as LR


grid_values = {'C': [1e-2,1e-1,1,2]}

clf = GridSearchCV(LR(penalty='l2', random_state=0, max_iter=3000,),
                           grid_values, scoring='roc_auc', cv=20,n_jobs=4)

clf.fit(train_x, train_y)
print("using LR, Best: %f using %s" %
      (clf.best_score_, clf.best_params_))

clf = clf.best_estimator_

pred_y = clf.predict(val_x)

print('--- report ---')
print(classification_report(val_y, pred_y))

print('--- auc ---')
print(roc_auc_score(val_y, pred_y))

using LR, Best: 0.623648 using {'C': 2}
--- report ---
              precision    recall  f1-score   support

           0       0.59      0.53      0.56     20016
           1       0.58      0.64      0.60     19984

    accuracy                           0.58     40000
   macro avg       0.58      0.58      0.58     40000
weighted avg       0.58      0.58      0.58     40000

--- auc ---
0.5837163935784919


In [4]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=800,max_depth=20)
clf.fit(train_x, train_y)
pred_y = clf.predict(val_x)

print('--- report ---')
print(classification_report(val_y, pred_y))

print('--- auc ---')
print(roc_auc_score(val_y, pred_y))

--- report ---
              precision    recall  f1-score   support

           0       0.71      0.49      0.58     20016
           1       0.61      0.80      0.69     19984

    accuracy                           0.65     40000
   macro avg       0.66      0.65      0.64     40000
weighted avg       0.66      0.65      0.64     40000

--- auc ---
0.6456221331981653
