<h1>Random Forest implementation</h1>

In [81]:
import csv, nltk, pickle, re, time
import pandas as pd
import numpy as np
from io import StringIO
from collections import Counter
from scipy import sparse
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [82]:
with open('outputs/train_neg_proc.pkl', 'rb') as f:
    negative_DF = pickle.load(f)
    
with open('outputs/train_pos_proc.pkl', 'rb') as f:
    positive_DF = pickle.load(f)
    
with open('outputs/test_data_proc.pkl', 'rb') as f:
    testing_DF = pickle.load(f)

In [83]:
neg_DF = pd.DataFrame(negative_DF["lemmed"]).loc[:10, :]
pos_DF = pd.DataFrame(positive_DF["lemmed"]).loc[:10, :]
test_DF = pd.DataFrame(testing_DF["lemmed"])

In [84]:
neg_DF

Unnamed: 0,lemmed
0,"[vinco, tresorpack, difficulty, object, disass..."
1,"[glad, dot, taks, tomorrow, thankful, startho]"
2,"[v, celtic, regular, season, fucked, play, pla..."
3,"[could, actually, kill, girl, sorry]"
4,"[find, hard, believe, im, afraid]"
5,"[wish, could, night, tonight]"
6,"[got, kicked, wgm]"
7,"[yes, tell, lip, closed, okay]"
8,[perfect]
9,"[hi, harry, havea, good, time, au, didnt, get,..."


In [85]:
# Get embeddings and index values
emb = np.load("outputs/embeddings.npy")

vocab_cut = pd.read_csv("outputs/vocab_cut.txt", sep=" ", header=None, quoting=csv.QUOTE_NONE)
index = pd.Series(vocab_cut[vocab_cut.columns[0]].values)

# Create word definition matrix
word_weights = pd.DataFrame(data=emb, index=index)

word_weights.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
love,0.168707,0.212308,-0.088777,-0.088455,-0.053883,0.075036,-0.194219,0.158981,-0.24313,0.109408,-0.073149,0.090289,-0.101964,-0.170084,0.029999,0.217705,0.058087,-0.019592,0.042189,0.055808
like,0.14928,0.188781,-0.070958,-0.073076,-0.053277,0.057487,-0.171681,0.162443,-0.200086,0.093445,-0.045464,0.069396,-0.109191,-0.156937,0.03709,0.195186,0.031456,-0.021633,0.05448,0.071211
get,0.1857,0.238282,-0.09292,-0.098125,-0.076092,0.078408,-0.223484,0.1864,-0.255859,0.126814,-0.052907,0.085568,-0.120379,-0.190343,0.068558,0.243637,0.026339,-0.043308,0.050237,0.065511
frame,0.698522,0.719982,-0.262654,-0.95556,-0.24518,0.179842,-1.154688,0.967019,-0.679356,0.46796,-0.322454,0.353138,-0.152338,-1.109677,-0.000611,1.356425,0.252466,-0.235992,-0.279294,0.599314
one,0.131426,0.182103,-0.081761,-0.069416,-0.053566,0.078654,-0.174109,0.147928,-0.204716,0.107894,-0.027592,0.087196,-0.108628,-0.157488,0.048134,0.216494,0.04179,-0.027639,0.038123,0.060047


In [86]:
# Sums the embeddings of each word in the given tweet

# THIS METHOD IS NOT OPTIMAL AND WOULD BENEFIT BEING MADE FASTER UNLESS WE DO NOT USE IT IN THE END
def query_weights(tweet):
    w = pd.DataFrame(columns=range(20))
    
    for word in tweet:
        try:
            w = w.append(word_weights.loc[word, :])
        except KeyError:
            pass
        
    return w.sum(axis=0)

In [None]:
# Build tweet embeddings
neg_dims = neg_DF.copy().apply(query_weights)
pos_dims = pos_DF.copy().apply(query_weights)
test_dims = test_DF.copy().apply(query_weights)

In [70]:
neg_dims

Unnamed: 0,lemmed
0,2784.054252
1,3185.425062
2,-1214.95644
3,-1363.335188
4,-1109.582571
5,1021.759194
6,-3003.416098
7,2895.857974
8,-3458.357997
9,1593.67973


In [71]:
# Save the embeddings in pkl files
with open('outputs/neg_dims.pkl', 'wb') as f:
    pickle.dump(neg_dims, f, pickle.HIGHEST_PROTOCOL)
    
with open('outputs/pos_dims.pkl', 'wb') as f:
    pickle.dump(pos_dims, f, pickle.HIGHEST_PROTOCOL)
                
with open('outputs/test_dims.pkl', 'wb') as f:
    pickle.dump(test_dims, f, pickle.HIGHEST_PROTOCOL)            

In [72]:
# Load the embeddings from pkl files
with open('outputs/neg_dims.pkl', 'rb') as f:
    neg_dims = pickle.load(f)
    
with open('outputs/pos_dims.pkl', 'rb') as f:
    pos_dims = pickle.load(f)
    
with open('outputs/test_dims.pkl', 'rb') as f:
    test_dims = pickle.load(f)

In [73]:
# Generate the matrices for Random Forest fitting, we just put the positive and negative embeddings together and
# create the appropriate y matrix with 1's and -1's
X = pos_dims.append(neg_dims)
ones = np.ones((pos_dims.shape[0], 1))
y = np.append(ones, -1 * ones)

In [75]:
# Applies the Random Forest Classifier technique to the data
start = time.time()
clf = RandomForestClassifier(min_samples_leaf=20)
clf.fit(X, y)
end = time.time()
print("Random Forest", end - start, clf.score(X, y))
pred = pd.DataFrame(clf.predict(test_dims))
pred

Random Forest 0.02425551414489746 0.5


Unnamed: 0,0
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0
8,1.0
9,1.0


In [76]:
pred.columns = ["Prediction"]
pred.insert(0, "Id", pred.index + 1)

In [77]:
pred

Unnamed: 0,Id,Prediction
0,1,1.0
1,2,1.0
2,3,1.0
3,4,1.0
4,5,1.0
5,6,1.0
6,7,1.0
7,8,1.0
8,9,1.0
9,10,1.0


In [78]:
# We save the submission
pred.to_csv("outputs/sub_random_forest.csv", index=False, float_format="%.0f")

In [79]:
# Apply cross validation to the data
scores = cross_val_score(clf, X, y, cv=10)
scores

array([ 0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5])