<h1>Random Forest implementation</h1>

In [1]:
import csv, nltk, pickle, re, time
import pandas as pd
import numpy as np
from io import StringIO
from collections import Counter
from scipy import sparse
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [10]:
# Word -> vector lookup table; it is filled by the following cell.
embeddings_index = {}

# Read the vocabulary as plain strings. With pandas' default NA detection,
# literal tokens such as "nan", "null" or "NA" would be parsed as float NaN,
# and numeric-looking tokens could be inferred as numbers; both would corrupt
# the word keys, so NA filtering is disabled and the dtype is forced to str.
vocab_cut = pd.read_csv(
    "outputs/vocab_cut.txt",
    sep=" ",
    header=None,
    quoting=csv.QUOTE_NONE,
    na_filter=False,
    dtype=str,
)
# First column of the vocabulary file, re-wrapped so that it carries a default
# 0..n-1 index (the following cell looks words up by row number).
index = pd.Series(vocab_cut[vocab_cut.columns[0]].values)
# Embedding matrix; the following cell pairs row i with the word index[i].
emb = np.load("outputs/embeddings.npy")
emb

array([[ 0.25816056, -0.29469185, -1.06713131, ...,  0.36700308,
        -0.14723212,  0.09652655],
       [-0.17024545, -0.05926867, -0.41807964, ...,  0.23533832,
        -0.15981563,  0.09256065],
       [-0.15801784, -0.06317051, -0.42592788, ...,  0.24630222,
        -0.2073824 ,  0.13902852],
       ..., 
       [ 0.9867201 ,  1.99722096, -0.09613431, ..., -0.19128622,
         0.28879319, -0.39757371],
       [-0.53913936,  0.43205019, -1.05278255, ..., -0.62491689,
        -1.3973401 , -0.26574516],
       [-0.14142788,  0.44970575,  0.61911085, ..., -1.57691884,
         0.362506  , -0.90781572]])

In [11]:
# Walk the embedding matrix row by row and store each row, converted to
# float32, under the vocabulary word found at the same position.
for i, values in enumerate(emb):
    word = index[i]
    embeddings_index[word] = np.asarray(values, dtype='float32')

In [12]:
# Preview only the first few entries: displaying the whole dictionary prints
# one vector per vocabulary word and floods the notebook output.
{word: embeddings_index[word] for word in list(embeddings_index)[:5]}

{'frame': array([ 0.25816056, -0.29469186, -1.06713128,  0.54329503,  1.06416595,
        -0.22400942,  0.12450971, -1.18049645,  0.18720734,  0.38095886,
         1.01977205, -1.18408668, -0.02498168,  0.42987984, -0.59958905,
        -0.65874356, -0.5080483 ,  0.36700308, -0.14723212,  0.09652656], dtype=float32),
 'like': array([-0.17024545, -0.05926867, -0.41807964,  0.06713023,  0.63575232,
        -0.05098658,  0.11602723, -0.32906908,  0.09198951,  0.07429998,
         0.2054531 , -0.19193573, -0.02532456,  0.03758022, -0.31747898,
        -0.23165226, -0.23115797,  0.23533832, -0.15981562,  0.09256066], dtype=float32),
 'get': array([-0.15801784, -0.06317051, -0.42592788,  0.06859985,  0.66242659,
        -0.07791653,  0.12106406, -0.34197983,  0.0788883 ,  0.11862495,
         0.21881622, -0.20680302, -0.03708314,  0.05392919, -0.38541383,
        -0.23465972, -0.2542069 ,  0.24630222, -0.2073824 ,  0.13902852], dtype=float32),
 'go': array([-0.19503246, -0.08347525, -0.441967

In [3]:
def embedding_matrix(path_glove_twitter, word_index, nb_words, embedding_dim):
    """Build the embedding matrix for the words of a vocabulary.

    Parameters
    ----------
    path_glove_twitter : str
        Path to a UTF-8 text file with one entry per line: a word followed by
        its whitespace-separated vector components. Blank lines are ignored.
    word_index : dict
        Maps each word to the integer row it occupies in the returned matrix.
    nb_words : int
        Largest row index kept; words whose index is greater are skipped.
    embedding_dim : int
        Number of components of every embedding vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nb_words + 1, embedding_dim)``. Rows of words that
        are absent from the embeddings file are left as zeros.
    """
    # Map every word of the embeddings file to its vector.
    embeddings_index = {}

    # `with` guarantees the file is closed even if a line fails to parse.
    with open(path_glove_twitter, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: there is no word to read, skip it.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    # One row per word index; row 0 .. nb_words inclusive.
    matrix = np.zeros((nb_words + 1, embedding_dim))

    for word, i in word_index.items():
        if i > nb_words:
            continue

        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in the embeddings file keep their all-zero row.
            matrix[i] = embedding_vector

    return matrix

In [None]:
# Fit a Keras Tokenizer on the training texts and convert both text sets to
# integer sequences.
# NOTE(review): `nb_word`, `train` and `test` are not defined in any cell
# visible in this notebook — confirm they exist before this cell runs.
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=nb_word)
tokenizer.fit_on_texts(train)
sequences_train = tokenizer.texts_to_sequences(train)
sequences_test = tokenizer.texts_to_sequences(test)
# Keep the tokenizer's word -> index mapping; it is what embedding_matrix
# receives as `word_index`.
word_index = tokenizer.word_index

In [None]:
# Number of entries in the tokenizer's word index.
len(word_index)

In [None]:
# Build the embedding matrix for every word of the tokenizer, with vectors of
# 20 components.
# NOTE(review): embedding_matrix parses its input as whitespace-separated
# text, while the only embeddings file loaded elsewhere in this notebook is
# the binary "outputs/embeddings.npy" — confirm "outputs/embeddings" is a text
# export. Presumably 20 is the embedding dimension of that file; verify.
m = embedding_matrix("outputs/embeddings", word_index, len(word_index), 20)

In [None]:
# Sums the embeddings of each word in the given tweet
def query_weights(tweet):
    """Return the element-wise sum of the embeddings of the known words of a tweet.

    Parameters
    ----------
    tweet : iterable
        The words of one tweet. Words absent from ``word_weights`` are
        ignored; a word that occurs several times is counted each time.

    Returns
    -------
    pandas.Series
        One value per column of ``word_weights``; all zeros when the tweet
        contains no known word.

    Notes
    -----
    ``word_weights`` is a module-level table that is not defined in this
    cell. It is assumed to be a DataFrame indexed by word with one column per
    embedding dimension (columns 0..19 in the original code) — TODO confirm.
    """
    # Select every known word in one indexing operation instead of growing a
    # DataFrame row by row: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    known = [word for word in tweet if word in word_weights.index]

    if not known:
        # No known word: return zeros over the same columns as the normal case.
        return pd.Series(0.0, index=word_weights.columns)

    return word_weights.loc[known, :].sum(axis=0)

In [None]:
# Build tweet embeddings: apply query_weights to a copy of each input so the
# negative, positive and test sets each get their summed word vectors.
# NOTE(review): neg_DF, pos_DF and test_DF are not defined in any cell visible
# in this notebook — confirm where they are loaded.
neg_dims = neg_DF.copy().apply(query_weights)
pos_dims = pos_DF.copy().apply(query_weights)
test_dims = test_DF.copy().apply(query_weights)

In [None]:
# Persist the three tweet-embedding tables, one pickle file per table
targets = (
    ('outputs/neg_dims.pkl', neg_dims),
    ('outputs/pos_dims.pkl', pos_dims),
    ('outputs/test_dims.pkl', test_dims),
)

for path, table in targets:
    with open(path, 'wb') as f:
        pickle.dump(table, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the three tweet-embedding tables back from their pickle files
tables = []

for path in ('outputs/neg_dims.pkl', 'outputs/pos_dims.pkl', 'outputs/test_dims.pkl'):
    with open(path, 'rb') as f:
        tables.append(pickle.load(f))

neg_dims, pos_dims, test_dims = tables

In [None]:
# Generate the matrices for fitting the classifier: stack the positive and
# negative embeddings into X and build the matching label vector y, with 1
# for every positive row and -1 for every negative row.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
X = pd.concat([pos_dims, neg_dims])
# Each run of labels is sized from its own table so that X and y stay aligned
# even when the two classes do not contain the same number of tweets.
y = np.concatenate([np.ones(len(pos_dims)), -np.ones(len(neg_dims))])

In [None]:
# Applies the Random Forest Classifier technique to the data: fit on (X, y),
# report the fitting time and the score, then predict the test tweets.
start = time.time()
# A fixed random_state makes the fitted forest, and therefore the submission
# file, reproducible from one run to the next.
clf = RandomForestClassifier(min_samples_leaf=20, random_state=0)
clf.fit(X, y)
end = time.time()
# NOTE: the score printed here is computed on the training rows themselves,
# so it is not a held-out estimate.
print("Random Forest", end - start, clf.score(X, y))
# Submission table: a 1-based "Id" column followed by the predicted label.
pred = pd.DataFrame(clf.predict(test_dims))
pred.columns = ["Prediction"]
pred.insert(0, "Id", pred.index + 1)

In [None]:
# Quick look at the first rows of the submission table.
pred.head()

In [None]:
# Save the submission as CSV without the DataFrame index; float_format="%.0f"
# writes floating-point values with no decimals.
pred.to_csv("outputs/sub_random_forest.csv", index=False, float_format="%.0f")

In [None]:
# Apply cross validation to the data: evaluate the classifier on (X, y) with
# cv=10 and display the resulting scores.
scores = cross_val_score(clf, X, y, cv=10)
scores