<h1>Random Forest Implementation</h1>

In this notebook we apply a Random Forest classifier to our dataset.

In [None]:
# Needed general imports
import csv, pickle, time
import pandas as pd
import numpy as np

# Sklearn libraries for Random Forest and cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from keras.preprocessing.text import Tokenizer

First, we load the DataFrames we saved previously.

In [None]:
# Read back the three preprocessed DataFrames that were pickled earlier.
# NOTE: unpickling can run arbitrary code; only load files produced by this project.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


neg_DF = _load_pickle('outputs/train_neg_proc.pkl')
pos_DF = _load_pickle('outputs/train_pos_proc.pkl')
test_DF = _load_pickle('outputs/test_data_proc.pkl')

In [None]:
# Keep only the "lemmed" column of each frame, re-wrapped as a one-column DataFrame.
neg_DF, pos_DF, test_DF = (
    pd.DataFrame(frame["lemmed"]) for frame in (neg_DF, pos_DF, test_DF)
)

In [None]:
# Preview the first rows of neg_DF to check the "lemmed" column loaded as expected.
neg_DF.head()

In [None]:
# Dictionary word -> embedding vector; it is filled by the loop in a later cell.
embeddings_index = {}

# Vocabulary: the first column of the file is used as the list of tokens.
# dtype=str and keep_default_na=False keep tokens such as "nan", "null" or "NA"
# (and purely numeric tokens) as literal strings; with the pandas defaults they
# would be converted to NaN / numbers and could never be matched against a tweet.
vocab_cut = pd.read_csv(
    "outputs/vocab_cut.txt",
    sep=" ",
    header=None,
    quoting=csv.QUOTE_NONE,
    dtype=str,
    keep_default_na=False,
)
index = pd.Series(vocab_cut[vocab_cut.columns[0]].values)

# Embedding matrix: row i is paired with token index[i] below.
emb = np.load("outputs/embeddings.npy")
assert len(index) == len(emb), "vocabulary and embedding matrix differ in length"

# Create word definition matrix
word_weights = pd.DataFrame(data=emb, index=index)

In [None]:
# Preview the first rows of the word -> embedding table (index = token, columns = dimensions).
word_weights.head()

In [None]:
# Register every vocabulary word with its embedding row, stored as a float32 array.
# Words and rows are paired by position.
embeddings_index.update(
    (word, np.asarray(vector, dtype='float32'))
    for word, vector in zip(index, emb)
)

In [None]:
# Show a handful of entries only: echoing the whole dictionary would print one
# array per vocabulary word and bloat the notebook output.
dict(list(embeddings_index.items())[:5])

In [None]:
def embedding_matrix(path_glove_twitter, word_index, nb_words, embedding_dim):
    """Build the embedding matrix for a word index from a text embedding file.

    Each non-empty line of the file is split on whitespace: the first field is
    the word and the remaining fields are parsed as its float32 vector.

    Parameters
    ----------
    path_glove_twitter : str
        Path of the UTF-8 text file holding one "word v1 v2 ..." entry per line.
    word_index : dict
        Maps each word to its integer row index.
    nb_words : int
        Largest index kept; entries of `word_index` above it are ignored.
    embedding_dim : int
        Number of columns of the returned matrix; the vectors read from the
        file must have this length.

    Returns
    -------
    numpy.ndarray
        Array of shape (nb_words + 1, embedding_dim). Row i holds the vector of
        the word whose index is i; rows without a matching word stay all-zero.
    """
    # Index the file: word -> vector. The context manager closes the file even
    # when a malformed line makes the float conversion raise.
    vectors = {}
    with open(path_glove_twitter, "r", encoding="utf-8") as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Blank line: there is no word to read, skip it instead of failing.
                continue
            vectors[fields[0]] = np.asarray(fields[1:], dtype='float32')

    # Create the embedding matrix corresponding to our dataset.
    matrix = np.zeros((nb_words + 1, embedding_dim))

    for word, i in word_index.items():
        if i > nb_words:
            continue

        vector = vectors.get(word)
        if vector is not None:
            # Words not found in the embedding file keep their all-zero row.
            matrix[i] = vector

    return matrix

In [None]:
# `train` and `test` are not defined anywhere earlier in this notebook, so this
# cell raised a NameError on a fresh kernel. They are rebuilt here from the frames
# loaded above.
# NOTE(review): assumes the tokenizer is meant to be fitted on the "lemmed" tweets
# (positives first, then negatives, the same order used to build X) — confirm.
train = pd.concat([pos_DF["lemmed"], neg_DF["lemmed"]], ignore_index=True).tolist()
test = test_DF["lemmed"].tolist()

# Upper bound on the vocabulary size handed to the tokenizer.
nb_word = 20000
tokenizer = Tokenizer(num_words=nb_word)
tokenizer.fit_on_texts(train)
sequences_train = tokenizer.texts_to_sequences(train)
sequences_test = tokenizer.texts_to_sequences(test)
# take only the index of words
word_index = tokenizer.word_index

In [None]:
# Number of entries in the tokenizer's word index.
len(word_index)

In [None]:
# Build the embedding matrix for the whole word index, using 20-dimensional vectors.
# NOTE(review): embedding_matrix parses a whitespace-separated text file, while the
# embeddings loaded earlier come from "outputs/embeddings.npy" — confirm that
# "outputs/embeddings" exists and is in text format.
m = embedding_matrix(
    path_glove_twitter="outputs/embeddings",
    word_index=word_index,
    nb_words=len(word_index),
    embedding_dim=20,
)

In [None]:
# Sums the embeddings of each word in the given tweet

# The sum is accumulated in a NumPy vector instead of growing a DataFrame with
# DataFrame.append on every word (quadratic, and removed in pandas 2.0).
def query_weights(tweet, weights=None):
    """Return the element-wise sum of the embedding vectors of the words in `tweet`.

    Parameters
    ----------
    tweet : iterable of str, or str
        Words of the tweet. A plain string is split on whitespace first so that
        it is not iterated character by character.
    weights : pandas.DataFrame, optional
        Embedding table indexed by word, one row per word. Defaults to the
        notebook-level `word_weights`.

    Returns
    -------
    pandas.Series
        One value per embedding dimension, indexed by the columns of `weights`.
        Words missing from the table are skipped; a tweet without any known
        word gives all zeros.
    """
    if weights is None:
        weights = word_weights
    if isinstance(tweet, str):
        tweet = tweet.split()

    # Width comes from the table itself rather than a hard-coded 20.
    dim = weights.shape[1]
    total = np.zeros(dim)

    for word in tweet:
        try:
            rows = weights.loc[word, :]
        except KeyError:
            # Out-of-vocabulary word: contributes nothing to the sum.
            continue
        # `rows` is a single vector, or several rows when the label is duplicated
        # in the index; reshaping lets both cases be summed the same way.
        total += np.nansum(np.asarray(rows, dtype=float).reshape(-1, dim), axis=0)

    return pd.Series(total, index=weights.columns)

In [None]:
# Build tweet embeddings: one row per tweet, one column per embedding dimension.
# Series.apply calls query_weights once per tweet. DataFrame.apply, used before,
# works column-wise and handed the whole "lemmed" column to query_weights at once,
# so no per-tweet vectors were produced.
# NOTE(review): assumes each "lemmed" entry is the sequence of words of one tweet.
neg_dims = neg_DF["lemmed"].apply(query_weights)
pos_dims = pos_DF["lemmed"].apply(query_weights)
test_dims = test_DF["lemmed"].apply(query_weights)

In [None]:
# Persist the three embedding tables as pickle files (highest protocol available).
for path, frame in (
    ('outputs/neg_dims.pkl', neg_dims),
    ('outputs/pos_dims.pkl', pos_dims),
    ('outputs/test_dims.pkl', test_dims),
):
    with open(path, 'wb') as handle:
        pickle.dump(frame, handle, pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the embedding tables back from their pickle files, so the expensive
# embedding cell does not have to be re-run.
# NOTE: unpickling can run arbitrary code; only load files produced by this project.
loaded = []
for path in ('outputs/neg_dims.pkl', 'outputs/pos_dims.pkl', 'outputs/test_dims.pkl'):
    with open(path, 'rb') as handle:
        loaded.append(pickle.load(handle))

neg_dims, pos_dims, test_dims = loaded

In [None]:
# Generate the matrices for Random Forest fitting: stack the positive and negative
# embeddings and create the matching label vector (1 for positive, -1 for negative).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
X = pd.concat([pos_dims, neg_dims])
# Each run of labels is sized from its own frame, so X and y stay aligned even
# when the two classes do not contain the same number of tweets.
y = np.concatenate([np.ones(len(pos_dims)), -np.ones(len(neg_dims))])

In [None]:
# Applies the Random Forest Classifier technique to the data
SEED = 42  # fixed seed so the fitted forest (and the submission) is reproducible
start = time.time()
clf = RandomForestClassifier(min_samples_leaf=20, random_state=SEED)
clf.fit(X, y)
end = time.time()
# NOTE: the score printed here is computed on the training data itself, so it is an
# optimistic figure; see the cross-validation cell for an out-of-sample estimate.
print("Random Forest", end - start, clf.score(X, y))
# Build the submission frame: one predicted label per test row, with 1-based Ids.
pred = pd.DataFrame(clf.predict(test_dims))
pred.columns = ["Prediction"]
pred.insert(0, "Id", pred.index + 1)

In [None]:
# Preview the first rows of the submission frame (columns "Id" and "Prediction").
pred.head()

In [None]:
# Write the submission file without the index; floats are formatted with no decimals.
submission_path = "outputs/sub_random_forest.csv"
pred.to_csv(submission_path, index=False, float_format="%.0f")

In [None]:
# Cross-validate the classifier on the training data with 10 folds;
# the cell displays the resulting array of scores.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=10)
scores