# Random Forest

In this notebook we will present the Random Forest Classifier model and compare how the different vectorization methods perform with it.

First of all, we load our preprocessed dataset and do all the different vectorizations.


In [1]:
#¬†Data handling
import numpy as np
import pandas as pd

#¬†Text processing
import re
import string
import emoji
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from gensim import models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import multiprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv("../DATASETS/preprocessed_text.csv")

In [3]:
df.isnull().sum()
df.fillna('', inplace=True)

In [4]:
# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit the model and transform the data
bow = vectorizer.fit_transform(df['Content_cleaned'])

print(len(vectorizer.vocabulary_))
print(bow.shape)

39783
(113292, 39783)


In [5]:
# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit the model and transform the data
tfidf = vectorizer.fit_transform(df['Content_cleaned'])

print(len(vectorizer.vocabulary_))
print(tfidf.shape)

39783
(113292, 39783)


In [6]:
w2v = models.KeyedVectors.load_word2vec_format(
'../../GoogleNews-vectors-negative300.bin', binary=True)

def get_average_word2vec(tokens_list, model, vector_size):
    """
    This function computes the average Word2Vec for a given list of tokens.
    """
    # Filter the tokens that are present in the Word2Vec model
    valid_tokens = [token for token in tokens_list if token in model]
    if not valid_tokens:
        return np.zeros(vector_size)
    
    # Compute the average Word2Vec
    word_vectors = [model[token] for token in valid_tokens]
    average_vector = np.mean(word_vectors, axis=0)
    return average_vector

# Tokenize the text data
df['tokens'] = df['Content_cleaned'].apply(lambda x: x.split())

# Compute the average Word2Vec for each row
vector_size = w2v.vector_size
df['word2vec_pretrained'] = df['tokens'].apply(lambda x: get_average_word2vec(x, w2v, vector_size))

df.head()

Unnamed: 0,Content,Score,Sentiment,Content_cleaned,tokens,word2vec_pretrained
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp giving screen limit like when you a...,"[plss, stopp, giving, screen, limit, like, whe...","[0.060924955, 0.036983438, 0.052580304, 0.1163..."
1,Good,5,positive,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328..."
2,üëçüëç,5,positive,thumbs_up thumbs_up,"[thumbs_up, thumbs_up]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Good,3,neutral,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328..."
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app is useful to certain phone brand it is not...,"[app, is, useful, to, certain, phone, brand, i...","[0.016780308, -0.041579314, 0.043486457, 0.058..."


In [7]:
def get_average_word2vec2(tokens_list, model, vector_size):
    valid_tokens = [token for token in tokens_list if token in model.wv]
    if not valid_tokens:
        return np.zeros(vector_size)
    word_vectors = [model.wv[token] for token in valid_tokens]
    average_vector = np.mean(word_vectors, axis=0)
    return average_vector

# Define model parameters
vector_size = 300   # Dimensionality of the word vectors
window_size = 5     # Context window size
min_count = 1       # Minimum word frequency
workers = multiprocessing.cpu_count()  # Number of worker threads to use

# Train the Word2Vec model
cbow = models.Word2Vec(df['tokens'].tolist(), vector_size=vector_size, sg=0, window=window_size, min_count=min_count, workers=workers)

df['word2vec_cbow'] = df['tokens'].apply(lambda x: get_average_word2vec2(x, cbow, vector_size))

df.head()

Unnamed: 0,Content,Score,Sentiment,Content_cleaned,tokens,word2vec_pretrained,word2vec_cbow
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp giving screen limit like when you a...,"[plss, stopp, giving, screen, limit, like, whe...","[0.060924955, 0.036983438, 0.052580304, 0.1163...","[-0.025864253, 0.19689733, -0.101712145, 0.533..."
1,Good,5,positive,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[-0.30692956, 0.114956446, 0.44454482, 1.96634..."
2,üëçüëç,5,positive,thumbs_up thumbs_up,"[thumbs_up, thumbs_up]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.4326444, 0.9519766, -0.27984688, 0.0497392..."
3,Good,3,neutral,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[-0.30692956, 0.114956446, 0.44454482, 1.96634..."
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app is useful to certain phone brand it is not...,"[app, is, useful, to, certain, phone, brand, i...","[0.016780308, -0.041579314, 0.043486457, 0.058...","[-0.16589907, -0.52168304, -0.14547862, 0.4110..."


In [8]:
# Path to the GloVe embeddings file
glove_file = '../../glove.6B.100d.txt'

# Load the GloVe embeddings into a dictionary
def load_glove_embeddings(glove_file):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load the GloVe embeddings
glove_6b = load_glove_embeddings(glove_file)
print(f"Loaded {len(glove_6b)} word vectors from GloVe.")

# Define a function to get the average GloVe vector for a list of tokens
def get_average_glove(tokens_list, embeddings, embedding_dim):
    valid_tokens = [token for token in tokens_list if token in embeddings]
    if not valid_tokens:
        return np.zeros(embedding_dim)
    word_vectors = [embeddings[token] for token in valid_tokens]
    average_vector = np.mean(word_vectors, axis=0)
    return average_vector

# Define the embedding dimension (e.g., 100 for 'glove.6B.100d.txt')
embedding_dim = 100

# Compute the average GloVe vector for each row
df['glove_6B'] = df['tokens'].apply(lambda x: get_average_glove(x, glove_6b, embedding_dim))

df.head()

Loaded 400000 word vectors from GloVe.


Unnamed: 0,Content,Score,Sentiment,Content_cleaned,tokens,word2vec_pretrained,word2vec_cbow,glove_6B
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp giving screen limit like when you a...,"[plss, stopp, giving, screen, limit, like, whe...","[0.060924955, 0.036983438, 0.052580304, 0.1163...","[-0.025864253, 0.19689733, -0.101712145, 0.533...","[-0.101591855, 0.21243754, 0.45259842, -0.2616..."
1,Good,5,positive,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[-0.30692956, 0.114956446, 0.44454482, 1.96634...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739..."
2,üëçüëç,5,positive,thumbs_up thumbs_up,"[thumbs_up, thumbs_up]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.4326444, 0.9519766, -0.27984688, 0.0497392...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Good,3,neutral,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[-0.30692956, 0.114956446, 0.44454482, 1.96634...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739..."
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app is useful to certain phone brand it is not...,"[app, is, useful, to, certain, phone, brand, i...","[0.016780308, -0.041579314, 0.043486457, 0.058...","[-0.16589907, -0.52168304, -0.14547862, 0.4110...","[-0.19991928, 0.11995281, 0.36286283, -0.22692..."


In [9]:
# Path to the GloVe embeddings file
glove_file = '../../glove.twitter.27B.100d.txt'

# Load the GloVe embeddings into a dictionary
def load_glove_embeddings(glove_file):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load the GloVe embeddings
glove_twitter = load_glove_embeddings(glove_file)
print(f"Loaded {len(glove_twitter)} word vectors from GloVe.")

# Define a function to get the average GloVe vector for a list of tokens
def get_average_glove(tokens_list, embeddings, embedding_dim):
    valid_tokens = [token for token in tokens_list if token in embeddings]
    if not valid_tokens:
        return np.zeros(embedding_dim)
    word_vectors = [embeddings[token] for token in valid_tokens]
    average_vector = np.mean(word_vectors, axis=0)
    return average_vector

# Define the embedding dimension (e.g., 100 for 'glove.6B.100d.txt')
embedding_dim = 100

# Compute the average GloVe vector for each row
df['glove_twitter'] = df['tokens'].apply(lambda x: get_average_glove(x, glove_twitter, embedding_dim))

df.head()

Loaded 1193514 word vectors from GloVe.


Unnamed: 0,Content,Score,Sentiment,Content_cleaned,tokens,word2vec_pretrained,word2vec_cbow,glove_6B,glove_twitter
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp giving screen limit like when you a...,"[plss, stopp, giving, screen, limit, like, whe...","[0.060924955, 0.036983438, 0.052580304, 0.1163...","[-0.025864253, 0.19689733, -0.101712145, 0.533...","[-0.101591855, 0.21243754, 0.45259842, -0.2616...","[0.058894146, 0.18850434, 0.08296321, 0.174713..."
1,Good,5,positive,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[-0.30692956, 0.114956446, 0.44454482, 1.96634...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739...","[0.091552, 0.093336, -0.028113, 0.3699, 0.1895..."
2,üëçüëç,5,positive,thumbs_up thumbs_up,"[thumbs_up, thumbs_up]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.4326444, 0.9519766, -0.27984688, 0.0497392...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Good,3,neutral,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[-0.30692956, 0.114956446, 0.44454482, 1.96634...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739...","[0.091552, 0.093336, -0.028113, 0.3699, 0.1895..."
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app is useful to certain phone brand it is not...,"[app, is, useful, to, certain, phone, brand, i...","[0.016780308, -0.041579314, 0.043486457, 0.058...","[-0.16589907, -0.52168304, -0.14547862, 0.4110...","[-0.19991928, 0.11995281, 0.36286283, -0.22692...","[0.23760791, 0.07707109, 0.06094666, 0.2031615..."


### Preparing the labels 

In [10]:
y_df = df['Sentiment']

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_df)

### Performing the train-test splits for the different vectors

In [11]:
bow_train, bow_test, y_train, y_test = train_test_split(bow, y, test_size=0.2, random_state=42)

In [12]:
tfidf_train, tfidf_test, y_train, y_test = train_test_split(tfidf, y, test_size=0.2, random_state=42)

In [13]:
w2v_pre = np.vstack(df['word2vec_pretrained'].values)

w2v_pre_train, w2v_pre_test, y_train, y_test = train_test_split(w2v_pre, y, test_size=0.2, random_state=42)

In [14]:
w2v_cbow = np.vstack(df['word2vec_cbow'].values)

w2v_cbow_train, w2v_cbow_test, y_train, y_test = train_test_split(w2v_cbow, y, test_size=0.2, random_state=42)

In [15]:
glove_6b = np.vstack(df['glove_6B'].values)

glove_6b_train, glove_6b_test, y_train, y_test = train_test_split(glove_6b, y, test_size=0.2, random_state=42)

In [16]:
glove_twitter = np.vstack(df['glove_twitter'].values)

glove_twitter_train, glove_twitter_test, y_train, y_test = train_test_split(glove_twitter, y, test_size=0.2, random_state=42)

## Models

In [17]:
from sklearn.ensemble import RandomForestClassifier

rf_bow = RandomForestClassifier()

rf_bow.fit(bow_train, y_train)

# Predictions
y_pred = rf_bow.predict(bow_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using BoW and RF: {accuracy}")

results = {}
results['rf_bow'] = accuracy

Accuracy using BoW and RF: 0.784765435367845


In [18]:
rf_tfidf = RandomForestClassifier()
rf_tfidf.fit(tfidf_train, y_train)

# Predictions
y_pred = rf_tfidf.predict(tfidf_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using TFIDF and RF: {accuracy}")

results['rf_tfidf'] = accuracy

Accuracy using TFIDF and RF: 0.7826912043779514


In [19]:
rf_w2v_pre = RandomForestClassifier()
rf_w2v_pre.fit(w2v_pre_train, y_train)

# Predictions
y_pred = rf_w2v_pre.predict(w2v_pre_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using W2V_pretrained and RF: {accuracy}")

results['rf_w2v_pre'] = accuracy

Accuracy using W2V_pretrained and RF: 0.7559468643805993


In [20]:
rf_w2v_cbow = RandomForestClassifier()
rf_w2v_cbow.fit(w2v_cbow_train, y_train)

# Predictions
y_pred = rf_w2v_cbow.predict(w2v_cbow_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using W2V_cbow and RF: {accuracy}")

results['rf_w2v_cbow'] = accuracy

Accuracy using W2V_cbow and RF: 0.7735557615075688


In [21]:
rf_glove_6b = RandomForestClassifier()
rf_glove_6b.fit(glove_6b_train, y_train)

# Predictions
y_pred = rf_glove_6b.predict(glove_6b_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using glove_6b and RF: {accuracy}")

results['rf_glove_6b'] = accuracy

Accuracy using glove_6b and RF: 0.7409859217088133


In [22]:
rf_glove_twitter = RandomForestClassifier()
rf_glove_twitter.fit(glove_twitter_train, y_train)

# Predictions
y_pred = rf_glove_twitter.predict(glove_twitter_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using glove_twitter and RF: {accuracy}")

results['rf_glove_twitter'] = accuracy

Accuracy using glove_twitter and RF: 0.7554614060638157


## Results

In [23]:
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])

# Print the results table
print(results_df)

              Model  Accuracy
0            rf_bow  0.784765
1          rf_tfidf  0.782691
2        rf_w2v_pre  0.755947
3       rf_w2v_cbow  0.773556
4       rf_glove_6b  0.740986
5  rf_glove_twitter  0.755461
