# Sentiment Analysis models

In this notebook we will present all the models used for our problem and we will compare their performance.

First of all, we load our preprocessed dataset and do all the different vectorizations.


In [1]:
# Data handling
import numpy as np
import pandas as pd

# Text processing
import re
import string
import emoji
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from gensim import models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import multiprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv("preprocessed_text.csv")

In [3]:
df.isnull().sum()
df.fillna('', inplace=True)

In [4]:
# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit the model and transform the data
bow = vectorizer.fit_transform(df['Content_cleaned'])

print(len(vectorizer.vocabulary_))
print(bow.shape)

31451
(113292, 31451)


In [5]:
# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit the model and transform the data
tfidf = vectorizer.fit_transform(df['Content_cleaned'])

print(len(vectorizer.vocabulary_))
print(tfidf.shape)

31451
(113292, 31451)


In [6]:
w2v = models.KeyedVectors.load_word2vec_format(
'../GoogleNews-vectors-negative300.bin', binary=True)

In [7]:
def get_average_word2vec(tokens_list, model, vector_size):
    """
    This function computes the average Word2Vec for a given list of tokens.
    """
    # Filter the tokens that are present in the Word2Vec model
    valid_tokens = [token for token in tokens_list if token in model]
    if not valid_tokens:
        return np.zeros(vector_size)
    
    # Compute the average Word2Vec
    word_vectors = [model[token] for token in valid_tokens]
    average_vector = np.mean(word_vectors, axis=0)
    return average_vector

# Tokenize the text data
df['tokens'] = df['Content_cleaned'].apply(lambda x: x.split())

# Compute the average Word2Vec for each row
vector_size = w2v.vector_size
df['word2vec_pretrained'] = df['tokens'].apply(lambda x: get_average_word2vec(x, w2v, vector_size))

df.head()

Unnamed: 0,Content,Score,Sentiment,Content_cleaned,tokens,word2vec_pretrained
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp give screen limit like ur watch thi...,"[plss, stopp, give, screen, limit, like, ur, w...","[0.08365452, 0.0579847, 0.11433671, -0.0025425..."
1,Good,5,positive,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328..."
2,👍👍,5,positive,thumb up thumb up,"[thumb, up, thumb, up]","[0.08703613, 0.07147217, -0.00390625, 0.005859..."
3,Good,3,neutral,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328..."
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...,"[app, useful, certain, phone, brand, except, p...","[0.0644662, -0.0806833, -0.0020926339, 0.02535..."


In [8]:
def get_average_word2vec2(tokens_list, model, vector_size):
    valid_tokens = [token for token in tokens_list if token in model.wv]
    if not valid_tokens:
        return np.zeros(vector_size)
    word_vectors = [model.wv[token] for token in valid_tokens]
    average_vector = np.mean(word_vectors, axis=0)
    return average_vector

# Define model parameters
vector_size = 300   # Dimensionality of the word vectors
window_size = 5     # Context window size
min_count = 1       # Minimum word frequency
workers = multiprocessing.cpu_count()  # Number of worker threads to use

# Train the Word2Vec model
cbow = models.Word2Vec(df['tokens'].tolist(), vector_size=vector_size, sg=0, window=window_size, min_count=min_count, workers=workers)

df['word2vec_cbow'] = df['tokens'].apply(lambda x: get_average_word2vec2(x, cbow, vector_size))

df.head()

Unnamed: 0,Content,Score,Sentiment,Content_cleaned,tokens,word2vec_pretrained,word2vec_cbow
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp give screen limit like ur watch thi...,"[plss, stopp, give, screen, limit, like, ur, w...","[0.08365452, 0.0579847, 0.11433671, -0.0025425...","[-0.010158361, -0.2950956, 0.02295302, 0.05775..."
1,Good,5,positive,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.33362323, -1.5361803, -0.45669645, -0.59262..."
2,👍👍,5,positive,thumb up thumb up,"[thumb, up, thumb, up]","[0.08703613, 0.07147217, -0.00390625, 0.005859...","[-0.49321496, 0.1811941, -0.27961582, 1.002281..."
3,Good,3,neutral,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.33362323, -1.5361803, -0.45669645, -0.59262..."
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...,"[app, useful, certain, phone, brand, except, p...","[0.0644662, -0.0806833, -0.0020926339, 0.02535...","[-0.13318165, -0.09479263, -0.0055906163, 0.20..."


In [9]:
# Path to the GloVe embeddings file
glove_file = '../glove.6B.100d.txt'

# Load the GloVe embeddings into a dictionary
def load_glove_embeddings(glove_file):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load the GloVe embeddings
glove_6b = load_glove_embeddings(glove_file)
print(f"Loaded {len(glove_6b)} word vectors from GloVe.")

# Define a function to get the average GloVe vector for a list of tokens
def get_average_glove(tokens_list, embeddings, embedding_dim):
    valid_tokens = [token for token in tokens_list if token in embeddings]
    if not valid_tokens:
        return np.zeros(embedding_dim)
    word_vectors = [embeddings[token] for token in valid_tokens]
    average_vector = np.mean(word_vectors, axis=0)
    return average_vector

# Define the embedding dimension (e.g., 100 for 'glove.6B.100d.txt')
embedding_dim = 100

# Compute the average GloVe vector for each row
df['glove_6B'] = df['tokens'].apply(lambda x: get_average_glove(x, glove_6b, embedding_dim))

df.head()

Loaded 400000 word vectors from GloVe.


Unnamed: 0,Content,Score,Sentiment,Content_cleaned,tokens,word2vec_pretrained,word2vec_cbow,glove_6B
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp give screen limit like ur watch thi...,"[plss, stopp, give, screen, limit, like, ur, w...","[0.08365452, 0.0579847, 0.11433671, -0.0025425...","[-0.010158361, -0.2950956, 0.02295302, 0.05775...","[-0.1198448, 0.12636456, 0.41294017, -0.217294..."
1,Good,5,positive,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.33362323, -1.5361803, -0.45669645, -0.59262...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739..."
2,👍👍,5,positive,thumb up thumb up,"[thumb, up, thumb, up]","[0.08703613, 0.07147217, -0.00390625, 0.005859...","[-0.49321496, 0.1811941, -0.27961582, 1.002281...","[-0.22568002, 0.342005, 0.248815, -0.577975, -..."
3,Good,3,neutral,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.33362323, -1.5361803, -0.45669645, -0.59262...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739..."
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...,"[app, useful, certain, phone, brand, except, p...","[0.0644662, -0.0806833, -0.0020926339, 0.02535...","[-0.13318165, -0.09479263, -0.0055906163, 0.20...","[-0.21259494, -0.062381856, 0.21229614, 0.0178..."


In [10]:
# Path to the GloVe embeddings file
glove_file = '../glove.twitter.27B.100d.txt'

# Load the GloVe embeddings into a dictionary
def load_glove_embeddings(glove_file):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load the GloVe embeddings
glove_twitter = load_glove_embeddings(glove_file)
print(f"Loaded {len(glove_twitter)} word vectors from GloVe.")

# Define a function to get the average GloVe vector for a list of tokens
def get_average_glove(tokens_list, embeddings, embedding_dim):
    valid_tokens = [token for token in tokens_list if token in embeddings]
    if not valid_tokens:
        return np.zeros(embedding_dim)
    word_vectors = [embeddings[token] for token in valid_tokens]
    average_vector = np.mean(word_vectors, axis=0)
    return average_vector

# Define the embedding dimension (e.g., 100 for 'glove.6B.100d.txt')
embedding_dim = 100

# Compute the average GloVe vector for each row
df['glove_twitter'] = df['tokens'].apply(lambda x: get_average_glove(x, glove_twitter, embedding_dim))

df.head()

Loaded 1193514 word vectors from GloVe.


Unnamed: 0,Content,Score,Sentiment,Content_cleaned,tokens,word2vec_pretrained,word2vec_cbow,glove_6B,glove_twitter
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp give screen limit like ur watch thi...,"[plss, stopp, give, screen, limit, like, ur, w...","[0.08365452, 0.0579847, 0.11433671, -0.0025425...","[-0.010158361, -0.2950956, 0.02295302, 0.05775...","[-0.1198448, 0.12636456, 0.41294017, -0.217294...","[0.123258926, 0.09078163, -0.101420276, 0.2712..."
1,Good,5,positive,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.33362323, -1.5361803, -0.45669645, -0.59262...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739...","[0.091552, 0.093336, -0.028113, 0.3699, 0.1895..."
2,👍👍,5,positive,thumb up thumb up,"[thumb, up, thumb, up]","[0.08703613, 0.07147217, -0.00390625, 0.005859...","[-0.49321496, 0.1811941, -0.27961582, 1.002281...","[-0.22568002, 0.342005, 0.248815, -0.577975, -...","[0.26894343, -0.28983998, 0.164455, -0.166473,..."
3,Good,3,neutral,good,[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.33362323, -1.5361803, -0.45669645, -0.59262...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739...","[0.091552, 0.093336, -0.028113, 0.3699, 0.1895..."
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...,"[app, useful, certain, phone, brand, except, p...","[0.0644662, -0.0806833, -0.0020926339, 0.02535...","[-0.13318165, -0.09479263, -0.0055906163, 0.20...","[-0.21259494, -0.062381856, 0.21229614, 0.0178...","[0.30852813, 0.06642222, -0.07303124, 0.210921..."



### Preparing the labels 

In [11]:
y_df = df['Sentiment']

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_df)

### Performing the train-test splits for the different vectors

In [12]:
bow_train, bow_test, y_train, y_test = train_test_split(bow, y, test_size=0.2, random_state=42)

In [13]:
tfidf_train, tfidf_test, y_train, y_test = train_test_split(tfidf, y, test_size=0.2, random_state=42)

In [14]:
w2v_pre = np.vstack(df['word2vec_pretrained'].values)

w2v_pre_train, w2v_pre_test, y_train, y_test = train_test_split(w2v_pre, y, test_size=0.2, random_state=42)

In [15]:
w2v_cbow = np.vstack(df['word2vec_cbow'].values)

w2v_cbow_train, w2v_cbow_test, y_train, y_test = train_test_split(w2v_cbow, y, test_size=0.2, random_state=42)

In [16]:
glove_6b = np.vstack(df['glove_6B'].values)

glove_6b_train, glove_6b_test, y_train, y_test = train_test_split(glove_6b, y, test_size=0.2, random_state=42)

In [17]:
glove_twitter = np.vstack(df['glove_twitter'].values)

glove_twitter_train, glove_twitter_test, y_train, y_test = train_test_split(glove_twitter, y, test_size=0.2, random_state=42)

## Models

### Logistic Regression

In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)  # with_mean=False because BoW has sparse matrix format
bow_train_scaled = scaler.fit_transform(bow_train)
bow_test_scaled = scaler.transform(bow_test)

# Logistic Regression model
lr_bow = LogisticRegression()
lr_bow.fit(bow_train_scaled, y_train)

# Predictions
y_pred = lr_bow.predict(bow_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using BoW and Logistic Regression: {accuracy}")

results = {}
results['lr_bow'] = accuracy

Accuracy using BoW and Logistic Regression: 0.735248687055916


In [19]:
scaler = StandardScaler(with_mean=False)  # with_mean=False because BoW has sparse matrix format
tfidf_train_scaled = scaler.fit_transform(tfidf_train)
tfidf_test_scaled = scaler.transform(tfidf_test)

# Logistic Regression model
lr_tfidf = LogisticRegression()
lr_tfidf.fit(tfidf_train_scaled, y_train)

# Predictions
y_pred = lr_tfidf.predict(tfidf_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using TFIDF and Logistic Regression: {accuracy}")

results['lr_tfidf'] = accuracy

Accuracy using TFIDF and Logistic Regression: 0.733659914382806


In [20]:
# Logistic Regression model
lr_w2v_pre = LogisticRegression()
lr_w2v_pre.fit(w2v_pre_train, y_train)

# Predictions
y_pred = lr_w2v_pre.predict(w2v_pre_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using W2V_pretrained and Logistic Regression: {accuracy}")

results['lr_w2v_pre'] = accuracy

Accuracy using W2V_pretrained and Logistic Regression: 0.7758065227944746


In [21]:
scaler = StandardScaler(with_mean=False)  # with_mean=False because BoW has sparse matrix format
w2v_cbow_train_scaled = scaler.fit_transform(w2v_cbow_train)
w2v_cbow_test_scaled = scaler.transform(w2v_cbow_test)

# Logistic Regression model
lr_w2v_cbow = LogisticRegression(max_iter=500)
lr_w2v_cbow.fit(w2v_cbow_train_scaled, y_train)

# Predictions
y_pred = lr_w2v_cbow.predict(w2v_cbow_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using W2V_CBOW and Logistic Regression: {accuracy}")

results['lr_w2b_cbow'] = accuracy

Accuracy using W2V_CBOW and Logistic Regression: 0.7860452800211837


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
scaler = StandardScaler(with_mean=False)  # with_mean=False because BoW has sparse matrix format
glove_6b_train_scaled = scaler.fit_transform(glove_6b_train)
glove_6b_test_scaled = scaler.transform(glove_6b_test)

# Logistic Regression model
lr_glove_6b = LogisticRegression(max_iter=500)
lr_glove_6b.fit(glove_6b_train_scaled, y_train)

# Predictions
y_pred = lr_glove_6b.predict(glove_6b_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using W2V_CBOW and Logistic Regression: {accuracy}")

results['lr_glove_6b'] = accuracy

Accuracy using W2V_CBOW and Logistic Regression: 0.7518425349750651


In [23]:
scaler = StandardScaler(with_mean=False)  # with_mean=False because BoW has sparse matrix format
glove_twitter_train_scaled = scaler.fit_transform(glove_twitter_train)
glove_twitter_test_scaled = scaler.transform(glove_twitter_test)

# Logistic Regression model
lr_glove_twitter = LogisticRegression(max_iter=500)
lr_glove_twitter.fit(glove_twitter_train_scaled, y_train)

# Predictions
y_pred = lr_glove_twitter.predict(glove_twitter_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using W2V_CBOW and Logistic Regression: {accuracy}")

results['lr_glove_twitter'] = accuracy

Accuracy using W2V_CBOW and Logistic Regression: 0.7619047619047619


In [24]:
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])

# Print the results table
print(results_df)

              Model  Accuracy
0            lr_bow  0.735249
1          lr_tfidf  0.733660
2        lr_w2v_pre  0.775807
3       lr_w2b_cbow  0.786045
4       lr_glove_6b  0.751843
5  lr_glove_twitter  0.761905


### SVC

In [41]:
from sklearn.svm import LinearSVC

svc_bow = LinearSVC(C=1, max_iter=5000)
svc_bow.fit(bow_train, y_train)

# Predictions
y_pred = svc_bow.predict(bow_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using BoW and SVC: {accuracy}")

results = {}
results['svc_bow'] = accuracy



Accuracy using BoW and SVC: 0.7736881592303279




In [42]:
svc_tfidf = LinearSVC(C=1, max_iter=5000)
svc_tfidf.fit(tfidf_train, y_train)

# Predictions
y_pred = svc_tfidf.predict(tfidf_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using tfidf and SVC: {accuracy}")

results['svc_tfidf'] = accuracy



Accuracy using tfidf and SVC: 0.7867955337834855


In [43]:
svc_w2v_pre = LinearSVC(C=1, max_iter=5000)
svc_w2v_pre.fit(w2v_pre_train, y_train)

# Predictions
y_pred = svc_w2v_pre.predict(w2v_pre_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using w2v_pre and SVC: {accuracy}")

results['svc_w2v_pre'] = accuracy



Accuracy using w2v_pre and SVC: 0.7766891742795358


In [44]:
svc_w2v_cbow = LinearSVC(C=1, max_iter=5000)
svc_w2v_cbow.fit(w2v_cbow_train, y_train)

# Predictions
y_pred = svc_w2v_cbow.predict(w2v_cbow_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using w2v_cbow and SVC: {accuracy}")

results['svc_w2v_cbow'] = accuracy



Accuracy using w2v_cbow and SVC: 0.7862218103181958




In [45]:
svc_glove_6b = LinearSVC(C=1, max_iter=5000)
svc_glove_6b.fit(glove_6b_train, y_train)

# Predictions
y_pred = svc_glove_6b.predict(glove_6b_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using glove_6b and SVC: {accuracy}")

results['svc_glove_6b'] = accuracy



Accuracy using glove_6b and SVC: 0.7530341144798977


In [46]:
svc_glove_twitter = LinearSVC(C=1, max_iter=5000)
svc_glove_twitter.fit(glove_twitter_train, y_train)

# Predictions
y_pred = svc_glove_twitter.predict(glove_twitter_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using glove_twitter and SVC: {accuracy}")

results['svc_glove_twitter'] = accuracy



Accuracy using glove_twitter and SVC: 0.7615958338849905


In [47]:
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])

# Print the results table
print(results_df)

               Model  Accuracy
0            svc_bow  0.773688
1          svc_tfidf  0.786796
2        svc_w2v_pre  0.776689
3       svc_w2v_cbow  0.786222
4       svc_glove_6b  0.753034
5  svc_glove_twitter  0.761596


### Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

rf_bow = RandomForestClassifier()

rf_bow.fit(bow_train, y_train)

# Predictions
y_pred = rf_bow.predict(bow_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using BoW and RF: {accuracy}")

results = {}
results['rf_bow'] = accuracy

Accuracy using BoW and RF: 0.7834414581402533


In [27]:
# Logistic Regression model
rf_tfidf = RandomForestClassifier()
rf_tfidf.fit(tfidf_train, y_train)

# Predictions
y_pred = rf_tfidf.predict(tfidf_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using TFIDF and RF: {accuracy}")

results['rf_tfidf'] = accuracy

Accuracy using TFIDF and RF: 0.7811465642790943


In [28]:
rf_w2v_pre = RandomForestClassifier()
rf_w2v_pre.fit(w2v_pre_train, y_train)

# Predictions
y_pred = rf_w2v_pre.predict(w2v_pre_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using W2V_pretrained and RF: {accuracy}")

results['rf_w2v_pre'] = accuracy

Accuracy using W2V_pretrained and RF: 0.7578445650734807


In [29]:
rf_w2v_cbow = RandomForestClassifier()
rf_w2v_cbow.fit(w2v_cbow_train, y_train)

# Predictions
y_pred = rf_w2v_cbow.predict(w2v_cbow_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using W2V_cbow and RF: {accuracy}")

results['rf_w2v_cbow'] = accuracy

Accuracy using W2V_cbow and RF: 0.7751445341806787


In [30]:
rf_glove_6b = RandomForestClassifier()
rf_glove_6b.fit(glove_6b_train, y_train)

# Predictions
y_pred = rf_glove_6b.predict(glove_6b_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using glove_6b and RF: {accuracy}")

results['rf_glove_6b'] = accuracy

Accuracy using glove_6b and RF: 0.7417803080453683


In [31]:
rf_glove_twitter = RandomForestClassifier()
rf_glove_twitter.fit(glove_twitter_train, y_train)

# Predictions
y_pred = rf_glove_twitter.predict(glove_twitter_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using glove_twitter and RF: {accuracy}")

results['rf_glove_twitter'] = accuracy

Accuracy using glove_twitter and RF: 0.7510481486385101


In [32]:
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])

# Print the results table
print(results_df)

              Model  Accuracy
0            rf_bow  0.783441
1          rf_tfidf  0.781147
2        rf_w2v_pre  0.757845
3       rf_w2v_cbow  0.775145
4       rf_glove_6b  0.741780
5  rf_glove_twitter  0.751048
