In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
import tf_idf
import ELMo
import bag_of_words
import BERT_updated
import fasttext
import wget
import glove
import Word2Vec

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc



In [4]:
main_data = pd.read_csv('/Users/andrewsimon/Desktop/Dow_dat.csv')

In [5]:
def get_tfidf_embeddings(data, text, labels):
    return tf_idf.generate_tfidf_embeddings(data, text, labels).drop(columns=labels).values

In [6]:
def get_BoW_embeddings(data, text, labels):
    return bag_of_words.generate_bow_embeddings(data, text, labels).drop(columns=labels).values

In [7]:
def get_bert_embeddings(data, text, labels):
    return BERT_updated.generate_bert_embeddings(data, text, labels)

In [8]:
def get_Word2Vec_embeddings(data, text):
    return Word2Vec.get_embeddings(data, text)

In [9]:
def get_elmo_embeddings(data, text):
    return ELMo.get_embeddings(data, text)

In [10]:
def get_fasttext_embeddings(data, text, labels):
    return fasttext.fasttext_embedding(data, text, labels)

In [11]:
def get_glove_embeddings(data, text, labels):
    return glove.glove_embedding(data, text, labels)

In [12]:
def train_test_split_downstream(features, labels, test_size, random_state):
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels, test_size=test_size, random_state=random_state)
    
    return X_train, X_test, y_train, y_test

In [13]:
def random_forest_model(X_train, X_test, y_train, y_test, n_estimators=1100, scoring_metric='accuracy'):
    
    rf = RandomForestClassifier(n_estimators=n_estimators).fit(X_train, y_train)

    predictions = rf.predict(X_test)

    if scoring_metric == 'accuracy':
        score = rf.score(X_test, y_test)
        return score
    elif scoring_metric == 'precision':
        precision = precision_score(y_test, predictions)
        return precision
    elif scoring_metric == 'recall':
        recall = recall_score(y_test, predictions)
        return recall
    elif scoring_metric == 'auc':
        fpr, tpr, thresholds = roc_curve(y_test, predictions)
        auc_score = auc(fpr, tpr)
        return auc_score

        

In [14]:
def decision_tree_model(X_train, X_test, y_train, y_test, scoring_metric='accuracy'):
    
    clf_decision_tree = DecisionTreeClassifier()
    clf_decision_tree.fit(X_train, y_train)

    predictions = clf_decision_tree.predict(X_test)

    if scoring_metric == 'accuracy':
        score = accuracy_score(y_test, predictions)
        return score
    elif scoring_metric == 'precision':
        precision = precision_score(y_test, predictions)
        return precision
    elif scoring_metric == 'recall':
        recall = recall_score(y_test, predictions)
        return recall
    elif scoring_metric == 'auc':
        fpr, tpr, thresholds = roc_curve(y_test, predictions)
        auc_score = auc(fpr, tpr)
        return auc_score

In [15]:
def logistic_regression_model(X_train, X_test, y_train, y_test, scoring_metric='accuracy'):
    
    classifier = LogisticRegression( max_iter = 100000)
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)

    if scoring_metric == 'accuracy':
        score = accuracy_score(y_test, predictions)
        return score
    elif scoring_metric == 'precision':
        precision = precision_score(y_test, predictions)
        return precision
    elif scoring_metric == 'recall':
        recall = recall_score(y_test, predictions)
        return recall
    elif scoring_metric == 'auc':
        fpr, tpr, thresholds = roc_curve(y_test, predictions)
        auc_score = auc(fpr, tpr)
        return auc_score

In [16]:
def svm_model(X_train, X_test, y_train, y_test, scoring_metric='accuracy'):
    
    clf = svm.SVC()
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_test)

    if scoring_metric == 'accuracy':
        score = accuracy_score(y_test, predictions)
        return score
    elif scoring_metric == 'precision':
        precision = precision_score(y_test, predictions)
        return precision
    elif scoring_metric == 'recall':
        recall = recall_score(y_test, predictions)
        return recall
    elif scoring_metric == 'auc':
        fpr, tpr, thresholds = roc_curve(y_test, predictions)
        auc_score = auc(fpr, tpr)
        return auc_score

In [75]:
main_data = pd.read_csv('/Users/andrewsimon/Desktop/Dow_dat.csv')

embeddings = get_bert_embeddings(main_data, 'Report', 'Level')

X_train, X_test, y_train, y_test = train_test_split_downstream(embeddings, main_data['Level'], 0.2, 42)

accuracy = logistic_regression_model(X_train, X_test, y_train, y_test, scoring_metric='accuracy')

accuracy

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.7333333333333333

In [17]:
def find_optimal_method(data, features, labels, test_size=0.2, random_state=42):
    
    bow_embeddings = get_BoW_embeddings(data, features, labels)
    tf_idf_embeddings = get_tfidf_embeddings(data,features,labels)
    bert_embeddings = get_bert_embeddings(data, features, labels)
    word2vec_embeddings = get_Word2Vec_embeddings(data,features)
    elmo_embeddings = get_elmo_embeddings(data,features)
    fasttext_embeddings = get_fasttext_embeddings(data,features, labels)
    glove_embeddings = get_glove_embeddings(data,features,labels)

    X_train_bow, X_test_bow, y_train, y_test = train_test_split_downstream(bow_embeddings, data[labels], test_size=test_size, random_state=random_state)
    X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split_downstream(tf_idf_embeddings, data[labels], test_size=test_size, random_state=random_state)
    X_train_bert, X_test_bert, y_train, y_test = train_test_split_downstream(bert_embeddings, data[labels], test_size=test_size, random_state=random_state)
    X_train_w2v, X_test_w2v, y_train, y_test = train_test_split_downstream(word2vec_embeddings, data[labels], test_size=test_size, random_state=random_state)
    X_train_elmo, X_test_elmo, y_train, y_test = train_test_split_downstream(elmo_embeddings, data[labels], test_size=test_size, random_state=random_state)
    X_train_fasttext, X_test_fasttext, y_train, y_test = train_test_split_downstream(fasttext_embeddings, data[labels], test_size=test_size, random_state=random_state)
    X_train_glove, X_test_glove, y_train, y_test = train_test_split_downstream(glove_embeddings, data[labels], test_size=test_size, random_state=random_state)

    bow_embeddings_values = [
        random_forest_model(X_train_bow,X_test_bow, y_train, y_test),
        decision_tree_model(X_train_bow,X_test_bow, y_train, y_test),
        logistic_regression_model(X_train_bow,X_test_bow, y_train, y_test),
        svm_model(X_train_bow,X_test_bow, y_train, y_test)
    ]

    tf_idf_embeddings_values = [
        random_forest_model(X_train_tfidf,X_test_tfidf, y_train, y_test),
        decision_tree_model(X_train_tfidf,X_test_tfidf, y_train, y_test),
        logistic_regression_model(X_train_tfidf,X_test_tfidf, y_train, y_test),
        svm_model(X_train_tfidf,X_test_tfidf, y_train, y_test)
    ]

    bert_embeddings_values = [
        random_forest_model(X_train_bert,X_test_bert, y_train, y_test),
        decision_tree_model(X_train_bert,X_test_bert, y_train, y_test),
        logistic_regression_model(X_train_bert,X_test_bert, y_train, y_test),
        svm_model(X_train_bert,X_test_bert, y_train, y_test)
    ]

    w2v_embeddings_values = [
        random_forest_model(X_train_w2v,X_test_w2v, y_train, y_test),
        decision_tree_model(X_train_w2v,X_test_w2v, y_train, y_test),
        logistic_regression_model(X_train_w2v,X_test_w2v, y_train, y_test),
        svm_model(X_train_w2v,X_test_w2v, y_train, y_test)
    ]

    elmo_embeddings_values = [
        random_forest_model(X_train_elmo ,X_test_elmo, y_train, y_test),
        decision_tree_model(X_train_elmo ,X_test_elmo, y_train, y_test),
        logistic_regression_model(X_train_elmo ,X_test_elmo, y_train, y_test),
        svm_model(X_train_elmo ,X_test_elmo, y_train, y_test)
    ]

    fasttext_embeddings_values = [
        random_forest_model(X_train_fasttext ,X_test_fasttext, y_train, y_test),
        decision_tree_model(X_train_fasttext ,X_test_fasttext, y_train, y_test),
        logistic_regression_model(X_train_fasttext ,X_test_fasttext, y_train, y_test),
        svm_model(X_train_fasttext ,X_test_fasttext, y_train, y_test)
    ]

    glove_embeddings_values = [
        random_forest_model(X_train_glove ,X_test_glove, y_train, y_test),
        decision_tree_model(X_train_glove ,X_test_glove, y_train, y_test),
        logistic_regression_model(X_train_glove ,X_test_glove, y_train, y_test),
        svm_model(X_train_glove ,X_test_glove, y_train, y_test)
    ]

    score_pd = {'Bag of Words': bow_embeddings_values, 'tf idf': tf_idf_embeddings_values, 'BERT': bert_embeddings_values,
                'Word2Vec': w2v_embeddings_values, 'ELMo': elmo_embeddings_values, 'FastText': fasttext_embeddings_values,
                'GLoVE': glove_embeddings_values}
    return pd.DataFrame(data=score_pd)

In [85]:
main_data = main_data = pd.read_csv('/Users/andrewsimon/Desktop/Dow_dat.csv')
pd = find_optimal_method(main_data, 'Report', 'Level')

AttributeError: 'DataFrame' object has no attribute 'read_csv'

In [18]:
main_data = pd.read_csv('/Users/andrewsimon/Desktop/IMDB_Dataset.csv')
main_data['sentiment'] = main_data['sentiment'].map({'positive': 1, 'negative': 0})

main_data = main_data.sample(n=1000)
main_data = main_data.reset_index()

results = find_optimal_method(main_data, 'review', 'sentiment')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
progress-bar: 100%|██████████| 1000/1000 [00:00<00:00, 1403.21it/s]
1000it [00:00, 568487.94it/s]
1000it [00:01, 570.79it/s]
2023-06-07 10:57:39.816127: W 

Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 0 / 1000


In [84]:
pd

Unnamed: 0,Bag of Words,tf idf,BERT,Word2Vec,ELMo,FastText,GLoVE
0,0.8,0.716667,0.766667,0.683333,0.75,0.75,0.75
1,0.766667,0.733333,0.55,0.583333,0.633333,0.466667,0.55
2,0.766667,0.683333,0.733333,0.666667,0.683333,0.45,0.533333
3,0.55,0.616667,0.7,0.533333,0.45,0.45,0.266667


In [19]:
results

Unnamed: 0,Bag of Words,tf idf,BERT,Word2Vec,ELMo,FastText,GLoVE
0,0.81,0.805,0.665,0.75,0.745,0.755,0.71
1,0.67,0.6,0.595,0.56,0.65,0.585,0.56
2,0.825,0.83,0.815,0.805,0.815,0.575,0.71
3,0.725,0.825,0.745,0.78,0.805,0.67,0.645


In [4]:
embeddings_tfidf = tf_idf.generate_tfidf_embeddings(main_data, 'Report', 'Level')

In [5]:
embeddings_array = embeddings_tfidf.drop(columns='Level').values
labels = embeddings_tfidf['Level'].values

In [19]:
X_train, X_test, y_train, y_test = train_test_split(embeddings,
                                                    main_data['Level'], test_size=0.2, random_state=42)

In [20]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1100).fit(X_train, y_train)
predictions = rf.predict(X_test)
score = rf.score(X_test, y_test)
score

0.75

In [8]:
embeddings_BoW = bag_of_words.generate_bow_embeddings(main_data, 'Report', "Level")

In [9]:
embeddings_array = embeddings_BoW.drop(columns='Level').values
labels=embeddings_BoW["Level"].values

In [10]:
X_train, X_test, y_train, y_test = train_test_split(embeddings_array,
                                                    labels, test_size=0.2, random_state=42)

In [11]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1100).fit(X_train, y_train)
predictions = rf.predict(X_test)
score = rf.score(X_test, y_test)
score

0.8

In [13]:
fasttext_embeddings = fasttext.fasttext_embedding(main_data, 'Report', 'Level')

In [12]:
fasttext_embeddings

array([[ 0.00462546, -0.00377681,  0.00835782, ...,  0.08570157,
         0.00314991, -0.00139768],
       [ 0.00573479, -0.01565916,  0.00055071, ...,  0.15088515,
        -0.00016365, -0.01036716],
       [ 0.00575128, -0.00947065,  0.00352919, ...,  0.09792969,
        -0.0005991 , -0.00288053],
       ...,
       [ 0.00498452, -0.02765043, -0.00959891, ...,  0.11969245,
        -0.00538165, -0.00801392],
       [ 0.00151075, -0.01813442,  0.01682064, ...,  0.08681662,
         0.01577524, -0.01038384],
       [-0.00617472, -0.02306897,  0.01692365, ...,  0.11087851,
        -0.00601448,  0.00964224]], dtype=float32)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(fasttext_embeddings,
                                                    main_data['Level'], test_size=0.2, random_state=42)

In [17]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1100).fit(X_train, y_train)
predictions = rf.predict(X_test)
score = rf.score(X_test, y_test)
score

0.75

In [4]:
bert_embeddings = BERT_updated.generate_bert_embeddings(main_data, 'Report', 'Level')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
bert_embeddings

array([[-0.36320272, -0.32057866, -0.50059915, ..., -0.16303292,
        -0.17720942, -0.03341506],
       [ 0.05917213, -0.30518496, -0.52072686, ...,  0.22519325,
         0.5077894 ,  0.2868259 ],
       [-0.24662916, -0.2357285 , -0.49209008, ..., -0.063917  ,
        -0.2540468 ,  0.05076841],
       ...,
       [-0.34315735, -0.39122057, -0.69828576, ...,  0.40737885,
         0.36793244,  0.00545898],
       [-0.41907865, -0.6085989 , -0.62236917, ..., -0.18170431,
        -0.05564305, -0.29402852],
       [-0.342932  , -0.18032494, -0.31473652, ...,  0.3635915 ,
         0.29547995, -0.00772604]], dtype=float32)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(bert_embeddings,
                                                    main_data['Level'], test_size=0.2, random_state=42)

In [8]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1100).fit(X_train, y_train)
predictions = rf.predict(X_test)
score = rf.score(X_test, y_test)
score

0.7333333333333333

In [7]:
glove_embeddings = glove.glove_embedding(main_data, 'Report', 'Level')

Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 0 / 300


In [8]:
X_train, X_test, y_train, y_test = train_test_split(glove_embeddings,
                                                    main_data['Level'], test_size=0.2, random_state=42)

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1100).fit(X_train, y_train)
predictions = rf.predict(X_test)
score = rf.score(X_test, y_test)
score

0.6833333333333333

In [None]:
main_data