In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [70]:
import tf_idf
import ELMo
import bag_of_words
import BERT_updated
import fasttext
import wget
import glove
import Word2Vec

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc



In [71]:
# Read the CSV file into the module-level DataFrame `main_data`.
# NOTE(review): this is an absolute, machine-specific path, so the cell fails
# on any other machine; consider a configurable data directory instead.
main_data = pd.read_csv('/Users/andrewsimon/Desktop/Dow_dat.csv')

In [72]:
def get_tfidf_embeddings(data, text, labels):
    """Return the TF-IDF feature matrix for ``data``.

    Delegates to ``tf_idf.generate_tfidf_embeddings(data, text, labels)``,
    drops the ``labels`` column(s) from its result and returns what is left
    via ``.values``.

    Args:
        data: Input passed straight through to the ``tf_idf`` helper.
        text: Passed through unchanged (presumably the text column name;
            confirm against the ``tf_idf`` module).
        labels: Passed through unchanged, and also used as the column
            label(s) to drop from the helper's result.

    Returns:
        The ``.values`` of the helper's result without the ``labels`` column(s).
    """
    embedded_frame = tf_idf.generate_tfidf_embeddings(data, text, labels)
    features_only = embedded_frame.drop(columns=labels)
    return features_only.values

In [73]:
def get_BoW_embeddings(data, text, labels):
    """Return the bag-of-words feature matrix for ``data``.

    Delegates to ``bag_of_words.generate_bow_embeddings(data, text, labels)``,
    drops the ``labels`` column(s) from its result and returns what is left
    via ``.values``.

    Args:
        data: Input passed straight through to the ``bag_of_words`` helper.
        text: Passed through unchanged (presumably the text column name;
            confirm against the ``bag_of_words`` module).
        labels: Passed through unchanged, and also used as the column
            label(s) to drop from the helper's result.

    Returns:
        The ``.values`` of the helper's result without the ``labels`` column(s).
    """
    embedded_frame = bag_of_words.generate_bow_embeddings(data, text, labels)
    features_only = embedded_frame.drop(columns=labels)
    return features_only.values

In [74]:
def get_bert_embeddings(data, text, labels):
    """Return whatever ``BERT_updated.generate_bert_embeddings`` produces.

    All three arguments are forwarded unchanged and the helper's result is
    returned as-is (no label column is dropped here).
    """
    bert_features = BERT_updated.generate_bert_embeddings(data, text, labels)
    return bert_features

In [75]:
def get_Word2Vec_embeddings(data, text):
    """Return whatever ``Word2Vec.get_embeddings(data, text)`` produces.

    Both arguments are forwarded unchanged; unlike the label-aware wrappers in
    this notebook, no ``labels`` argument is taken or passed.
    """
    w2v_features = Word2Vec.get_embeddings(data, text)
    return w2v_features

In [76]:
def get_elmo_embeddings(data, text):
    """Return whatever ``ELMo.get_embeddings(data, text)`` produces.

    Both arguments are forwarded unchanged; no ``labels`` argument is taken or
    passed.
    """
    elmo_features = ELMo.get_embeddings(data, text)
    return elmo_features

In [77]:
def get_fasttext_embeddings(data, text, labels):
    """Return whatever ``fasttext.fasttext_embedding`` produces.

    All three arguments are forwarded unchanged and the helper's result is
    returned as-is.
    """
    fasttext_features = fasttext.fasttext_embedding(data, text, labels)
    return fasttext_features

In [78]:
def get_glove_embeddings(data, text, labels):
    """Return whatever ``glove.glove_embedding`` produces.

    All three arguments are forwarded unchanged and the helper's result is
    returned as-is.
    """
    glove_features = glove.glove_embedding(data, text, labels)
    return glove_features

In [79]:
def train_test_split_downstream(features, labels, test_size, random_state):
    """Split ``features`` and ``labels`` into train and test parts.

    Thin wrapper around ``train_test_split``; ``test_size`` and
    ``random_state`` are forwarded as keyword arguments.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, **split_options
    )
    return features_train, features_test, labels_train, labels_test

In [80]:
def random_forest_model(X_train, X_test, y_train, y_test, n_estimators=1100,
                        scoring_metric='accuracy', random_state=None):
    """Fit a random forest on the training split and score it on the test split.

    Args:
        X_train, y_train: Training features and labels, passed to ``fit``.
        X_test, y_test: Held-out features and labels used for scoring.
        n_estimators: Number of trees, forwarded to ``RandomForestClassifier``.
        scoring_metric: One of ``'accuracy'``, ``'precision'``, ``'recall'``
            or ``'auc'``.
        random_state: Forwarded to ``RandomForestClassifier``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int for
            reproducible scores.

    Returns:
        The requested score as returned by the corresponding sklearn metric.

    Raises:
        ValueError: If ``scoring_metric`` is not one of the supported names.
            Previously an unknown name silently returned ``None`` after the
            forest had already been trained.
    """
    supported_metrics = ('accuracy', 'precision', 'recall', 'auc')
    # Validate before fitting so a typo does not cost a full training run.
    if scoring_metric not in supported_metrics:
        raise ValueError(
            f"Unsupported scoring_metric {scoring_metric!r}; "
            f"expected one of {supported_metrics}"
        )

    rf = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    ).fit(X_train, y_train)

    if scoring_metric == 'auc':
        # ROC needs continuous scores; hard 0/1 predictions collapse the curve
        # to a single operating point. Column 1 is the probability of the
        # second class (assumes a binary problem, as roc_curve itself does).
        positive_scores = rf.predict_proba(X_test)[:, 1]
        fpr, tpr, _thresholds = roc_curve(y_test, positive_scores)
        return auc(fpr, tpr)

    predictions = rf.predict(X_test)
    if scoring_metric == 'accuracy':
        # Same value as rf.score(X_test, y_test), computed from the
        # predictions already in hand.
        return accuracy_score(y_test, predictions)
    if scoring_metric == 'precision':
        return precision_score(y_test, predictions)
    return recall_score(y_test, predictions)

        

In [81]:
def decision_tree_model(X_train, X_test, y_train, y_test, scoring_metric='accuracy',
                        random_state=None):
    """Fit a decision tree on the training split and score it on the test split.

    Args:
        X_train, y_train: Training features and labels, passed to ``fit``.
        X_test, y_test: Held-out features and labels used for scoring.
        scoring_metric: One of ``'accuracy'``, ``'precision'``, ``'recall'``
            or ``'auc'``.
        random_state: Forwarded to ``DecisionTreeClassifier``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int for
            reproducible scores.

    Returns:
        The requested score as returned by the corresponding sklearn metric.

    Raises:
        ValueError: If ``scoring_metric`` is not one of the supported names.
            Previously an unknown name silently returned ``None``.
    """
    supported_metrics = ('accuracy', 'precision', 'recall', 'auc')
    # Validate before fitting so a typo fails fast instead of returning None.
    if scoring_metric not in supported_metrics:
        raise ValueError(
            f"Unsupported scoring_metric {scoring_metric!r}; "
            f"expected one of {supported_metrics}"
        )

    clf_decision_tree = DecisionTreeClassifier(random_state=random_state)
    clf_decision_tree.fit(X_train, y_train)

    if scoring_metric == 'auc':
        # ROC needs continuous scores; hard predictions collapse the curve to
        # a single operating point. Column 1 is the probability of the second
        # class (assumes a binary problem, as roc_curve itself does).
        positive_scores = clf_decision_tree.predict_proba(X_test)[:, 1]
        fpr, tpr, _thresholds = roc_curve(y_test, positive_scores)
        return auc(fpr, tpr)

    predictions = clf_decision_tree.predict(X_test)
    if scoring_metric == 'accuracy':
        return accuracy_score(y_test, predictions)
    if scoring_metric == 'precision':
        return precision_score(y_test, predictions)
    return recall_score(y_test, predictions)

In [82]:
def logistic_regression_model(X_train, X_test, y_train, y_test, scoring_metric='accuracy'):
    """Fit logistic regression on the training split and score it on the test split.

    Args:
        X_train, y_train: Training features and labels, passed to ``fit``.
        X_test, y_test: Held-out features and labels used for scoring.
        scoring_metric: One of ``'accuracy'``, ``'precision'``, ``'recall'``
            or ``'auc'``.

    Returns:
        The requested score as returned by the corresponding sklearn metric.

    Raises:
        ValueError: If ``scoring_metric`` is not one of the supported names.
            Previously an unknown name silently returned ``None``.
    """
    supported_metrics = ('accuracy', 'precision', 'recall', 'auc')
    # Validate before fitting so a typo fails fast instead of returning None.
    if scoring_metric not in supported_metrics:
        raise ValueError(
            f"Unsupported scoring_metric {scoring_metric!r}; "
            f"expected one of {supported_metrics}"
        )

    # Large iteration cap carried over from the original implementation.
    classifier = LogisticRegression(max_iter=100000)
    classifier.fit(X_train, y_train)

    if scoring_metric == 'auc':
        # ROC needs continuous scores; hard predictions collapse the curve to
        # a single operating point. Column 1 is the probability of the second
        # class (assumes a binary problem, as roc_curve itself does).
        positive_scores = classifier.predict_proba(X_test)[:, 1]
        fpr, tpr, _thresholds = roc_curve(y_test, positive_scores)
        return auc(fpr, tpr)

    predictions = classifier.predict(X_test)
    if scoring_metric == 'accuracy':
        return accuracy_score(y_test, predictions)
    if scoring_metric == 'precision':
        return precision_score(y_test, predictions)
    return recall_score(y_test, predictions)

In [83]:
def svm_model(X_train, X_test, y_train, y_test, scoring_metric='accuracy'):
    """Fit a default ``svm.SVC`` on the training split and score it on the test split.

    Args:
        X_train, y_train: Training features and labels, passed to ``fit``.
        X_test, y_test: Held-out features and labels used for scoring.
        scoring_metric: One of ``'accuracy'``, ``'precision'``, ``'recall'``
            or ``'auc'``.

    Returns:
        The requested score as returned by the corresponding sklearn metric.

    Raises:
        ValueError: If ``scoring_metric`` is not one of the supported names.
            Previously an unknown name silently returned ``None``.
    """
    supported_metrics = ('accuracy', 'precision', 'recall', 'auc')
    # Validate before fitting so a typo fails fast instead of returning None.
    if scoring_metric not in supported_metrics:
        raise ValueError(
            f"Unsupported scoring_metric {scoring_metric!r}; "
            f"expected one of {supported_metrics}"
        )

    clf = svm.SVC()
    clf.fit(X_train, y_train)

    if scoring_metric == 'auc':
        # ROC needs continuous scores; hard predictions collapse the curve to
        # a single operating point. SVC is built without probability
        # estimates here, so use the signed decision-function margin instead
        # (assumes a binary problem, as roc_curve itself does).
        margin_scores = clf.decision_function(X_test)
        fpr, tpr, _thresholds = roc_curve(y_test, margin_scores)
        return auc(fpr, tpr)

    predictions = clf.predict(X_test)
    if scoring_metric == 'accuracy':
        return accuracy_score(y_test, predictions)
    if scoring_metric == 'precision':
        return precision_score(y_test, predictions)
    return recall_score(y_test, predictions)

In [84]:
def find_optimal_method(data, features, labels, test_size=0.2, random_state=42,
                        scoring_metric='accuracy'):
    """Score every embedding/classifier combination and tabulate the results.

    Seven embeddings (Bag of Words, tf-idf, BERT, Word2Vec, ELMo, FastText,
    GloVe) are generated from ``data``. Each is split into train/test parts
    and evaluated with four classifiers (random forest, decision tree,
    logistic regression, SVM).

    Args:
        data: Dataset handed to every embedding helper; ``data[labels]`` is
            used as the target for the train/test split.
        features: Forwarded to the embedding helpers as their ``text``
            argument.
        labels: Key of the target in ``data``; also forwarded to the
            embedding helpers that accept it (the Word2Vec and ELMo helpers
            do not).
        test_size: Forwarded to ``train_test_split_downstream``.
        random_state: Forwarded to ``train_test_split_downstream``. Every
            embedding is split with the same value.
        scoring_metric: Forwarded to each classifier helper. The default
            ``'accuracy'`` reproduces the previous behaviour.

    Returns:
        A ``pandas.DataFrame`` with a ``'Classifier'`` column and one score
        column per embedding; one row per classifier.
    """
    # All embeddings are generated up front and in a fixed order, before any
    # model is trained. Dict insertion order also fixes the result's column order.
    embeddings_by_column = {
        'Bag of Words': get_BoW_embeddings(data, features, labels),
        'tf idf': get_tfidf_embeddings(data, features, labels),
        'BERT': get_bert_embeddings(data, features, labels),
        'Word2Vec': get_Word2Vec_embeddings(data, features),
        'ELMo': get_elmo_embeddings(data, features),
        'FastText': get_fasttext_embeddings(data, features, labels),
        'GLoVE': get_glove_embeddings(data, features, labels),
    }

    # Row order of the result table.
    # NOTE(review): ' Decision Tree' carries a leading space in the original
    # output; it is kept byte-for-byte in case downstream code matches on it.
    classifiers = [
        ('Random Forest', random_forest_model),
        (' Decision Tree', decision_tree_model),
        ('Logistic Regression', logistic_regression_model),
        ('SVM', svm_model),
    ]

    score_pd = {"Classifier": [display_name for display_name, _ in classifiers]}

    for column_name, embedding in embeddings_by_column.items():
        X_train, X_test, y_train, y_test = train_test_split_downstream(
            embedding, data[labels], test_size=test_size, random_state=random_state
        )
        score_pd[column_name] = [
            evaluate(X_train, X_test, y_train, y_test, scoring_metric=scoring_metric)
            for _, evaluate in classifiers
        ]

    return pd.DataFrame(data=score_pd)