0223214 - Efren Flores Porras

In [1]:
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
def main():
    """Run the whole sentiment experiment end to end.

    Steps, in order: silence warnings, load the raw CSV, derive the binary
    'Positively Rated' label, split into train/test (25% test), build the
    TF-IDF matrices and finally train/evaluate the classifiers. Intermediate
    frames are printed along the way; nothing is returned.
    """
    # Applies to the whole process, not just this function.
    warnings.filterwarnings("ignore")

    raw_frame = leeDatos()
    print(raw_frame)
    print("\n")

    labelled_frame = encodeData(raw_frame)
    print(labelled_frame)
    print("\n")
    # Mean of a 0/1 column, i.e. the share of rows labelled 1.
    print(labelled_frame['Positively Rated'].mean())
    print("\n")

    train_part, test_part = splitDataSet(labelled_frame, test_size=.25)
    print(train_part)
    print('trainSet shape: ', train_part.shape)
    print(test_part)
    print('testSet shape: ', test_part.shape)
    print("\n")

    train_matrix, test_matrix = processData(train_part, test_part)

    metodosML(train_matrix, train_part, test_matrix, test_part)

def leeDatos(path="archives/Sentiment_Stock_data.csv"):
    """Load the sentiment data set from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the path this notebook has
        always used, so calling ``leeDatos()`` with no arguments behaves as
        before.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first row of the file used as the
        column header.
    """
    return pd.read_csv(path, header=0)

def encodeData(dataSet=0, max_rows=10000):
    """Build the modelling frame: sentence text plus a binary label.

    Rows containing a missing value in any column are discarded, a
    'Positively Rated' column is derived from 'Sentiment' (1 when
    ``Sentiment == 1``, otherwise 0) and only the first ``max_rows`` remaining
    rows are kept.

    The input frame is left untouched: the cleaning and the new column are
    applied to copies, so re-running the calling cell gives the same result
    and the caller's frame does not silently lose rows or gain a column.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Must contain 'Sentiment' and 'Sentence' columns. The ``0`` default is
        only a placeholder; a DataFrame has to be supplied.
    max_rows : int or None, optional
        Maximum number of rows to keep after cleaning (default 10000, the
        value previously hard-coded). ``None`` keeps every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Sentence', 'Positively Rated']``. The original row labels
        are preserved (the index is not reset).
    """
    # dropna() without inplace returns a new frame, leaving the caller's data alone.
    cleaned = dataSet.dropna()

    encoded = cleaned[['Sentence']].copy()
    # Encode 1s as rated positively, everything else as rated poorly (0).
    encoded['Positively Rated'] = np.where(cleaned['Sentiment'] == 1, 1, 0)

    return encoded.iloc[:max_rows]

def splitDataSet(dataSet=0, test_size=.2):
    """Partition ``dataSet`` into a training part and a test part.

    ``test_size`` is forwarded to ``train_test_split``; the random state is
    fixed at 0 so the same rows land in each part on every run.

    Returns a two-element list: ``[train, test]``.
    """
    parts = train_test_split(dataSet, random_state=0, test_size=test_size)
    return list(parts)

def processData(trainSet=0, testSet=0):
    """Convert the 'Sentence' column of both splits into TF-IDF features.

    The vectorizer is fitted on the training sentences only and then applied
    unchanged to the test sentences.

    Returns a two-element list: ``[train_vectors, test_vectors]``.
    """
    tfidf_options = {
        'stop_words': 'english',
        'min_df': 5,
        'max_df': 0.8,
        'sublinear_tf': True,
        'use_idf': True,
    }
    tfidf = TfidfVectorizer(**tfidf_options)

    # Fit on the training text, then reuse the fitted vectorizer for the test text.
    fitted_train = tfidf.fit_transform(trainSet['Sentence'])
    projected_test = tfidf.transform(testSet['Sentence'])
    return [fitted_train, projected_test]

In [5]:
def _fitAndReport(name, estimator, param_grid, train_vectors, train_labels,
                  test_vectors, test_labels):
    """Grid-search one classifier, print its test-set metrics, return the report.

    Runs a 5-fold ``GridSearchCV`` over ``param_grid``, predicts the test set
    with the best estimator found, prints the per-class metrics for labels
    '1' and '0' plus the accuracy, and returns the ``classification_report``
    dictionary.
    """
    print('Classification with ' + name)
    grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5, verbose=1)
    grid_search.fit(train_vectors, train_labels)
    prediction = grid_search.best_estimator_.predict(test_vectors)

    # results report
    report = classification_report(test_labels, prediction, output_dict=True)
    dfpos = pd.DataFrame.from_dict(report['1'], columns=['positive'], orient='index')
    dfneg = pd.DataFrame.from_dict(report['0'], columns=['negative'], orient='index')

    print(dfpos)
    print("\n")
    print(dfneg)
    print("\n")
    print("accuracy: ", round(report['accuracy'], 2))
    print("\n")

    return report


def metodosML(train_vectors=0, trainSet=0, test_vectors=0, testSet=0):
    """Train, tune and compare four classifiers on the vectorized sentences.

    For SVM, Decision Tree, Logistic Regression and Random Forest (in that
    order) a grid search is fitted on the training data, the best estimator
    is evaluated on the test data, and its precision / recall / f1-score for
    the positive ('1') and negative ('0') classes are collected. Two summary
    tables (one per class, one row per method) are printed at the end.

    Parameters
    ----------
    train_vectors, test_vectors
        Feature matrices for the training and test rows.
    trainSet, testSet : pandas.DataFrame
        Frames providing the 'Positively Rated' target column for the
        corresponding feature matrix.

    Returns
    -------
    None
        Results are reported through printed output only.
    """
    train_labels = trainSet['Positively Rated']
    test_labels = testSet['Positively Rated']

    # (display name, estimator, hyper-parameter grid) for each method.
    # random_state is pinned on the estimators that accept it so repeated
    # runs of the notebook produce the same models.
    experiments = [
        ('SVM',
         svm.SVC(),
         {
             'kernel': ["poly"],
             'degree': [1, 2, 3, 4],
             'coef0': [1, 2]
         }),
        ('DT',
         DecisionTreeClassifier(random_state=0),
         {
             'criterion': ["gini", 'entropy'],
             'max_depth': [5, 10, 20, 30, None]
         }),
        # NOTE(review): this grid contains solver/penalty pairs that the
        # scikit-learn documentation lists as unsupported (e.g. 'lbfgs' with
        # 'l1' or 'elasticnet', 'elasticnet' without an l1_ratio), and the
        # string 'none' is not accepted by newer releases. Those candidates
        # presumably fail to fit and are scored via GridSearchCV's
        # error_score; confirm against the installed version and prune.
        ('Logistic Regression',
         LogisticRegression(random_state=0),
         {
             'penalty': ['l1', 'l2', 'elasticnet', 'none'],
             'C': [0.1, 1, 10, 100],
             'solver': ['lbfgs', 'saga']
         }),
        ('Random Forest',
         RandomForestClassifier(random_state=0),
         {
             'n_estimators': [10, 50, 100, 200],
             'criterion': ['gini', 'entropy'],
             'max_depth': [5, 10, 20, 30, None],
             'max_leaf_nodes': [10, 20, 30, None]
         }),
    ]

    methodsUsed = [name for name, _, _ in experiments]
    performanceHeaders = ['precision', 'recall', 'f1-score']
    modPerformancePos = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)
    modPerformanceNeg = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)

    for name, estimator, param_grid in experiments:
        report = _fitAndReport(name, estimator, param_grid,
                               train_vectors, train_labels,
                               test_vectors, test_labels)

        # Store by label rather than by position so each value lands in the
        # row/column it is named after.
        for header in performanceHeaders:
            modPerformancePos.loc[name, header] = report['1'][header]
            modPerformanceNeg.loc[name, header] = report['0'][header]

    # Comparing performance of the models
    print(modPerformancePos)
    print("\n")
    print(modPerformanceNeg)
    print("\n")

In [6]:
# Run the full pipeline: load the CSV, encode labels, split, vectorize, then train and evaluate the classifiers.
main()

            ID  Sentiment                                           Sentence
0            0          0  According to Gran , the company has no plans t...
1            1          1  For the last quarter of 2010 , Componenta 's n...
2            2          1  In the third quarter of 2010 , net sales incre...
3            3          1  Operating profit rose to EUR 13.1 mn from EUR ...
4            4          1  Operating profit totalled EUR 21.1 mn , up fro...
...        ...        ...                                                ...
108746  111290          1  Philippines president Rodrigo Duterte urges pe...
108747  111291          1  Spain arrests three Pakistanis accused of prom...
108748  111292          1  Venezuela, where anger over food shortages is ...
108749  111293          1  A Hindu temple worker has been killed by three...
108750  111294          1  Ozone layer hole seems to be healing - US &amp...

[108751 rows x 3 columns]


                                               