In [1]:
import os 
import re
import time
import string

import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
from pylab import rcParams

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import lightgbm as lgb

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import nltk
from nltk.corpus import stopwords
from nltk import PorterStemmer
#for first time using nltk
# nltk.download ()

In [2]:

splits=10
stop_words = stopwords.words('english') 


In [3]:
def clean_text(txt):
    txt = txt.lower()
    txt = re.sub(r"(http\S+|http)", "", txt) # remove links 
    txt = re.sub('[^a-zA-Z ]+', '', txt) #only allows for letters
    txt = ' '.join([PorterStemmer().stem(word=word) for word in txt.split(" ") if word not in stop_words ]) # stem & remove stop words
    return txt

def print_model_performance(target,predicted):
    print('outcome of training')
    print(classification_report( target,predicted))   #uncomment if you want to see full report 
    print('test average accuracy ',accuracy_score( target,predicted))
    print(confusion_matrix( target,predicted))


In [4]:
def sentiment_analyzer_scores(text, threshold=0.05, engl=True):
    analyser = SentimentIntensityAnalyzer()

    if engl:
        trans = text
    else:
        trans = translator.translate(text).text

    score = analyser.polarity_scores(trans)
    lb = score['compound']
    return lb
    if lb >= threshold:
        return 1
    elif (lb > -threshold) and (lb < threshold):
        return 0
    else:
        return -1
def train_test_split_features(train, test,train_feature, target_feature,vectorise):
    y_train = train[target_feature]   
    X_train = train[train_feature]
    y_test = test[target_feature]   
    X_test = test[train_feature]
    feature_names=[]
    if(vectorise):
        vect = TfidfVectorizer(min_df=5, ngram_range=(1, 4)) # create Count vectorizer.
        X_train = vect.fit(X_train).transform(X_train) # transform text_train  into a vector 
        X_test = vect.transform(X_test) 
        feature_names = vect.get_feature_names() # to return all words used in vectorizer
  
    return X_train, X_test, y_train, y_test, feature_names

#get this working a bit better later


In [5]:
def pull_data(dataset, test_run=False):
    data_df=[]
    if (dataset == 'redit_data'):
        dataset_location="datasets/Twitter and Reddit Sentimental analysis Dataset/Twitter_Data.csv"
        text_variable='clean_text'
        target_feature='category'
        data_df = pd.read_csv(dataset_location) 

    if (dataset == 'financial'):
        text_variable='clean_text'
        target_feature='category'
        dataset_location="datasets/Sentiment Analysis for Financial News/all-data.csv"
        data_df = pd.read_csv(dataset_location,encoding='ISO-8859-1',header='infer',)
        data_df.columns = [ target_feature,text_variable]
    if (dataset == 'us_airline'):
        target_feature='airline_sentiment'
        text_variable='text'
        dataset_location='datasets/Twitter US Airline Sentiment/Tweets.csv'
        data_df = pd.read_csv(dataset_location) 
        
    data_df=data_df[[text_variable,target_feature]]
    data_df.columns = [ 'clean_text','target']
    data_df[text_variable] = data_df['clean_text'].astype(str)
    data_df[text_variable] = data_df['clean_text'].apply(clean_text) 
#     df.cc.astype('category').cat.codes
#     data_df['target']=data_df['target'].cat.codes
    data_df['target'] = pd.factorize(data_df['target'])[0] 
    if test_run:
        data_df,_,_ = quick_run(data_df)
    data_df = data_df.dropna()
    data_df = data_df.reset_index(drop=True)
    return data_df

# a list of all models used
def all_models():
    #Using the recomended classifiers
    #https://arxiv.org/abs/1708.05070
    GBC = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)
    RFC = RandomForestClassifier(n_estimators=500, max_features=0.25, criterion="entropy")
    SVM = SVC(C = 0.01, gamma=0.1, kernel="poly", degree=3, coef0=10.0)
    ETC = ExtraTreesClassifier(n_estimators=1000, max_features="log2", criterion="entropy")
    LR = LogisticRegression(C=1.5,fit_intercept=True)
    # Models that were not included in the paper not from SKlearn
    XGC = XGBClassifier()
    CBC = CatBoostClassifier(silent=True)
    light_gb = lgb.LGBMClassifier()
    models=[(LR, "linear_regression"),(ETC, "Extra_tree_classifier"),(SVM, "support_vector_classifier"), (RFC, "random_forest_classifier"), (GBC, "gradient_boosted_classifier"),
             (XGC, "XGBoost"),(light_gb,"Light_GBM"), (CBC, "catboost_classifier")]
    #this subset was selected due to runtime
    models=[(LR, "linear_regression"), (GBC, "gradient_boosted_classifier"),
             (XGC, "XGBoost"),(light_gb,"Light_GBM")]
    return models

In [6]:
# Run the relavent machine learning model
def run_features(df, model, splits,features='clean_text',vectorise=True, predict_probability=False):
    cv = KFold(n_splits=splits, random_state=42, shuffle=False)
    full_prediciton=[]
    for train_index, test_index in cv.split(df):
        train, test = df.loc[train_index], df.loc[test_index]
        X_train, X_test, y_train, y_test, feature_names=train_test_split_features(train,test,features,'target', vectorise)
        model.fit(X_train, y_train)
        if (predict_probability==True):
            prediction = model.predict_proba(X_test)
        else:
            prediction = model.predict(X_test)
        full_prediciton.append(prediction)
    predictions=[]
    for set_of_prediction in full_prediciton:
        for predicted in set_of_prediction:
            predictions.append(predicted)
    return predictions

In [7]:
# a quick way to run though a dataset to confirm that everything works
def quick_run(df):
    train, test = train_test_split(df, test_size=0.99)
    train, validation = train_test_split(train, test_size=0.125)
    return train, validation, test

In [8]:
#current_datasets=['redit_data','financial','us_airline']
data_df=pull_data('us_airline',test_run=False)
predict_probability=True

In [9]:
model_predicted_names=[]
models=all_models()
df_copy=data_df.copy()
for model, name in models:
    print(name)
    
    start_time = time.time()
    
    predictions=run_features(df_copy,model,splits,predict_probability=predict_probability)
    print("--- %s seconds ---" % (time.time() - start_time))
    predicted_name=name+'_prediction'
    negative_prob=[]
    neutral_prob=[]
    positive_prob=[]
    if (predict_probability):
        for neg, neut,pos in predictions:
            negative_prob.append(neg)
            neutral_prob.append(neut)
            positive_prob.append(pos)
        neg_predictions=predicted_name+'_neg'
        neut_predictions=predicted_name+'_neut'
        pos_predictions=predicted_name+'_pos'
        df_copy[neg_predictions]=negative_prob
        df_copy[neut_predictions]=neutral_prob
        df_copy[pos_predictions]=positive_prob

        model_predicted_names.append(neg_predictions)
        model_predicted_names.append(neut_predictions)
        model_predicted_names.append(pos_predictions)
        negative_prob.clear()
        neutral_prob.clear()
        positive_prob.clear()
    else:
        df_copy[predicted_name]=predictions
        model_predicted_names.append(predicted_name)

        print_model_performance(df_copy['target'],predictions)
        predictions.clear()


linear_regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

--- 33.62941312789917 seconds ---
gradient_boosted_classifier
--- 97.80858898162842 seconds ---
XGBoost




--- 47.79335379600525 seconds ---
Light_GBM




--- 58.408225536346436 seconds ---


In [10]:
df_copy['polarity']=df_copy['clean_text'].apply(lambda text: TextBlob(text).sentiment.polarity)
df_copy['subjectivity']=df_copy['clean_text'].apply(lambda text: TextBlob(text).sentiment.subjectivity)
df_copy['vader_sentiment']=df_copy['clean_text'].apply(lambda tweet: sentiment_analyzer_scores(tweet))
model_predicted_names.append('polarity')
model_predicted_names.append('subjectivity')
model_predicted_names.append('vader_sentiment')

In [11]:
model = RandomForestClassifier()
predictions=run_features(df_copy,model,splits, features=model_predicted_names,vectorise=False)
print_model_performance(df_copy['target'],predictions)



outcome of training
              precision    recall  f1-score   support

           0       0.69      0.59      0.63      3099
           1       0.76      0.70      0.73      2363
           2       0.85      0.91      0.88      9178

    accuracy                           0.81     14640
   macro avg       0.77      0.73      0.75     14640
weighted avg       0.80      0.81      0.81     14640

test average accuracy  0.8106557377049181
[[1814  267 1018]
 [ 292 1663  408]
 [ 535  252 8391]]


In [12]:
df_copy

Unnamed: 0,clean_text,target,text,linear_regression_prediction_neg,linear_regression_prediction_neut,linear_regression_prediction_pos,gradient_boosted_classifier_prediction_neg,gradient_boosted_classifier_prediction_neut,gradient_boosted_classifier_prediction_pos,XGBoost_prediction_neg,XGBoost_prediction_neut,XGBoost_prediction_pos,Light_GBM_prediction_neg,Light_GBM_prediction_neut,Light_GBM_prediction_pos,polarity,subjectivity,vader_sentiment
0,@VirginAmerica What @dhepburn said.,0,virginamerica dhepburn said,0.273156,0.025889,0.700955,0.196817,0.068586,0.734597,0.509426,0.039224,0.451350,0.543241,0.030542,0.426217,0.000000,0.000000,0.0000
1,@VirginAmerica plus you've added commercials t...,1,virginamerica plu youv ad commerci experi tacki,0.166651,0.129347,0.704002,0.093975,0.041864,0.864161,0.118286,0.138499,0.743214,0.157750,0.125022,0.717228,0.000000,0.000000,0.0000
2,@VirginAmerica I didn't today... Must mean I n...,0,virginamerica didnt today must mean need take ...,0.332415,0.031368,0.636218,0.322555,0.057849,0.619596,0.446112,0.046278,0.507609,0.387729,0.044367,0.567905,-0.390625,0.687500,0.0000
3,@VirginAmerica it's really aggressive to blast...,2,virginamerica realli aggress blast obnoxi ente...,0.081795,0.084485,0.833720,0.062889,0.036372,0.900738,0.136733,0.116503,0.746763,0.088637,0.074582,0.836781,0.006250,0.350000,-0.2716
4,@VirginAmerica and it's a really big bad thing...,2,virginamerica realli big bad thing,0.074216,0.095449,0.830335,0.101453,0.035354,0.863194,0.098442,0.149364,0.752194,0.074206,0.128055,0.797739,-0.350000,0.383333,-0.5829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,@AmericanAir thank you we got on a different f...,1,americanair thank got differ flight chicago,0.401143,0.505183,0.093674,0.065390,0.839596,0.095014,0.104683,0.852461,0.042855,0.104315,0.864741,0.030945,0.000000,0.600000,0.3612
14636,@AmericanAir leaving over 20 minutes Late Flig...,2,americanair leav minut late flight warn commu...,0.004824,0.001856,0.993320,0.031886,0.011740,0.956374,0.004014,0.005126,0.990860,0.005531,0.004408,0.990062,-0.300000,0.600000,-0.4043
14637,@AmericanAir Please bring American Airlines to...,0,americanair pleas bring american airlin blackb...,0.682025,0.139579,0.178396,0.585465,0.089334,0.325201,0.481403,0.107839,0.410758,0.704197,0.086921,0.208882,0.000000,0.000000,0.3182
14638,"@AmericanAir you have my money, you change my ...",2,americanair money chang flight dont answer pho...,0.122450,0.023481,0.854069,0.152957,0.055705,0.791339,0.116851,0.031588,0.851561,0.059958,0.010421,0.929621,-0.125000,0.375000,0.5027


In [13]:
model_predicted_names

['linear_regression_prediction_neg',
 'linear_regression_prediction_neut',
 'linear_regression_prediction_pos',
 'gradient_boosted_classifier_prediction_neg',
 'gradient_boosted_classifier_prediction_neut',
 'gradient_boosted_classifier_prediction_pos',
 'XGBoost_prediction_neg',
 'XGBoost_prediction_neut',
 'XGBoost_prediction_pos',
 'Light_GBM_prediction_neg',
 'Light_GBM_prediction_neut',
 'Light_GBM_prediction_pos',
 'polarity',
 'subjectivity',
 'vader_sentiment']