In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import xlrd

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the labelled dataset from the Excel workbook.
# `sep` is a CSV-reader option and is not a `read_excel` parameter: pandas
# versions whose `read_excel` does not take catch-all keyword arguments raise
# TypeError on it, and it has no bearing on how a workbook is parsed, so it is
# not passed here.
data_path = 'SentimentCleaning.xlsx'
train = pd.read_excel(data_path)

In [4]:
# Word-level TF-IDF features: learn the vocabulary and term weights from the
# 'remove' column and encode that same column in one pass.
# NOTE(review): 'remove' presumably holds the cleaned text — confirm against
# the workbook.
tf = TfidfVectorizer()
documents = train['remove']
text_tf = tf.fit_transform(documents)

In [12]:
#Functions for using different classifying models
def train_model(classifier, feature_vector_train, label, feature_vector_valid,valid_y, is_neural_net=False):
    """Fit ``classifier`` on the training split and score it on the validation split.

    Parameters
    ----------
    classifier
        Model object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
        in place.
    feature_vector_train
        Training features passed to ``fit``.
    label
        Training targets passed to ``fit``.
    feature_vector_valid
        Validation features passed to ``predict``.
    valid_y
        Ground-truth targets for ``feature_vector_valid``.
    is_neural_net : bool, default False
        When True, the output of ``predict`` is reduced with
        ``argmax(axis=-1)`` before it is scored.

    Returns
    -------
    The value of ``accuracy_score(valid_y, predictions)``.
    """
    # Fit in place and keep using the model object itself. The return value of
    # ``fit`` is deliberately not rebound to ``classifier``: scikit-learn
    # estimators return ``self``, but Keras-style models return a training
    # history object that has no ``predict`` method.
    classifier.fit(feature_vector_train, label)

    # Predict the labels of the validation split.
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        # Collapse the last axis of the prediction array to the index of its
        # largest entry so it can be compared with valid_y.
        predictions = predictions.argmax(axis=-1)

    return accuracy_score(valid_y, predictions)

In [13]:
# One seed shared by the split and by the stochastic estimators, so that a
# Restart & Run All reproduces the same accuracy table.
SEED = 123

#split train and test set from train dataset
# NOTE(review): text_tf comes from a vectorizer fitted on every row of `train`
# before this split, so its term weights were computed with the held-out
# documents included. Confirm this is acceptable, or fit the vectorizer on the
# training rows only.
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    text_tf, train['Mean'], test_size=0.3, random_state=SEED)

# Estimators compared on the same Word Level TF IDF split. LinearSVC and
# RandomForestClassifier are seeded explicitly; left unseeded they can give a
# different accuracy on every run.
classifiers = [
    MultinomialNB(),
    LogisticRegression(solver='lbfgs', multi_class='auto'),
    LinearSVC(random_state=SEED),
    RandomForestClassifier(n_estimators=100, random_state=SEED),
]

# Rebuilt from scratch on every run so re-executing the cell never duplicates rows.
entries = []
for model in classifiers:
    accuracy = train_model(model, X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf)
    # Record the class name of the instance that was actually trained.
    entries.append((type(model).__name__, "Word Level TF IDF Vectors", accuracy))

#transform list to dataframe
df1 = pd.DataFrame(entries, columns=['model_name', 'Vectors', 'accuracy'])
display(df1)

Unnamed: 0,model_name,Vectors,accuracy
0,MultinomialNB,Word Level TF IDF Vectors,0.559524
1,LogisticRegression,Word Level TF IDF Vectors,0.571429
2,LinearSVC,Word Level TF IDF Vectors,0.654762
3,RandomForestClassifier,Word Level TF IDF Vectors,0.571429
