# Set Up Design Matrix

In [4]:
import pandas as pd

# read labeled Excel file
df = pd.read_excel("project_excel_sentences_labeled.xlsx", usecols="B:C", names=["sentences", "label"])

# get relative frequencies of positive, negative, and neutral labels
df.label.value_counts(normalize=True)

neutral     0.4585
negative    0.3100
positive    0.2315
Name: label, dtype: float64

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# transform sentences into an inverse document frequency matrix
# exclude terms that appear in more than 70% of our sentences
vectorizer = TfidfVectorizer(use_idf=False, norm="l2", stop_words="english", max_df=0.7)

# split into X and y
X = vectorizer.fit_transform(df.sentences)
y = df.label

X.shape, y.shape

((2000, 7055), (2000,))

# Modeling

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# to keep track of best results
summary = {} 

# see distribution of y_train
y_train.value_counts(normalize=True)

neutral     0.441875
negative    0.320625
positive    0.237500
Name: label, dtype: float64

## KNN

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier 
import pprint

# parameters to try on KNN
params = {"n_neighbors": [15, 17, 19, 21, 25, 30], 
          "weights": ["uniform", "distance"]}

# use grid search CV with various parameters to find optimal solution
grid = GridSearchCV(KNeighborsClassifier(), param_grid=params, cv=5)
grid.fit(X_train, y_train)

# get results from best knn parameters
knn_results = grid.best_params_

In [6]:
# classification report
# get predictions on test set
pred_knn = grid.predict(X_test)

# add results and classification report to summary
knn_results["report"] = classification_report(y_test, pred_knn)
summary["knn"] = knn_results
pprint.pprint(knn_results)

{'n_neighbors': 30,
 'report': '              precision    recall  f1-score   support\n'
           '\n'
           '    negative       0.47      0.56      0.51       107\n'
           '     neutral       0.62      0.57      0.60       210\n'
           '    positive       0.33      0.31      0.32        83\n'
           '\n'
           '    accuracy                           0.52       400\n'
           '   macro avg       0.47      0.48      0.48       400\n'
           'weighted avg       0.52      0.52      0.52       400\n',
 'weights': 'distance'}


## Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression

# create LR parameters
params = {"C": [5, 7, 9, 11],
         "penalty": ["l1"],
         "solver": ["liblinear"]}

# use grid search CV with various parameters to find optimal solution
gridLR = GridSearchCV(LogisticRegression(max_iter=400), 
                      param_grid=params, 
                      cv=5)
gridLR.fit(X_train, y_train)

# get results from best LR parameters
lr_results = gridLR.best_params_



In [8]:
# classification report 

# get predictions
pred_lr = gridLR.predict(X_test)

# get results and add to summary
lr_results["report"] = classification_report(y_test, pred_lr)
summary["lr"] = lr_results
pprint.pprint(lr_results)

{'C': 5,
 'penalty': 'l1',
 'report': '              precision    recall  f1-score   support\n'
           '\n'
           '    negative       0.48      0.60      0.54       107\n'
           '     neutral       0.63      0.61      0.62       210\n'
           '    positive       0.45      0.35      0.39        83\n'
           '\n'
           '    accuracy                           0.55       400\n'
           '   macro avg       0.52      0.52      0.52       400\n'
           'weighted avg       0.55      0.55      0.55       400\n',
 'solver': 'liblinear'}


## Multinomial Naive Bayes

In [9]:
from sklearn.naive_bayes import MultinomialNB

# create parameters for naive bayes
params = {"alpha": [0.001, 0.01, 0.1, 1, 3]}

gridMNB = GridSearchCV(MultinomialNB(), 
                       param_grid=params, 
                       cv=5)
gridMNB.fit(X_train, y_train)

# get results from MNB
mnb_results = gridMNB.best_params_

In [10]:
# get predictions
pred_mnb = gridMNB.predict(X_test)

# get results
mnb_results["report"] = classification_report(y_test, pred_mnb)
summary["mnb"] = mnb_results
pprint.pprint(mnb_results)

{'alpha': 0.1,
 'report': '              precision    recall  f1-score   support\n'
           '\n'
           '    negative       0.48      0.56      0.52       107\n'
           '     neutral       0.62      0.59      0.60       210\n'
           '    positive       0.39      0.37      0.38        83\n'
           '\n'
           '    accuracy                           0.54       400\n'
           '   macro avg       0.50      0.51      0.50       400\n'
           'weighted avg       0.54      0.54      0.54       400\n'}


## Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

# create parameters for random forest
params = {"min_samples_split": [3, 5, 10, 15],
         "criterion": ["entropy"]}

gridRF = GridSearchCV(RandomForestClassifier(n_estimators=400, max_depth=None), 
                      param_grid=params, 
                      cv=5)
gridRF.fit(X_train, y_train)

# get results from RF
rf_results = gridRF.best_params_

In [12]:
# make predictions
pred_rf = gridRF.predict(X_test)

# get results and add to summary
rf_results["report"] = classification_report(y_test, pred_rf)
summary["rf"] = rf_results
pprint.pprint(rf_results)

{'criterion': 'entropy',
 'min_samples_split': 15,
 'report': '              precision    recall  f1-score   support\n'
           '\n'
           '    negative       0.51      0.45      0.48       107\n'
           '     neutral       0.60      0.78      0.68       210\n'
           '    positive       0.68      0.28      0.39        83\n'
           '\n'
           '    accuracy                           0.59       400\n'
           '   macro avg       0.60      0.50      0.52       400\n'
           'weighted avg       0.59      0.59      0.57       400\n'}


## Linear SVM

In [13]:
from sklearn.svm import LinearSVC

# create parameters for linear SVM
params = {"C": [0.1, 0.5, 1, 5, 10],
         "penalty": ["l1", "l2"],
         "loss": ["squared_hinge"]}

gridSVM = GridSearchCV(LinearSVC(dual=False, tol=0.001),
                      param_grid=params,
                      cv=5)
gridSVM.fit(X_train, y_train)

# get results from linear SVM
svm_results = gridSVM.best_params_

In [14]:
# get predictions
pred_svm = gridSVM.predict(X_test)

# get results and add to summary
svm_results["report"] = classification_report(y_test, pred_svm)
summary["linearSVM"] = svm_results
pprint.pprint(svm_results)

{'C': 0.5,
 'loss': 'squared_hinge',
 'penalty': 'l1',
 'report': '              precision    recall  f1-score   support\n'
           '\n'
           '    negative       0.54      0.55      0.55       107\n'
           '     neutral       0.63      0.72      0.67       210\n'
           '    positive       0.54      0.31      0.40        83\n'
           '\n'
           '    accuracy                           0.59       400\n'
           '   macro avg       0.57      0.53      0.54       400\n'
           'weighted avg       0.59      0.59      0.58       400\n'}


## Kernalized SVM

In [15]:
from sklearn.svm import SVC

# specify parameters for SVC
params = {"C": [1, 5, 7, 9],
         "kernel": ["rbf", "poly", "linear", "sigmoid"]}

gridSVM = GridSearchCV(SVC(),
                      param_grid=params,
                      cv=5)
gridSVM.fit(X_train, y_train)

# get best parameters from SVM
svm_results = gridSVM.best_params_

In [16]:
# get predictions
pred_ker_svm = gridSVM.predict(X_test)

# get results and add to summary
svm_results["report"] = classification_report(y_test, pred_ker_svm)
summary["SVM"] = svm_results
pprint.pprint(svm_results)

{'C': 1,
 'kernel': 'linear',
 'report': '              precision    recall  f1-score   support\n'
           '\n'
           '    negative       0.52      0.55      0.54       107\n'
           '     neutral       0.61      0.69      0.65       210\n'
           '    positive       0.51      0.31      0.39        83\n'
           '\n'
           '    accuracy                           0.57       400\n'
           '   macro avg       0.55      0.52      0.52       400\n'
           'weighted avg       0.57      0.57      0.57       400\n'}


## Neural Network - MLP

In [17]:
from sklearn.neural_network import MLPClassifier

# specify parameters to try
#### long training time. The parameters below were tested. `optimal_params` below provide the best results. ####
# params = {"hidden_layer_sizes": [(10, ), (20, ), (30, )],
#          "activation": ["relu", "identity", "logistic", "tanh"],
#          "solver": ["adam", "sgd", "lbfgs"],
#          "alpha": [0.00001, 0.0001, 0.01, 1]}

optimal_params = {"hidden_layer_sizes": [(30, )],
         "activation": ["identity"],
         "solver": ["lbfgs"],
         "alpha": [0.0001]}

gridMLP = GridSearchCV(MLPClassifier(max_iter=300), 
                      param_grid=optimal_params,
                      cv=5)
gridMLP.fit(X_train, y_train)

# get best parameters from MLP
mlp_results = gridMLP.best_params_

In [18]:
# get predictions
pred_mlp = gridMLP.predict(X_test)

# get results and add to summary
mlp_results["report"] = classification_report(y_test, pred_mlp)
summary["MLP"] = mlp_results
pprint.pprint(mlp_results)

{'activation': 'identity',
 'alpha': 0.0001,
 'hidden_layer_sizes': (30,),
 'report': '              precision    recall  f1-score   support\n'
           '\n'
           '    negative       0.56      0.52      0.54       107\n'
           '     neutral       0.63      0.70      0.66       210\n'
           '    positive       0.43      0.35      0.38        83\n'
           '\n'
           '    accuracy                           0.58       400\n'
           '   macro avg       0.54      0.52      0.53       400\n'
           'weighted avg       0.57      0.58      0.57       400\n',
 'solver': 'lbfgs'}


## Gradient Boosting

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

# parameters to be tested
params = {"max_depth": [3, 7, 11]}

gridGB = GridSearchCV(GradientBoostingClassifier(n_estimators=400, learning_rate=0.1), 
                     param_grid=params,
                     cv=5)
gridGB.fit(X_train, y_train)

# get best parameters
gb_results = gridGB.best_params_

In [20]:
# get predictions
pred_gb = gridGB.predict(X_test)

# get results and add to summary
gb_results["report"] = classification_report(y_test, pred_gb)
summary["GB"] = gb_results
pprint.pprint(gb_results)

{'max_depth': 7,
 'report': '              precision    recall  f1-score   support\n'
           '\n'
           '    negative       0.49      0.56      0.52       107\n'
           '     neutral       0.65      0.68      0.67       210\n'
           '    positive       0.61      0.43      0.51        83\n'
           '\n'
           '    accuracy                           0.60       400\n'
           '   macro avg       0.58      0.56      0.57       400\n'
           'weighted avg       0.60      0.60      0.60       400\n'}


## Results

In [27]:
for model in summary:
    print(f"{model}:\n{summary[model]['report']}")


knn:
              precision    recall  f1-score   support

    negative       0.47      0.56      0.51       107
     neutral       0.62      0.57      0.60       210
    positive       0.33      0.31      0.32        83

    accuracy                           0.52       400
   macro avg       0.47      0.48      0.48       400
weighted avg       0.52      0.52      0.52       400

lr:
              precision    recall  f1-score   support

    negative       0.48      0.60      0.54       107
     neutral       0.63      0.61      0.62       210
    positive       0.45      0.35      0.39        83

    accuracy                           0.55       400
   macro avg       0.52      0.52      0.52       400
weighted avg       0.55      0.55      0.55       400

mnb:
              precision    recall  f1-score   support

    negative       0.48      0.56      0.52       107
     neutral       0.62      0.59      0.60       210
    positive       0.39      0.37      0.38        83

    ac

Gradient Boosting achieved the best accuracy score.

# Test on our own sentences

In [2]:
test_sentences = ["We are hoping this is a happy and nice sentence",
                 "I want this to be a bad and gross sentence",
                 "There is great weather outside today",
                 "The weather is dull and rainy outside",
                 "We are using Python for machine learning",
                 "The summer is my favorite time of year",
                 "We learned many new topics in this course",
                 "I'm very excited to play with my dog today",
                 "I am annoyed at my bad performance",
                 "The date is May 3, 2022"]
hand_labels = ["positive", "negative", "positive", "negative", "neutral", "positive", 
               "neutral", "positive", "negative", "neutral"]
len(test_sentences)

10

In [6]:
# use vectorizer to transform sentences into tfidf matrix
testX = vectorizer.transform(test_sentences)
testX

<10x7055 sparse matrix of type '<class 'numpy.float64'>'
	with 30 stored elements in Compressed Sparse Row format>

In [15]:
# use gradient boosting to predict polarity of sentences
pred_testX = gridGB.predict(testX)

# view predictions
test_df = pd.DataFrame({"Sentences": test_sentences, "hand_labeled": hand_labels, "Predictions": pred_testX})
test_df

Unnamed: 0,Sentences,hand_labeled,Predictions
0,We are hoping this is a happy and nice sentence,positive,positive
1,I want this to be a bad and gross sentence,negative,negative
2,There is great weather outside today,positive,positive
3,The weather is dull and rainy outside,negative,neutral
4,We are using Python for machine learning,neutral,neutral
5,The summer is my favorite time of year,positive,neutral
6,We learned many new topics in this course,neutral,neutral
7,I'm very excited to play with my dog today,positive,neutral
8,I am annoyed at my bad performance,negative,negative
9,"The date is May 3, 2022",neutral,neutral


In [16]:
print(classification_report(test_df.hand_labeled, test_df.Predictions))

              precision    recall  f1-score   support

    negative       1.00      0.67      0.80         3
     neutral       0.50      1.00      0.67         3
    positive       1.00      0.50      0.67         4

    accuracy                           0.70        10
   macro avg       0.83      0.72      0.71        10
weighted avg       0.85      0.70      0.71        10



## Comparison to Textblob Sentiment Classifier

In [19]:
from textblob import TextBlob

# use TextBlob to get polarity
def textblob_label(sentence):
    polarity = TextBlob(sentence).polarity
    # use 0.5 thresholds for positive, negative, and neutral
    if polarity >= 0.5:
        return "positive"
    elif polarity <= -0.5:
        return "negative"
    else:
        return "neutral"

# add textblob labels to df
test_df["textblob"] = test_df.Sentences.apply(lambda x: textblob_label(x)) 
test_df

Unnamed: 0,Sentences,hand_labeled,Predictions,textblob
0,We are hoping this is a happy and nice sentence,positive,positive,positive
1,I want this to be a bad and gross sentence,negative,negative,neutral
2,There is great weather outside today,positive,positive,neutral
3,The weather is dull and rainy outside,negative,neutral,neutral
4,We are using Python for machine learning,neutral,neutral,neutral
5,The summer is my favorite time of year,positive,neutral,positive
6,We learned many new topics in this course,neutral,neutral,neutral
7,I'm very excited to play with my dog today,positive,neutral,neutral
8,I am annoyed at my bad performance,negative,negative,negative
9,"The date is May 3, 2022",neutral,neutral,neutral


In [20]:
# TextBlob results
print(classification_report(test_df.hand_labeled, test_df.textblob))

              precision    recall  f1-score   support

    negative       1.00      0.33      0.50         3
     neutral       0.43      1.00      0.60         3
    positive       1.00      0.50      0.67         4

    accuracy                           0.60        10
   macro avg       0.81      0.61      0.59        10
weighted avg       0.83      0.60      0.60        10

