# 03 - Different Models
## DEPRECATED FILE

**Last Updated: 2022/04/05** <br>
Python Notebook contains 3 different models that predict the 6 sadness emotions. <br>
<br>
Superseded by other python notebooks beginning with '03' which each contain 1 of the 3 models here. The models in those files are using a better cleaned dataset and have been updated since 2022/04/05. **This file has not been deleted because it produced the graph and charts on our poster.**

# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eric\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Packages that are typically not included in Jupyter and might need to be installed.
try:
    import pandas_profiling as pp
except:
    !pip install ipywidgets
    !pip install pandas-profiling
    import pandas_profiling as pp

# Loading Dataset

In [4]:
## Read in the database and csv file
#anno2_r_path = "./02-anno2_r.xlsx"    #2-annotations, researcher focussed
#anno2_p_path = "./02-anno2_p.xlsx"
#anno3_path = "./02-anno3.xlsx"

#df = pd.read_excel(anno2_p_path)
#df

In [5]:
file_path = "./03-anno4.csv"
df = pd.read_csv(file_path, index_col='ID')

## Rename column names to shorten their names
df = df.rename(
    columns={
        "Research Paper Intensity": "ori_i",
        "Participant 1 Classified Emotion": "p1_e",
        "Participant 1 Classified Intensity" : "p1_i",
        "Participant 2 Classified Emotion": "p2_e",
        "Participant 2 Classified Intensity": "p2_i"
    }
)

#For easy reference later
emotion_cols = ['p1_e', 'p2_e']
intensity_cols = ['p1_i', 'p2_i']


In [6]:
## Finding the number of similar annotations between p1_e and p1_2
#df[df['anno1_e'] == 'displeased']['anno2_e'].value_counts()
df[df['p1_e'] == 'displeased']['p2_e'].value_counts()

# displeased : 88
# depressed : 67
# hurt : 12
# lonely : 12
# grief : 10
# guilty : 6

displeased    88
depressed      6
hurt           6
lonely         2
guilty         1
Name: p2_e, dtype: int64

In [7]:
#df[df['anno1_e'] == df['anno2_e']]

In [8]:
#df_new = df[df['anno1_e'] == df['anno2_e']]
df_new = df[df['p1_e'] == df['p2_e']]
df_new.head()

Unnamed: 0_level_0,Tweet,ori_i,p1_e,p1_i,p2_e,p2_i
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Depression sucks! #depression,0.958,depressed,5,depressed,4
3,Im think ima lay in bed all day and sulk. Life...,0.934,depressed,4,depressed,4
4,So when I try I fail... and when I don't try.....,0.917,displeased,3,displeased,4
5,my life in one word is depressing,0.917,depressed,4,depressed,4
6,Panic attacks are the worst. Feeling really si...,0.917,depressed,3,depressed,5


In [9]:
# Removing Stopwords
english_stop_words = set(stopwords.words('english'))

In [10]:
#TODO: Check for over-fitting

# tf-idf Embedding

In [11]:
## Applying tf-idf - To Map tweets to numerical vetors
vectorizer = TfidfVectorizer(stop_words = english_stop_words)
X = vectorizer.fit_transform(df_new['Tweet'])
X

<195x1053 sparse matrix of type '<class 'numpy.float64'>'
	with 1697 stored elements in Compressed Sparse Row format>

In [12]:
df_with_features = pd.DataFrame(X.todense())
df_with_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1043,1044,1045,1046,1047,1048,1049,1050,1051,1052
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
#df_with_features['label'] = df_new['anno1_e'].values
df_with_features['label'] = df_new['p1_e'].values
df_with_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1044,1045,1046,1047,1048,1049,1050,1051,1052,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,depressed
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,depressed
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,displeased
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,depressed
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,depressed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,guilty
191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,depressed
192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,displeased
193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,guilty


In [14]:
## we don't use it in this code though (in SVM)
## Create pca object
pca = PCA(n_components = 100)
df_dimension_reduced = pd.DataFrame(pca.fit_transform(df_with_features.drop(['label'], axis = 1)))
#df_dimension_reduced.columns = ['PC1', 'PC2']
df_dimension_reduced['label'] = df_with_features['label']
df_dimension_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,label
0,-0.111032,0.769564,0.254704,0.136737,0.026857,-0.043460,0.041377,0.066542,-0.004919,0.038366,...,-0.033603,-0.027778,0.006183,-0.000766,0.041112,0.003158,-0.011032,-0.047754,-0.023400,depressed
1,-0.027734,-0.026762,-0.167538,0.122474,0.080768,-0.085073,0.150438,0.013916,-0.059412,-0.033693,...,-0.029841,-0.024015,-0.016919,-0.010454,-0.036890,0.097553,0.036585,0.026887,-0.013950,depressed
2,-0.033705,-0.031979,-0.064263,0.102870,0.003095,0.041733,-0.121566,-0.147166,-0.057166,0.032405,...,-0.021326,0.035332,0.020358,0.000583,-0.042724,0.006830,0.026168,0.033443,-0.045220,displeased
3,0.008316,-0.030260,-0.075881,-0.070567,-0.271253,-0.027885,0.408140,-0.108277,-0.137777,0.022303,...,-0.056000,-0.037889,0.011643,0.035213,-0.016536,0.044525,-0.003096,-0.015755,0.028733,depressed
4,-0.059868,0.294648,0.095710,0.009272,0.058506,0.094978,-0.051613,-0.110510,-0.032501,0.032010,...,0.003653,0.027935,-0.028015,-0.091033,-0.014291,0.023388,-0.036564,-0.039257,-0.010545,depressed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,-0.036427,0.006252,-0.031628,-0.063461,-0.003848,-0.020419,-0.088007,-0.135199,0.031702,0.049475,...,0.127516,-0.065124,0.110378,0.065648,0.079968,0.179159,0.034372,0.019538,0.101696,guilty
191,-0.048573,-0.041874,-0.013056,-0.146635,-0.218258,-0.125578,-0.065646,0.266066,0.038721,0.063579,...,0.014811,0.002259,-0.004138,-0.105373,-0.020897,0.077265,-0.009871,0.009292,-0.030721,depressed
192,-0.021715,-0.017943,-0.007292,-0.037930,-0.020106,-0.017925,-0.018191,-0.008508,-0.010835,0.000330,...,-0.089303,-0.056420,0.116480,-0.024746,0.002040,-0.099268,0.015467,0.016320,0.034783,displeased
193,-0.039932,-0.045529,-0.056976,-0.056877,-0.170069,-0.096955,-0.359040,0.581094,-0.377304,0.316305,...,0.008371,-0.005677,0.004512,0.009618,0.006335,-0.004878,0.001309,0.001874,0.005822,guilty


# Support Vector Machine (SVM) Model

In [15]:
svm_model = SVC(kernel = 'linear') # 0.57 # it should be changed to KNN, RandomForest
#knn_model = KNeighborsClassifier(n_neighbors = 15) # 0.51
#dtree_model = DecisionTreeClassifier(max_depth = 10, random_state = 101) # 0.54 # Do better for depressed percision & displeased recall
X = df_with_features.drop(['label'], axis = 1)
y = df_with_features['label']
cross_val_predict(svm_model, X, y, cv = 5)

array(['depressed', 'depressed', 'displeased', 'depressed', 'displeased',
       'displeased', 'depressed', 'depressed', 'depressed', 'depressed',
       'depressed', 'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'displeased', 'displeased',
       'displeased', 'depressed', 'displeased', 'depressed', 'displeased',
       'displeased', 'depressed', 'depressed', 'depressed', 'displeased',
       'depressed', 'displeased', 'depressed', 'displeased', 'depressed',
       'displeased', 'displeased', 'displeased', 'displeased',
       'displeased', 'depressed', 'displeased', 'displeased',
       'displeased', 'grief', 'displeased', 'displeased', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'displeased', 'displeased',
       'depressed', 'displeased', 'displeased'

In [16]:
#profile = pp.ProfileReport(df_dimension_reduced)
#profile.to_file('EDAReport.html')

## Create a Confusion Matrix

In [17]:
X = df_with_features.drop(['label'], axis = 1)
y = df_with_features['label']
svm_model.fit(X, y)

## Apllying 5-fold cross validation
predict_svc = pd.Series(cross_val_predict(svm_model, X, y, cv = 5))

## Generating Confusoin Matrix
conf_matrix_svc = confusion_matrix(y, predict_svc, labels = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty'])
print(conf_matrix_svc)

[[79  9  0  0  0  0]
 [37 30  0  0  0  0]
 [11  1  0  0  0  0]
 [ 8  3  0  0  1  0]
 [ 8  0  0  0  2  0]
 [ 4  1  0  0  0  1]]


In [45]:
### How come it does not show the confussion matrix?
## Display the visualization of the Confusion Matrix.
fig = plt.figure(figsize = (16,6))

ax = sns.heatmap(conf_matrix_svc, annot = True, cmap = 'Blues')

ax.set_title('SVM Confusion Matrix \n');
ax.set_xlabel('\nPredicted Label')
ax.set_ylabel('True Label ');

## Ticket labels - List must be in alphabetical order
ax.xaxis.set_ticklabels(['depressed', 'displeased', 'grief', 'guilty', 'hurt', 'lonely' ])
ax.yaxis.set_ticklabels(['depressed', 'displeased', 'grief', 'guilty', 'hurt', 'lonely' ])

fig.savefig("heatmap.png")

## Report precision,  recall,  accuracy,  F1 score
[Resource](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html)

In [19]:
## Classification Report
target_names = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty']
print(classification_report(y, predict_svc, target_names = target_names))

              precision    recall  f1-score   support

  displeased       0.68      0.45      0.54        67
   depressed       0.54      0.90      0.67        88
        hurt       0.67      0.20      0.31        10
      lonely       1.00      0.17      0.29         6
       grief       0.00      0.00      0.00        12
      guilty       0.00      0.00      0.00        12

    accuracy                           0.57       195
   macro avg       0.48      0.29      0.30       195
weighted avg       0.54      0.57      0.51       195



  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
## Prediction
sentence = ["she has depression"]
X_test1 = vectorizer.transform(sentence)
#X_test1

svm_model.fit(X, y)
svm_model.predict(X_test1.todense())

array(['depressed'], dtype=object)

# KNN Model

In [21]:
#svm_model = SVC(kernel = 'linear') # 0.57 # it should be changed to KNN, RandomForest
knn_model = KNeighborsClassifier(n_neighbors = 15) # 0.51
#dtree_model = DecisionTreeClassifier(max_depth = 10, random_state = 101) # 0.54 # Do better for depressed percision & displeased recall
X = df_with_features.drop(['label'], axis = 1)
y = df_with_features['label']
cross_val_predict(knn_model, X, y, cv = 5)

array(['depressed', 'depressed', 'displeased', 'depressed', 'displeased',
       'displeased', 'depressed', 'depressed', 'depressed', 'depressed',
       'depressed', 'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased', 'depressed',
       'displeased', 'displeased', 'depressed', 'depressed', 'depressed',
       'displeased', 'depressed', 'displeased', 'displeased', 'depressed',
       'depressed', 'displeased', 'displeased', 'depressed', 'depressed',
       'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased', 'depressed',
       'displeased', 'displeased', 'displeased', 'displeased',
       'displeased', 'displeased', 'displeased', 'displeased',
       'displeased', 'depressed', 'displeased', 'displeased', 'depressed',
       'depressed', 'displeased', 'displeased', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased', 'depressed',
     

In [22]:
## Apllying 5-fold cross validation
predict_svc = pd.Series(cross_val_predict(knn_model, X, y, cv = 5))

## Generating Confusoin Matrix
conf_matrix_svc = confusion_matrix(y, predict_svc, labels = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty'])
print(conf_matrix_svc)

[[71 15  0  1  1  0]
 [40 27  0  0  0  0]
 [ 9  3  0  0  0  0]
 [ 9  3  0  0  0  0]
 [ 8  2  0  0  0  0]
 [ 5  1  0  0  0  0]]


In [23]:
## Classification Report
target_names = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty']
print(classification_report(y, predict_svc, target_names = target_names))

              precision    recall  f1-score   support

  displeased       0.53      0.40      0.46        67
   depressed       0.50      0.81      0.62        88
        hurt       0.00      0.00      0.00        10
      lonely       0.00      0.00      0.00         6
       grief       0.00      0.00      0.00        12
      guilty       0.00      0.00      0.00        12

    accuracy                           0.50       195
   macro avg       0.17      0.20      0.18       195
weighted avg       0.41      0.50      0.44       195



  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
## Prediction
sentence = ["she has depression"]
X_test1 = vectorizer.transform(sentence)
#X_test1

knn_model.fit(X, y)
knn_model.predict(X_test1.todense())

array(['depressed'], dtype=object)

# Decision Tree Model

In [75]:
#svm_model = SVC(kernel = 'linear') # 0.57 # it should be changed to KNN, RandomForest
#knn_model = KNeighborsClassifier(n_neighbors = 15) # 0.51
dtree_model = DecisionTreeClassifier(max_depth = 10, random_state = 101) # 0.54 # Do better for depressed percision & displeased recall
X = df_with_features.drop(['label'], axis = 1)
y = df_with_features['label']
cross_val_predict(dtree_model, X, y, cv = 5)

array(['depressed', 'displeased', 'displeased', 'displeased', 'depressed',
       'displeased', 'depressed', 'depressed', 'depressed', 'displeased',
       'depressed', 'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased', 'depressed',
       'displeased', 'displeased', 'displeased', 'depressed', 'depressed',
       'displeased', 'depressed', 'displeased', 'displeased',
       'displeased', 'depressed', 'displeased', 'displeased', 'grief',
       'displeased', 'displeased', 'displeased', 'depressed',
       'displeased', 'depressed', 'displeased', 'displeased',
       'displeased', 'depressed', 'displeased', 'displeased',
       'displeased', 'grief', 'displeased', 'displeased', 'displeased',
       'displeased', 'displeased', 'displeased', 'displeased',
       'displeased', 'hurt', 'depressed', 'displeased', 'displeased',
       'displeased', 'displeased', 'displeased', 'depressed',
       'displeased', 'displeased', 'disp

In [76]:
## Apllying 5-fold cross validation
predict_svc = pd.Series(cross_val_predict(dtree_model, X, y, cv = 5))

## Generating Confusoin Matrix
conf_matrix_svc = confusion_matrix(y, predict_svc, labels = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty'])
print(conf_matrix_svc)

[[82  2  0  0  4  0]
 [42 23  0  0  2  0]
 [10  1  0  0  1  0]
 [10  1  1  0  0  0]
 [ 9  0  0  0  1  0]
 [ 6  0  0  0  0  0]]


In [77]:
## Classification Report
target_names = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty']
print(classification_report(y, predict_svc, target_names = target_names))

              precision    recall  f1-score   support

  displeased       0.85      0.34      0.49        67
   depressed       0.52      0.93      0.66        88
        hurt       0.12      0.10      0.11        10
      lonely       0.00      0.00      0.00         6
       grief       0.00      0.00      0.00        12
      guilty       0.00      0.00      0.00        12

    accuracy                           0.54       195
   macro avg       0.25      0.23      0.21       195
weighted avg       0.53      0.54      0.47       195



  _warn_prf(average, modifier, msg_start, len(result))


In [78]:
## Prediction
sentence = ["she has depression"]
X_test1 = vectorizer.transform(sentence)
#X_test1

clf = dtree_model.fit(X, y)
dtree_model.predict(X_test1.todense())

array(['depressed'], dtype=object)

### Visualize the Decision Tree

# Packages
from sklearn import tree

try:
    import graphviz
except:
    !pip install graphviz
    import graphviz
    
try:
    import pydotplus as pdp
except:
    !pip install pydotplus
    import pydotplus as pdp
    
    
    
dTree_pic="decision_tree_pic.png"
tree_graph = tree.export_graphviz(dtree_model, out_file=None,
                                        feature_names = X.columns,
                                        class_names = y,
                                        filled=True)

#graph = pdp.graph_from_dot_data(tree_graph.getvalue())
#Image(graph.create_png())

#graph = graphviz.Source(tree_graph, format="png")
#graph.render("decision_tree_graphivz")
#graph

In [79]:
from sklearn import tree
!pip install pyqt5



In [80]:
#for choosing colors on the decision tree figure: https://stackoverflow.com/questions/70437840/

plt.figure(figsize=(50,25))

tree.plot_tree(dtree_model,
              feature_names = X.columns,
              class_names = y,
              filled=True,
              )
#tree.plot_tree(dtree_model)
#plt.show()

plt.title("Decision Tree", fontdict=({'fontsize':30}))
plt.savefig("test.png")

  plt.figure(figsize=(50,25))
