# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
!pip install pandas-profiling
!pip install ipywidgets
import pandas_profiling as pp
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns



# Loading Dataset

In [2]:
## Read in the database and csv file
file_path = "./03-anno4.csv"
df = pd.read_csv(file_path, index_col='ID')

#df

In [3]:
## Reformat the dataframe

## Rename column names to shorten their names
df = df.rename(
    columns={
        "Research Paper Intensity": "ori_i",
        "Participant 1 Classified Emotion": "p1_e",
        "Participant 1 Classified Intensity" : "p1_i",
        "Participant 2 Classified Emotion": "p2_e",
        "Participant 2 Classified Intensity": "p2_i"
    }
)

#For easy reference later
emotion_cols = ['p1_e', 'p2_e']
intensity_cols = ['p1_i', 'p2_i']

In [4]:
## Finding the number of similar annotations between p1_e and p1_2
df[df['p1_e'] == 'displeased']['p2_e'].value_counts()
# displeased : 88
# depressed : 67
# hurt : 12
# lonely : 12
# grief : 10
# guilty : 6

displeased    88
depressed      6
hurt           6
lonely         2
guilty         1
Name: p2_e, dtype: int64

In [5]:
df[df['p1_e'] == df['p2_e']]

Unnamed: 0_level_0,Tweet,ori_i,p1_e,p1_i,p2_e,p2_i
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Depression sucks! #depression,0.958,depressed,5,depressed,4
3,Im think ima lay in bed all day and sulk. Life...,0.934,depressed,4,depressed,4
4,So when I try I fail... and when I don't try.....,0.917,displeased,3,displeased,4
5,my life in one word is depressing,0.917,depressed,4,depressed,4
6,Panic attacks are the worst. Feeling really si...,0.917,depressed,3,depressed,5
...,...,...,...,...,...,...
266,"Like, 'loon attic'. Luckily it quickly sunk in...",0.375,guilty,2,guilty,2
267,I'm too sober time to get shit faced,0.360,depressed,4,depressed,4
269,"Pops are joyless, soulless toys which look nea...",0.354,displeased,3,displeased,3
271,Regret for the things we did can be tempered b...,0.354,guilty,4,guilty,3


In [6]:
df_new = df[df['p1_e'] == df['p2_e']]
df_new.head()

Unnamed: 0_level_0,Tweet,ori_i,p1_e,p1_i,p2_e,p2_i
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Depression sucks! #depression,0.958,depressed,5,depressed,4
3,Im think ima lay in bed all day and sulk. Life...,0.934,depressed,4,depressed,4
4,So when I try I fail... and when I don't try.....,0.917,displeased,3,displeased,4
5,my life in one word is depressing,0.917,depressed,4,depressed,4
6,Panic attacks are the worst. Feeling really si...,0.917,depressed,3,depressed,5


In [7]:
# Removing Stopwords
english_stop_words = set(stopwords.words('english'))

# tf-idf Embedding

In [8]:
## Applying tf-idf - To Map tweets to numerical vetors
vectorizer = TfidfVectorizer(stop_words = english_stop_words)
X = vectorizer.fit_transform(df_new['Tweet'])
X

<195x1053 sparse matrix of type '<class 'numpy.float64'>'
	with 1697 stored elements in Compressed Sparse Row format>

In [9]:
df_with_features = pd.DataFrame(X.todense())
df_with_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1043,1044,1045,1046,1047,1048,1049,1050,1051,1052
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
df_with_features['label'] = df_new['p1_e'].values
df_with_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1044,1045,1046,1047,1048,1049,1050,1051,1052,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,depressed
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,depressed
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,displeased
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,depressed
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,depressed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,guilty
191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,depressed
192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,displeased
193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,guilty


In [11]:
## we don't use it in this code though (in SVM)
## Create pca object
pca = PCA(n_components = 100)
df_dimension_reduced = pd.DataFrame(pca.fit_transform(df_with_features.drop(['label'], axis = 1)))
#df_dimension_reduced.columns = ['PC1', 'PC2']
df_dimension_reduced['label'] = df_with_features['label']
df_dimension_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,label
0,-0.111018,0.769689,0.254640,0.136379,0.027120,-0.042943,0.041501,0.067008,-0.004453,0.039529,...,-0.016054,0.050640,0.034541,0.004175,-0.021767,0.030987,0.002258,0.004387,-0.032516,depressed
1,-0.027729,-0.026785,-0.167467,0.121849,0.082007,-0.086758,0.149561,0.013522,-0.059180,-0.033529,...,0.105516,0.041566,-0.057542,-0.004717,0.040604,0.085265,-0.003065,-0.002614,-0.071056,depressed
2,-0.033676,-0.032109,-0.064116,0.103464,0.000686,0.040574,-0.120847,-0.146630,-0.056175,0.029709,...,-0.029485,-0.016726,-0.167868,-0.050826,0.071107,0.081852,-0.063425,-0.065506,0.119508,displeased
3,0.008306,-0.030552,-0.075834,-0.070768,-0.269586,-0.028362,0.407830,-0.108463,-0.137747,0.021009,...,0.055327,0.065644,-0.038781,0.075595,-0.015036,-0.005519,0.004701,-0.002218,-0.084870,depressed
4,-0.059862,0.294760,0.095784,0.009010,0.058160,0.094249,-0.050561,-0.110489,-0.033732,0.031137,...,-0.087304,0.043185,-0.051188,0.039589,-0.043557,0.065382,0.087788,-0.063145,-0.006270,depressed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,-0.036289,0.006186,-0.031788,-0.063920,-0.002722,-0.018197,-0.093840,-0.133836,0.028881,0.048373,...,0.038509,0.017862,0.145231,-0.007857,0.053254,-0.074224,0.011918,-0.050220,0.178798,guilty
191,-0.048590,-0.041897,-0.013142,-0.146442,-0.220293,-0.125013,-0.064531,0.266751,0.041842,0.062144,...,0.076206,-0.024993,-0.004399,0.032836,0.034272,0.013380,0.019615,-0.014564,0.061100,depressed
192,-0.021809,-0.018173,-0.007024,-0.038810,-0.022002,-0.017324,-0.016453,-0.006770,-0.007710,-0.000114,...,0.057832,-0.087886,0.015263,0.099480,0.046780,0.108001,-0.030455,-0.030782,0.030864,displeased
193,-0.039928,-0.045542,-0.057011,-0.057038,-0.170163,-0.096075,-0.356097,0.585006,-0.374650,0.316149,...,0.012579,0.015452,0.010606,-0.003848,0.010123,-0.018460,-0.006054,-0.000788,-0.000231,guilty


# Support Vector Machine (SVM) Model

In [12]:
svm_model = SVC(kernel = 'linear') # 0.57 # it should be changed to KNN, RandomForest
#knn_model = KNeighborsClassifier(n_neighbors = 15) # 0.51
#dtree_model = DecisionTreeClassifier(max_depth = 10, random_state = 101) # 0.54 # Do better for depressed percision & displeased recall
X = df_with_features.drop(['label'], axis = 1)
y = df_with_features['label']
cross_val_predict(svm_model, X, y, cv = 5)

array(['depressed', 'depressed', 'displeased', 'depressed', 'displeased',
       'displeased', 'depressed', 'depressed', 'depressed', 'depressed',
       'depressed', 'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'displeased', 'displeased',
       'displeased', 'depressed', 'displeased', 'depressed', 'displeased',
       'displeased', 'depressed', 'depressed', 'depressed', 'displeased',
       'depressed', 'displeased', 'depressed', 'displeased', 'depressed',
       'displeased', 'displeased', 'displeased', 'displeased',
       'displeased', 'depressed', 'displeased', 'displeased',
       'displeased', 'grief', 'displeased', 'displeased', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'displeased', 'displeased',
       'depressed', 'displeased', 'displeased'

In [13]:
#profile = pp.ProfileReport(df_dimension_reduced)
#profile.to_file('EDAReport.html')

## Create a Confusion Matrix

In [14]:
X = df_with_features.drop(['label'], axis = 1)
y = df_with_features['label']
svm_model.fit(X, y)

## Apllying 5-fold cross validation
predict_svc = pd.Series(cross_val_predict(svm_model, X, y, cv = 5))

## Generating Confusoin Matrix
conf_matrix_svc = confusion_matrix(y, predict_svc, labels = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty'])
print(conf_matrix_svc)

[[79  9  0  0  0  0]
 [37 30  0  0  0  0]
 [11  1  0  0  0  0]
 [ 8  3  0  0  1  0]
 [ 8  0  0  0  2  0]
 [ 4  1  0  0  0  1]]


In [15]:
### How come it does not show the confussion matrix?
"""
## Display the visualization of the Confusion Matrix.
fig = plt.figure(figsize = (16,6))

ax = sns.heatmap(conf_matrix_svc, annot = True, cmap = 'Blues')

ax.set_title('SVM Confusion Matrix \n');
ax.set_xlabel('\nPredicted Label')
ax.set_ylabel('True Label ');

## Ticket labels - List must be in alphabetical order
ax.xaxis.set_ticklabels(['depressed', 'displeased', 'grief', 'guilty', 'hurt', 'lonely' ])
ax.yaxis.set_ticklabels(['depressed', 'displeased', 'grief', 'guilty', 'hurt', 'lonely' ])
"""

"\n## Display the visualization of the Confusion Matrix.\nfig = plt.figure(figsize = (16,6))\n\nax = sns.heatmap(conf_matrix_svc, annot = True, cmap = 'Blues')\n\nax.set_title('SVM Confusion Matrix \n');\nax.set_xlabel('\nPredicted Label')\nax.set_ylabel('True Label ');\n\n## Ticket labels - List must be in alphabetical order\nax.xaxis.set_ticklabels(['depressed', 'displeased', 'grief', 'guilty', 'hurt', 'lonely' ])\nax.yaxis.set_ticklabels(['depressed', 'displeased', 'grief', 'guilty', 'hurt', 'lonely' ])\n"

## Report precision,  recall,  accuracy,  F1 score
[Resource](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html)

In [16]:
## Classification Report
target_names = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty']
print(classification_report(y, predict_svc, target_names = target_names))

              precision    recall  f1-score   support

  displeased       0.68      0.45      0.54        67
   depressed       0.54      0.90      0.67        88
        hurt       0.67      0.20      0.31        10
      lonely       1.00      0.17      0.29         6
       grief       0.00      0.00      0.00        12
      guilty       0.00      0.00      0.00        12

    accuracy                           0.57       195
   macro avg       0.48      0.29      0.30       195
weighted avg       0.54      0.57      0.51       195



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
## Prediction
sentence = ["she has depression"]
X_test1 = vectorizer.transform(sentence)
#X_test1

svm_model.fit(X, y)
svm_model.predict(X_test1.todense())



array(['depressed'], dtype=object)

# KNN Model

In [18]:
#svm_model = SVC(kernel = 'linear') # 0.57 # it should be changed to KNN, RandomForest
knn_model = KNeighborsClassifier(n_neighbors = 15) # 0.51
#dtree_model = DecisionTreeClassifier(max_depth = 10, random_state = 101) # 0.54 # Do better for depressed percision & displeased recall
X = df_with_features.drop(['label'], axis = 1)
y = df_with_features['label']
cross_val_predict(knn_model, X, y, cv = 5)

array(['depressed', 'depressed', 'displeased', 'depressed', 'displeased',
       'displeased', 'depressed', 'depressed', 'depressed', 'depressed',
       'depressed', 'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased', 'depressed',
       'displeased', 'displeased', 'depressed', 'depressed', 'depressed',
       'displeased', 'depressed', 'displeased', 'displeased', 'depressed',
       'depressed', 'displeased', 'displeased', 'displeased', 'depressed',
       'displeased', 'displeased', 'depressed', 'displeased', 'depressed',
       'displeased', 'depressed', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'displeased', 'displeased',
       'displeased', 'displeased', 'displeased', 'depressed', 'depressed',
       'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'displeased', 'displeased',
       'displeased', 'depressed', 'displeased', 'depressed', 'disp

In [19]:
## Apllying 5-fold cross validation
predict_svc = pd.Series(cross_val_predict(knn_model, X, y, cv = 5))

## Generating Confusoin Matrix
conf_matrix_svc = confusion_matrix(y, predict_svc, labels = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty'])
print(conf_matrix_svc)

[[66 21  0  0  1  0]
 [34 33  0  0  0  0]
 [ 6  6  0  0  0  0]
 [ 6  6  0  0  0  0]
 [ 7  3  0  0  0  0]
 [ 5  1  0  0  0  0]]


In [20]:
## Classification Report
target_names = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty']
print(classification_report(y, predict_svc, target_names = target_names))

              precision    recall  f1-score   support

  displeased       0.47      0.49      0.48        67
   depressed       0.53      0.75      0.62        88
        hurt       0.00      0.00      0.00        10
      lonely       0.00      0.00      0.00         6
       grief       0.00      0.00      0.00        12
      guilty       0.00      0.00      0.00        12

    accuracy                           0.51       195
   macro avg       0.17      0.21      0.18       195
weighted avg       0.40      0.51      0.45       195



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
## Prediction
sentence = ["she has depression"]
X_test1 = vectorizer.transform(sentence)
#X_test1

knn_model.fit(X, y)
knn_model.predict(X_test1.todense())



array(['depressed'], dtype=object)

# Decision Tree Model

In [22]:
#svm_model = SVC(kernel = 'linear') # 0.57 # it should be changed to KNN, RandomForest
#knn_model = KNeighborsClassifier(n_neighbors = 15) # 0.51
dtree_model = DecisionTreeClassifier(max_depth = 10, random_state = 101) # 0.54 # Do better for depressed percision & displeased recall
X = df_with_features.drop(['label'], axis = 1)
y = df_with_features['label']
cross_val_predict(dtree_model, X, y, cv = 5)

array(['depressed', 'displeased', 'displeased', 'displeased', 'depressed',
       'displeased', 'depressed', 'depressed', 'depressed', 'displeased',
       'depressed', 'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased', 'depressed',
       'displeased', 'displeased', 'displeased', 'depressed', 'depressed',
       'displeased', 'depressed', 'displeased', 'displeased',
       'displeased', 'depressed', 'displeased', 'displeased', 'grief',
       'displeased', 'displeased', 'displeased', 'depressed',
       'displeased', 'depressed', 'displeased', 'displeased',
       'displeased', 'depressed', 'displeased', 'displeased',
       'displeased', 'grief', 'displeased', 'displeased', 'displeased',
       'displeased', 'displeased', 'displeased', 'displeased',
       'displeased', 'hurt', 'depressed', 'displeased', 'displeased',
       'displeased', 'displeased', 'displeased', 'depressed',
       'displeased', 'displeased', 'disp

In [23]:
## Apllying 5-fold cross validation
predict_svc = pd.Series(cross_val_predict(dtree_model, X, y, cv = 5))

## Generating Confusoin Matrix
conf_matrix_svc = confusion_matrix(y, predict_svc, labels = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty'])
print(conf_matrix_svc)

[[82  2  0  0  4  0]
 [42 23  0  0  2  0]
 [10  1  0  0  1  0]
 [10  1  1  0  0  0]
 [ 9  0  0  0  1  0]
 [ 6  0  0  0  0  0]]


In [24]:
## Classification Report
target_names = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty']
print(classification_report(y, predict_svc, target_names = target_names))

              precision    recall  f1-score   support

  displeased       0.85      0.34      0.49        67
   depressed       0.52      0.93      0.66        88
        hurt       0.12      0.10      0.11        10
      lonely       0.00      0.00      0.00         6
       grief       0.00      0.00      0.00        12
      guilty       0.00      0.00      0.00        12

    accuracy                           0.54       195
   macro avg       0.25      0.23      0.21       195
weighted avg       0.53      0.54      0.47       195



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
## Prediction
sentence = ["she has depression"]
X_test1 = vectorizer.transform(sentence)
#X_test1

dtree_model.fit(X, y)
dtree_model.predict(X_test1.todense())



array(['depressed'], dtype=object)