# 03 - SVM
**Last Updated: 2022/04/12** <br>
**Support Vector Machine to predict the 6 sadness sub-emotions.**

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
#tf-idf 
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eric\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Packages that are typically not included in Jupyter and might need to be installed.
try:
    import pandas_profiling as pp
except:
    !pip install ipywidgets
    !pip install pandas-profiling
    import pandas_profiling as pp

## Loading Dataset

In [4]:
# Read in the database and csv file
anno2_r_path = "./02-anno2_r.xlsx"    #2-annotations, researcher focussed
anno2_p_path = "./02-anno2_p.xlsx"    #2-annotations, participant focussed
anno3_path = "./02-anno3.xlsx"    #3-annotations, balanced

df = pd.read_excel(anno2_p_path)
df

Unnamed: 0,ID,Tweet,ori_i,anno1_e,anno1_i,anno2_e,anno2_i
0,0,Depression sucks! #depression,0.958,depressed,2,depressed,2
1,1,Feeling worthless as always #depression,0.958,depressed,1,depressed,1
2,2,Feeling worthless as always,0.958,depressed,2,depressed,2
3,3,Im think ima lay in bed all day and sulk. Life...,0.934,depressed,4,depressed,4
4,4,So when I try I fail... and when I don't try.....,0.917,displeased,4,depressed,4
...,...,...,...,...,...,...,...
269,269,"Pops are joyless, soulless toys which look nea...",0.354,displeased,1,displeased,3
270,270,Why is it that we rejoice at a birth and griev...,0.354,displeased,2,depressed,3
271,271,Regret for the things we did can be tempered b...,0.354,guilty,3,hurt,1
272,272,In serious need of a nap,0.354,depressed,1,depressed,4


In [5]:
# Finding the number of similar annotations between p1_e and p1_2
df[df['anno1_e'] == 'displeased']['anno2_e'].value_counts()

displeased    82
depressed     36
hurt           3
guilty         1
grief          1
lonely         1
Name: anno2_e, dtype: int64

In [6]:
df_new = df[df['anno1_e'] == df['anno2_e']]
df_new.head()

Unnamed: 0,ID,Tweet,ori_i,anno1_e,anno1_i,anno2_e,anno2_i
0,0,Depression sucks! #depression,0.958,depressed,2,depressed,2
1,1,Feeling worthless as always #depression,0.958,depressed,1,depressed,1
2,2,Feeling worthless as always,0.958,depressed,2,depressed,2
3,3,Im think ima lay in bed all day and sulk. Life...,0.934,depressed,4,depressed,4
5,5,my life in one word is depressing,0.917,depressed,3,depressed,3


In [7]:
#Quick check that all 6 sub-emotions are represented
df_new['anno1_e'].unique()

array(['depressed', 'displeased', 'grief', 'lonely', 'hurt', 'guilty'],
      dtype=object)

## tf-idf Embedding

In [8]:
# Removing Stopwords
english_stop_words = set(stopwords.words('english'))

In [9]:
## Applying tf-idf - To Map tweets to numerical vetors
vectorizer = TfidfVectorizer(stop_words = english_stop_words)
X = vectorizer.fit_transform(df_new['Tweet'])
X

<195x1020 sparse matrix of type '<class 'numpy.float64'>'
	with 1644 stored elements in Compressed Sparse Row format>

In [10]:
df_with_features = pd.DataFrame(X.todense())
df_with_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df_with_features['label'] = df_new['anno1_e'].values

## Support Vector Machine (SVM) Model

In [12]:
svm_model = SVC(kernel = 'linear') # 0.57 # it should be changed to KNN, RandomForest
#knn_model = KNeighborsClassifier(n_neighbors = 15) # 0.51
#dtree_model = DecisionTreeClassifier(max_depth = 10, random_state = 101) # 0.54 # Do better for depressed percision & displeased recall
X = df_with_features.drop(['label'], axis = 1)
y = df_with_features['label']
cross_val_predict(svm_model, X, y, cv = 5)



array(['depressed', 'depressed', 'depressed', 'depressed', 'depressed',
       'displeased', 'displeased', 'depressed', 'depressed', 'depressed',
       'depressed', 'depressed', 'displeased', 'depressed', 'displeased',
       'depressed', 'displeased', 'displeased', 'depressed', 'depressed',
       'displeased', 'depressed', 'displeased', 'depressed', 'depressed',
       'depressed', 'depressed', 'displeased', 'depressed', 'depressed',
       'depressed', 'displeased', 'depressed', 'depressed', 'displeased',
       'displeased', 'depressed', 'depressed', 'displeased', 'displeased',
       'depressed', 'displeased', 'depressed', 'depressed', 'displeased',
       'depressed', 'displeased', 'grief', 'displeased', 'displeased',
       'depressed', 'displeased', 'depressed', 'displeased', 'depressed',
       'displeased', 'depressed', 'displeased', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased', 'depressed',
       'displeased', 'displeased', 'displeased', 'disp

In [13]:
X = df_with_features.drop(['label'], axis = 1)
y = df_with_features['label']
svm_model.fit(X, y)

## Apllying 5-fold cross validation
predict_svc = pd.Series(cross_val_predict(svm_model, X, y, cv = 5))

## Generating Confusoin Matrix
conf_matrix_svc = confusion_matrix(y, predict_svc, labels = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty'])
print(conf_matrix_svc)



[[59 23  0  0  0  0]
 [37 49  0  0  0  0]
 [ 5  3  0  0  0  0]
 [ 6  3  0  0  0  0]
 [ 4  3  0  0  2  0]
 [ 0  1  0  0  0  0]]


In [14]:
### How come it does not show the confussion matrix?
## Display the visualization of the Confusion Matrix.
fig = plt.figure(figsize = (16,6))

ax = sns.heatmap(conf_matrix_svc, annot = True, cmap = 'Blues')

ax.set_title('SVM Confusion Matrix \n');
ax.set_xlabel('\nPredicted Label')
ax.set_ylabel('True Label ');

## Ticket labels - List must be in alphabetical order
ax.xaxis.set_ticklabels(['depressed', 'displeased', 'grief', 'guilty', 'hurt', 'lonely' ])
ax.yaxis.set_ticklabels(['depressed', 'displeased', 'grief', 'guilty', 'hurt', 'lonely' ])

fig.savefig("svm_heatmap.png")

## Accuracy Scores

In [15]:
## Classification Report
target_names = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty']
print(classification_report(y, predict_svc, target_names = target_names))

              precision    recall  f1-score   support

  displeased       0.60      0.57      0.58        86
   depressed       0.53      0.72      0.61        82
        hurt       1.00      0.22      0.36         9
      lonely       0.00      0.00      0.00         1
       grief       0.00      0.00      0.00         8
      guilty       0.00      0.00      0.00         9

    accuracy                           0.56       195
   macro avg       0.35      0.25      0.26       195
weighted avg       0.53      0.56      0.53       195



  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
## Prediction
sentence = ["she has depression"]
X_test1 = vectorizer.transform(sentence)
#X_test1

svm_model.fit(X, y)
svm_model.predict(X_test1.todense())

array(['depressed'], dtype=object)