# 03 - SVM
**Last Updated: 2022/04/12** <br>
**Support Vector Machine to predict the 6 sadness sub-emotions.**

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.utils import resample

In [2]:
#tf-idf 
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/keemia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Packages that are typically not included in Jupyter and might need to be installed.
try:
    import pandas_profiling as pp
except:
    !pip install ipywidgets
    !pip install pandas-profiling
    import pandas_profiling as pp

In [4]:
!pip install openpyxl



## Loading Dataset

In [5]:
# Read in the database and csv file
anno2_r_path = "./02-anno2_r.xlsx"    #2-annotations, researcher focussed
anno2_p_path = "./02-anno2_p.xlsx"    #2-annotations, participant focussed
anno3_path = "./02-anno3.xlsx"    #3-annotations, balanced

df = pd.read_excel(anno2_r_path)
df

Unnamed: 0,ID,Tweet,ori_i,anno1_e,anno1_i,anno2_e,anno2_i
0,0,Depression sucks! #depression,0.958,depressed,3,depressed,2
1,1,Feeling worthless as always #depression,0.958,depressed,4,depressed,1
2,2,Feeling worthless as always,0.958,depressed,4,depressed,2
3,3,Im think ima lay in bed all day and sulk. Life...,0.934,depressed,5,depressed,4
4,4,So when I try I fail... and when I don't try.....,0.917,depressed,4,displeased,4
...,...,...,...,...,...,...,...
269,269,"Pops are joyless, soulless toys which look nea...",0.354,displeased,3,displeased,1
270,270,Why is it that we rejoice at a birth and griev...,0.354,depressed,3,displeased,2
271,271,Regret for the things we did can be tempered b...,0.354,hurt,1,guilty,3
272,272,In serious need of a nap,0.354,depressed,4,depressed,1


In [6]:
df_new = df[df['anno1_e'] == df['anno2_e']]
df_new.head()

Unnamed: 0,ID,Tweet,ori_i,anno1_e,anno1_i,anno2_e,anno2_i
0,0,Depression sucks! #depression,0.958,depressed,3,depressed,2
1,1,Feeling worthless as always #depression,0.958,depressed,4,depressed,1
2,2,Feeling worthless as always,0.958,depressed,4,depressed,2
3,3,Im think ima lay in bed all day and sulk. Life...,0.934,depressed,5,depressed,4
5,5,my life in one word is depressing,0.917,depressed,3,depressed,3


In [7]:
#Quick check that all 6 sub-emotions are represented
df_new['anno1_e'].unique()

array(['depressed', 'displeased', 'grief', 'hurt', 'lonely', 'guilty'],
      dtype=object)

In [8]:
# Finding the number of similar annotations between p1_e and p1_2
df_new['anno1_e'].value_counts()

displeased    77
depressed     76
hurt           8
grief          7
lonely         6
guilty         1
Name: anno1_e, dtype: int64

## tf-idf Embedding

In [9]:
# Removing Stopwords
english_stop_words = set(stopwords.words('english'))

In [10]:
## Applying tf-idf - To Map tweets to numerical vetors
vectorizer = TfidfVectorizer(stop_words = english_stop_words)
X = vectorizer.fit_transform(df_new['Tweet'])
X

<175x938 sparse matrix of type '<class 'numpy.float64'>'
	with 1482 stored elements in Compressed Sparse Row format>

In [11]:
df_with_features = pd.DataFrame(X.todense())
df_with_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,928,929,930,931,932,933,934,935,936,937
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
df_with_features['label'] = df_new['anno1_e'].values

## Support Vector Machine (SVM) Model

In [13]:
svm_model = SVC(kernel = 'linear') # 0.57 # it should be changed to KNN, RandomForest
#knn_model = KNeighborsClassifier(n_neighbors = 15) # 0.51
#dtree_model = DecisionTreeClassifier(max_depth = 10, random_state = 101) # 0.54 # Do better for depressed percision & displeased recall
X = df_with_features.drop(['label'], axis = 1)
y = df_with_features['label']
cross_val_predict(svm_model, X, y, cv = 5)



array(['depressed', 'depressed', 'depressed', 'depressed', 'depressed',
       'displeased', 'displeased', 'depressed', 'depressed', 'depressed',
       'depressed', 'depressed', 'displeased', 'depressed', 'displeased',
       'displeased', 'depressed', 'displeased', 'displeased', 'depressed',
       'displeased', 'depressed', 'depressed', 'displeased', 'depressed',
       'depressed', 'depressed', 'displeased', 'depressed', 'displeased',
       'depressed', 'depressed', 'displeased', 'displeased', 'depressed',
       'depressed', 'depressed', 'displeased', 'depressed', 'displeased',
       'grief', 'displeased', 'depressed', 'displeased', 'displeased',
       'displeased', 'depressed', 'depressed', 'depressed', 'displeased',
       'displeased', 'displeased', 'displeased', 'depressed',
       'displeased', 'depressed', 'displeased', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased', 'depressed',
       'depressed', 'displeased', 'displeased', 'depressed', 'dis

In [14]:
X = df_with_features.drop(['label'], axis = 1)
y = df_with_features['label']
svm_model.fit(X, y)

## Apllying 5-fold cross validation
predict_svc = pd.Series(cross_val_predict(svm_model, X, y, cv = 5))

## Generating Confusoin Matrix
conf_matrix_svc = confusion_matrix(y, predict_svc, labels = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty'])
print(conf_matrix_svc)

[[62 15  0  0  0  0]
 [34 42  0  0  0  0]
 [ 5  3  0  0  0  0]
 [ 4  2  0  0  0  0]
 [ 3  2  0  0  2  0]
 [ 1  0  0  0  0  0]]




In [15]:
## Display the visualization of the Confusion Matrix.
fig = plt.figure(figsize = (16,6))

ax = sns.heatmap(conf_matrix_svc, annot = True, cmap = 'Blues')

ax.set_title('SVM Confusion Matrix \n');
ax.set_xlabel('\nPredicted Label')
ax.set_ylabel('True Label ');

## Ticket labels - List must be in alphabetical order
ax.xaxis.set_ticklabels(['depressed', 'displeased', 'grief', 'guilty', 'hurt', 'lonely' ])
ax.yaxis.set_ticklabels(['depressed', 'displeased', 'grief', 'guilty', 'hurt', 'lonely' ])

fig.savefig("svm_heatmap.png")

## Accuracy Scores

In [16]:
## Classification Report
target_names = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty']
print(classification_report(y, predict_svc, target_names = target_names))

              precision    recall  f1-score   support

  displeased       0.66      0.55      0.60        76
   depressed       0.57      0.81      0.67        77
        hurt       1.00      0.29      0.44         7
      lonely       0.00      0.00      0.00         1
       grief       0.00      0.00      0.00         8
      guilty       0.00      0.00      0.00         6

    accuracy                           0.61       175
   macro avg       0.37      0.27      0.29       175
weighted avg       0.58      0.61      0.57       175



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
## Prediction
sentence = ["Never leave me alone"]
X_test1 = vectorizer.transform(sentence)

svm_model.fit(X, y)
svm_model.predict(X_test1.todense())



array(['displeased'], dtype=object)

In [18]:
# Create separate dataframe for each sub-emotion
df_with_features_displeased = df_with_features[df_with_features['label'] == 'displeased']
df_with_features_depressed = df_with_features[df_with_features['label'] == 'depressed']
df_with_features_hurt = df_with_features[df_with_features['label'] == 'hurt']
df_with_features_grief = df_with_features[df_with_features['label'] == 'grief']
df_with_features_lonely = df_with_features[df_with_features['label'] == 'lonely']
df_with_features_guilty = df_with_features[df_with_features['label'] == 'guilty']

In [19]:
# Up Sampling
# Citation: [https://wellsr.com/python/upsampling-and-downsampling-imbalanced-data-in-python/]
# Down sample does not work based on the value counts of each sub-emotion in our dataset

def custom_resample(df):
    return resample(df,
             replace=True,
             n_samples=len(df_with_features_displeased),
             random_state=42)

depressed_upsample = custom_resample(df_with_features_depressed)
hurt_upsample = custom_resample(df_with_features_hurt)
grief_upsample = custom_resample(df_with_features_grief)
lonely_upsample = custom_resample(df_with_features_lonely)
guilty_upsample = custom_resample(df_with_features_guilty)

In [20]:
# Create a new dataframe containing all sub-emotions
df_all_emotions = pd.concat([df_with_features_displeased, depressed_upsample, hurt_upsample, grief_upsample, lonely_upsample, guilty_upsample]).reset_index(drop = True)

# Shuffle the dataframe - to make sure that our algorithms are not biased in their prediction citation: [https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows]
df_all_emotions = df_all_emotions.sample(frac=1).reset_index(drop=True)
df_all_emotions


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,929,930,931,932,933,934,935,936,937,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,lonely
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,hurt
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,guilty
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,guilty
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.298902,depressed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,lonely
458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,depressed
459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,lonely
460,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,guilty


In [21]:
# Testing SVM Model
svm_model = SVC(kernel = 'linear') # 0.57 # it should be changed to KNN, RandomForest
#knn_model = KNeighborsClassifier(n_neighbors = 15) # 0.51
#dtree_model = DecisionTreeClassifier(max_depth = 10, random_state = 101) # 0.54 # Do better for depressed percision & displeased recall
X = df_all_emotions.drop(['label'], axis = 1)
y = df_all_emotions['label']

svm_model.fit(X, y)

## Apllying 5-fold cross validation
predict_svc = pd.Series(cross_val_predict(svm_model, X, y, cv = 5))

## Generating Confusoin Matrix
conf_matrix_svc = confusion_matrix(y, predict_svc, labels = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty'])
print(conf_matrix_svc)


## Classification Report
target_names = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty']
print(classification_report(y, predict_svc, target_names = target_names))

[[74  2  0  1  0  0]
 [17 59  0  1  0  0]
 [ 0  0 77  0  0  0]
 [ 0  0  0 77  0  0]
 [ 0  0  0  0 77  0]
 [ 0  0  0  0  0 77]]
              precision    recall  f1-score   support

  displeased       0.97      0.77      0.86        77
   depressed       0.81      0.96      0.88        77
        hurt       1.00      1.00      1.00        77
      lonely       1.00      1.00      1.00        77
       grief       1.00      1.00      1.00        77
      guilty       0.97      1.00      0.99        77

    accuracy                           0.95       462
   macro avg       0.96      0.95      0.95       462
weighted avg       0.96      0.95      0.95       462



In [22]:
## Prediction
sentence = ["Never leave me alone"]
X_test1 = vectorizer.transform(sentence)

svm_model.fit(X, y)
svm_model.predict(X_test1.todense())



array(['lonely'], dtype=object)