# 03-KNN
**Last Updated: 2022/04/12** <br>
**A K-Nearest Neighbors (KNN) model to predict the 6 sadness sub-emotions.**

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# for tf-idf
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eric\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Packages that are typically not included in Jupyter and might need to be installed.
try:
    import pandas_profiling as pp
except:
    !pip install ipywidgets
    !pip install pandas-profiling
    import pandas_profiling as pp

## Loading Dataset

In [4]:
# Read in the database and csv file
anno2_r_path = "./02-anno2_r.xlsx"    #2-annotations, researcher focussed
anno2_p_path = "./02-anno2_p.xlsx"    #2-annotations, participant focussed
anno3_path = "./02-anno3.xlsx"    #3-annotations, balanced

df = pd.read_excel(anno2_r_path)
df

Unnamed: 0,ID,Tweet,ori_i,anno1_e,anno1_i,anno2_e,anno2_i
0,0,Depression sucks! #depression,0.958,depressed,3,depressed,2
1,1,Feeling worthless as always #depression,0.958,depressed,4,depressed,1
2,2,Feeling worthless as always,0.958,depressed,4,depressed,2
3,3,Im think ima lay in bed all day and sulk. Life...,0.934,depressed,5,depressed,4
4,4,So when I try I fail... and when I don't try.....,0.917,depressed,4,displeased,4
...,...,...,...,...,...,...,...
269,269,"Pops are joyless, soulless toys which look nea...",0.354,displeased,3,displeased,1
270,270,Why is it that we rejoice at a birth and griev...,0.354,depressed,3,displeased,2
271,271,Regret for the things we did can be tempered b...,0.354,hurt,1,guilty,3
272,272,In serious need of a nap,0.354,depressed,4,depressed,1


In [5]:
df_new = df[df['anno1_e'] == df['anno2_e']]
df_new.head()

Unnamed: 0,ID,Tweet,ori_i,anno1_e,anno1_i,anno2_e,anno2_i
0,0,Depression sucks! #depression,0.958,depressed,3,depressed,2
1,1,Feeling worthless as always #depression,0.958,depressed,4,depressed,1
2,2,Feeling worthless as always,0.958,depressed,4,depressed,2
3,3,Im think ima lay in bed all day and sulk. Life...,0.934,depressed,5,depressed,4
5,5,my life in one word is depressing,0.917,depressed,3,depressed,3


## tf-idf

In [6]:
# Removing Stopwords
english_stop_words = set(stopwords.words('english'))

In [7]:
## Applying tf-idf - To Map tweets to numerical vetors
vectorizer = TfidfVectorizer(stop_words = english_stop_words)
X = vectorizer.fit_transform(df_new['Tweet'])
X

<175x938 sparse matrix of type '<class 'numpy.float64'>'
	with 1482 stored elements in Compressed Sparse Row format>

In [8]:
df_with_features = pd.DataFrame(X.todense())
df_with_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,928,929,930,931,932,933,934,935,936,937
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
df_with_features['label'] = df_new['anno1_e'].values

## KNN Model

In [10]:
## Create pca object
pca = PCA(n_components = 100)
df_dimension_reduced = pd.DataFrame(pca.fit_transform(df_with_features.drop(['label'], axis = 1)))
#df_dimension_reduced.columns = ['PC1', 'PC2']
df_dimension_reduced['label'] = df_with_features['label']
df_dimension_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,label
0,0.656901,0.435940,0.148815,0.108774,-0.133893,0.114160,0.080466,0.015268,0.013551,-0.001482,...,-0.039727,-0.002490,0.024797,0.037741,0.004495,0.028113,-0.013707,-0.002858,-0.007783,depressed
1,0.390551,0.320814,0.029539,-0.236134,0.588798,-0.123605,-0.324558,0.037867,-0.031030,-0.023975,...,0.010073,-0.008114,-0.008854,0.011833,-0.008254,-0.029573,0.001168,-0.008445,-0.029841,depressed
2,0.167823,0.175810,-0.023701,-0.293107,0.673559,-0.171280,-0.377918,0.036772,-0.037643,-0.030136,...,0.004350,-0.007493,-0.001495,0.026964,-0.010008,-0.025157,-0.008564,-0.016909,-0.033435,depressed
3,-0.026489,-0.035450,-0.141446,0.156748,0.046954,0.104951,-0.041739,-0.077954,-0.084144,0.167787,...,0.008574,0.026602,0.050996,0.046235,-0.048788,0.043566,0.009829,-0.064111,0.038045,depressed
4,-0.088467,0.018452,-0.089873,0.005788,-0.135844,-0.106331,-0.078855,-0.283854,-0.210584,0.334621,...,0.034115,0.017036,0.049836,0.032899,-0.069142,-0.019070,0.014213,0.000399,0.006561,depressed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,-0.031998,-0.037094,-0.065090,0.054011,0.033020,-0.060004,-0.032626,-0.026909,0.001361,0.025621,...,-0.006889,-0.167052,-0.129782,0.055312,0.008407,0.063463,0.071288,0.052961,0.058629,displeased
171,-0.008499,-0.035945,-0.011078,-0.031154,-0.004689,-0.012905,0.054751,0.017915,-0.036330,-0.005415,...,0.016603,0.058489,0.012525,0.025419,-0.018470,0.067721,-0.118981,0.019790,0.004631,depressed
172,-0.019612,-0.057277,0.008775,-0.057797,-0.176920,-0.043758,-0.150356,0.254035,0.019060,0.006036,...,-0.024297,0.050571,0.021752,-0.030858,-0.084331,0.002913,0.027218,-0.017736,0.008265,depressed
173,-0.011747,-0.032111,-0.013079,-0.014850,-0.020657,-0.012874,-0.021125,0.006045,-0.000101,-0.030313,...,-0.058862,0.041985,-0.034589,0.093805,0.048267,0.001904,-0.005477,-0.075018,-0.008978,displeased


In [11]:
#svm_model = SVC(kernel = 'linear') # 0.57 # it should be changed to KNN, RandomForest
knn_model = KNeighborsClassifier(n_neighbors = 15) # 0.51
#dtree_model = DecisionTreeClassifier(max_depth = 10, random_state = 101) # 0.54 # Do better for depressed percision & displeased recall
X = df_with_features.drop(['label'], axis = 1)
y = df_with_features['label']
cross_val_predict(knn_model, X, y, cv = 5)



array(['depressed', 'depressed', 'displeased', 'depressed', 'depressed',
       'displeased', 'displeased', 'depressed', 'depressed', 'depressed',
       'displeased', 'depressed', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased', 'depressed',
       'displeased', 'depressed', 'depressed', 'displeased', 'depressed',
       'depressed', 'depressed', 'displeased', 'depressed', 'displeased',
       'depressed', 'depressed', 'displeased', 'displeased', 'depressed',
       'displeased', 'depressed', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased',
       'displeased', 'displeased', 'displeased', 'depressed', 'depressed',
       'displeased', 'displeased', 'displeased', 'displeased',
       'depressed', 'displeased', 'depressed', 'displeased', 'displeased',
       'displeased', 'displeased', 'depressed', 'displeased', 'depressed',
       'depressed', 'displeased', 'displeased', 'depress

## Accuracy Scores

In [12]:
## Apllying 5-fold cross validation
predict_svc = pd.Series(cross_val_predict(knn_model, X, y, cv = 5))

## Generating Confusoin Matrix
conf_matrix_svc = confusion_matrix(y, predict_svc, labels = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty'])
print(conf_matrix_svc)

[[56 21  0  0  0  0]
 [35 41  0  0  0  0]
 [ 6  2  0  0  0  0]
 [ 4  2  0  0  0  0]
 [ 7  0  0  0  0  0]
 [ 0  1  0  0  0  0]]




In [13]:
## Classification Report
target_names = ['displeased', 'depressed', 'hurt', 'lonely', 'grief', 'guilty']
print(classification_report(y, predict_svc, target_names = target_names))

              precision    recall  f1-score   support

  displeased       0.61      0.54      0.57        76
   depressed       0.52      0.73      0.61        77
        hurt       0.00      0.00      0.00         7
      lonely       0.00      0.00      0.00         1
       grief       0.00      0.00      0.00         8
      guilty       0.00      0.00      0.00         6

    accuracy                           0.55       175
   macro avg       0.19      0.21      0.20       175
weighted avg       0.49      0.55      0.52       175



  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
## Prediction
sentence = ["she has depression"]
X_test1 = vectorizer.transform(sentence)
#X_test1

knn_model.fit(X, y)
knn_model.predict(X_test1.todense())

array(['depressed'], dtype=object)