In [1]:
import pandas as pd
import numpy as np
import nltk 
import string 
import re 
from imblearn.over_sampling import RandomOverSampler
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.tokenize import word_tokenize
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [70]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Chaitanya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [7]:
#reading data 
train=pd.read_csv('train_data.txt',sep=';',header=None)
test=pd.read_csv('test_data.txt',sep=';',header=None)

In [8]:
#saving data to csv
train.to_csv('train.csv')
test.to_csv('test.csv')

In [9]:
#specifying column names
train.columns = ['text', 'emotion'] 
test.columns = ['text', 'emotion'] 

In [10]:
#checking for class imbalance
train.emotion.value_counts()

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: emotion, dtype: int64

In [11]:
#text to lowercase
train["text"] = train["text"].str.lower()
test["text"] = test["text"].str.lower()

In [12]:
#custom function to remove the punctuation
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

train["text"] = train["text"].apply(lambda text: remove_punctuation(text))
test["text"] = test["text"].apply(lambda text: remove_punctuation(text))

In [13]:
#custom function to remove the stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

train["text"] = train["text"].apply(lambda text: remove_stopwords(text))
test["text"] = test["text"].apply(lambda text: remove_stopwords(text))

In [16]:
#lemmetizing words in each sentence
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

train["text"] = train["text"].apply(lambda text: lemmatize_words(text))
test["text"] = test["text"].apply(lambda text: lemmatize_words(text))

In [17]:
train.text

0                                     didnt feel humiliate
1        go feeling hopeless damn hopeful around someon...
2                    im grab minute post feel greedy wrong
3        ever feel nostalgic fireplace know still property
4                                             feel grouchy
                               ...                        
15995         brief time beanbag say anna feel like beaten
15996    turn feel pathetic still wait table sub teach ...
15997                             feel strong good overall
15998                       feel like rude comment im glad
15999                         know lot feel stupid portray
Name: text, Length: 16000, dtype: object

In [18]:
X_train=train.text
y_train=train.emotion

In [19]:
X_test=test.text
y_test=test.emotion

In [20]:
#tokenizing text
def tokenize(data):
    result=[]
    for sentence in data:
        result_sentence=[]
        result_sentence=nltk.word_tokenize(sentence)
        result.append(result_sentence)
    return(result)

In [21]:
X_train_tok= tokenize(X_train)

In [22]:
X_test_tok=tokenize(X_test)

In [23]:
#computing Tfidf features
v = TfidfVectorizer()
xtr = v.fit_transform(X_train)
xts= v.transform(X_test)

In [24]:
a=v.get_feature_names()
a

['aa',
 'aaaaaaand',
 'aaaaand',
 'aaaand',
 'aac',
 'aahhh',
 'aaron',
 'ab',
 'abandon',
 'abandonment',
 'abate',
 'abbigail',
 'abc',
 'abdomen',
 'abdominal',
 'abduct',
 'abelard',
 'abhorrent',
 'abide',
 'ability',
 'abit',
 'able',
 'ableness',
 'abnormally',
 'aboard',
 'abominable',
 'abortion',
 'abou',
 'abound',
 'abraham',
 'abroad',
 'abruptly',
 'absence',
 'absolute',
 'absolutely',
 'absolutly',
 'absorb',
 'abstain',
 'abstinence',
 'abstract',
 'absurd',
 'absurdity',
 'abt',
 'abundance',
 'abundantly',
 'abuse',
 'abused',
 'abusive',
 'abyss',
 'ac',
 'academia',
 'academic',
 'academy',
 'acause',
 'accelerate',
 'accent',
 'accentuate',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepts',
 'access',
 'accessary',
 'accessibility',
 'accessory',
 'accident',
 'accidentally',
 'acclimate',
 'acco',
 'accommodate',
 'accommodation',
 'accompaniment',
 'accompany',
 'accomplish',
 'accomplished',
 'accomplishing',
 'accomplishment',
 'accord',
 'accor

In [25]:
Xtrain=xtr.toarray()

In [26]:
Xtest=xts.toarray()

In [27]:
Xtrain=pd.DataFrame(Xtrain)
Xtest=pd.DataFrame(Xtest)

In [28]:
Xtrain

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12099,12100,12101,12102,12103,12104,12105,12106,12107,12108
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
#dimentionality reduction(keeping 10% of the features)
select = SelectKBest(score_func=chi2, k=1200)
fit = select.fit(Xtrain, y_train)

In [30]:
TrainX = fit.transform(Xtrain)
TestX=fit.transform(Xtest)

In [31]:
TrainX=pd.DataFrame(TrainX)

In [32]:
TestX=pd.DataFrame(TestX)

In [33]:
#Oversampling data to deal with imbalanced classes
ros = RandomOverSampler(random_state=0)
x_train_res, y_train_res=ros.fit_sample(TrainX, y_train)

In [35]:
#new class breakdown
y_train_res.value_counts()

surprise    5362
fear        5362
joy         5362
sadness     5362
anger       5362
love        5362
Name: emotion, dtype: int64

In [36]:
#XGBoost modelling
clf = XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6, subsample=0.7)

In [38]:
clf.fit(x_train_res,y_train_res)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, seed=2, subsample=0.7,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [39]:
pred=clf.predict(TestX)

In [40]:
pred

array(['sadness', 'sadness', 'sadness', ..., 'joy', 'joy', 'fear'],
      dtype=object)

In [41]:
#accuracy score
accuracy_score(y_test, pred)

0.8645

In [42]:
final=pd.DataFrame(pred)

In [43]:
final['true']=y_test

In [44]:
#test emotions vs predicted emotion
final

Unnamed: 0,0,true
0,sadness,sadness
1,sadness,sadness
2,sadness,sadness
3,joy,joy
4,sadness,sadness
...,...,...
1995,anger,anger
1996,anger,anger
1997,joy,joy
1998,joy,joy


In [45]:
# classification report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

       anger       0.87      0.89      0.88       275
        fear       0.83      0.82      0.83       224
         joy       0.92      0.87      0.89       695
        love       0.68      0.89      0.77       159
     sadness       0.95      0.85      0.90       581
    surprise       0.55      0.92      0.69        66

    accuracy                           0.86      2000
   macro avg       0.80      0.87      0.83      2000
weighted avg       0.88      0.86      0.87      2000

