# Imports

In [1]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from nltk.stem import PorterStemmer
from textblob import Word
from nltk.util import ngrams
import re, numpy, nltk, string, xgboost
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation,NMF,TruncatedSVD
import sklearn.feature_extraction.text as text
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm,decomposition, ensemble
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_error

# Data Cleaning and Preprocessing

In [2]:
data = pd.read_csv('C:/Users/emnag/ISEAR.csv')

In [3]:
data.columns=['EMOTION','TEXT','UNAMED']

In [4]:
data.head()

Unnamed: 0,EMOTION,TEXT,UNAMED
0,fear,Every time I imagine that someone I love or I ...,
1,anger,When I had been obviously unjustly treated and...,
2,sadness,When I think about the short time that we live...,
3,disgust,At a gathering I found myself involuntarily si...,
4,shame,When I realized that I was directing the feeli...,


In [5]:
data['TEXT']=data['TEXT'].apply(lambda a:" ".join(a.lower() for a in a.split()))

In [6]:
data['TEXT']=data['TEXT'].apply(lambda a:" ".join(a.replace('[^\w\s]','') for a in a.split()))

In [7]:
stop = stopwords.words('english')

In [8]:
data['TEXT']=data['TEXT'].apply(lambda a:" ".join(a for a in a.split() if a not in stop))

In [9]:
data['TEXT']=data['TEXT'].apply(lambda a:str(TextBlob(a).correct()))

In [10]:
st = PorterStemmer()
data['TEXT']= data['TEXT'].apply(lambda a:" ".join([st.stem(word) for word in a.split()]))

In [11]:
data.head()

Unnamed: 0,EMOTION,TEXT,UNAMED
0,fear,everi time imagin someon love could contact se...,
1,anger,obvious unjustli treat possibl elucid this.,
2,sadness,think short time live relat period life think ...,
3,disgust,gather found involuntarili sit next two peopl ...,
4,shame,realiz direct feel discont partner way tri put...,


# Label Encoding

In [12]:
data['EMOTION'].value_counts()

EMOTION
joy        1091
sadness    1082
anger      1079
fear       1076
shame      1071
disgust    1066
guilt      1050
Name: count, dtype: int64

In [13]:
object = preprocessing.LabelEncoder()
data['EMOTION']=object.fit_transform(data['EMOTION'])

In [14]:
data['EMOTION'].value_counts()

EMOTION
4    1091
5    1082
0    1079
2    1076
6    1071
1    1066
3    1050
Name: count, dtype: int64

# Train-Test Split

In [15]:
Xtrain, Xtest, Ytrain, Ytest = model_selection.train_test_split(data['TEXT'],data['EMOTION'],stratify=data['EMOTION'])

# Feature Engineering

In [22]:
cv = CountVectorizer()
cv.fit(data['TEXT'])
cv_xtrain = cv.transform(Xtrain)
cv_xtest = cv.transform(Xtest)

In [23]:
tv = TfidfVectorizer()
tv.fit(data['TEXT'])
tv_xtrain = tv.transform(Xtrain)
tv_xtest = tv.transform(Xtest)

# Model Building Phase

In [31]:
def build(model_initializer, independent_variables_training,target, independent_variable_test):
    model_initializer.fit(independent_variables_training,target)
    modelPred=model_initializer.predict(independent_variable_test)
    return metrics.accuracy_score(modelPred,Ytest)

# Multinomial Naive Bayes

In [32]:
output = build(naive_bayes.MultinomialNB(),cv_xtrain,Ytrain,cv_xtest)
print(output)

0.55401809473124


In [33]:
output = build(naive_bayes.MultinomialNB(),tv_xtrain,Ytrain,tv_xtest)
print(output)

0.5620010643959553


# Linear Classifier/Logistic Regression

In [34]:
output = build(linear_model.LogisticRegression(),cv_xtrain,Ytrain,cv_xtest)
print(output)

0.5550824906865354


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
output = build(linear_model.LogisticRegression(),tv_xtrain,Ytrain,tv_xtest)
print(output)

0.5779670037253859


# Support-Vector Machine

In [36]:
output = build(svm.SVC(),cv_xtrain,Ytrain,cv_xtest)
print(output)

0.5290047897817989


In [37]:
output = build(svm.SVC(),tv_xtrain,Ytrain,tv_xtest)
print(output)

0.5763704097924428


# Random Forest

In [38]:
output = build(ensemble.RandomForestClassifier(),cv_xtrain,Ytrain,cv_xtest)
print(output)

0.532197977647685


In [39]:
output = build(ensemble.RandomForestClassifier(),tv_xtrain,Ytrain,tv_xtest)
print(output)

0.5529536987759447


# Confusion Matrix for the Selected Model

In [41]:
classifier = linear_model.LogisticRegression().fit(tv_xtrain,Ytrain)
val_predictions = classifier.predict(tv_xtest)
y_true, y_pred = Ytest, val_predictions
print(classification_report(y_true,y_pred))
print()

              precision    recall  f1-score   support

           0       0.48      0.50      0.49       270
           1       0.61      0.65      0.63       266
           2       0.69      0.68      0.68       269
           3       0.47      0.42      0.45       262
           4       0.63      0.75      0.68       273
           5       0.69      0.58      0.63       271
           6       0.46      0.47      0.47       268

    accuracy                           0.58      1879
   macro avg       0.58      0.58      0.58      1879
weighted avg       0.58      0.58      0.58      1879


