In [1]:
import numpy as np 
import pandas as pd 
import os 
import re

In [2]:
train_data = pd.read_csv("Datasets/train.txt",names=['Message','label'],sep=";")
test_data = pd.read_csv("Datasets/test.txt",names=['Message','label'],sep=";")
val_data = pd.read_csv("Datasets/val.txt",names=['Message','label'],sep=';')

In [7]:
print(sorted(train_data['label'].unique()))
print(sorted(test_data['label'].unique()))
print(sorted(val_data['label'].unique()))

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']


In [11]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
lb.fit(sorted(train_data['label'].unique()))
lb.transform(sorted(train_data['label'].unique()))

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [22]:
def to_label(column):
    lb = LabelEncoder()
    return lb.fit_transform(column)

In [23]:
train_data['labels'] = to_label(train_data['label'])
test_data['labels'] = to_label(test_data['label'])
val_data['labels'] = to_label(val_data['label'])
train_data.head()

Unnamed: 0,Message,label,labels
0,i didnt feel humiliated,sadness,4
1,i can go from feeling so hopeless to so damned...,sadness,4
2,im grabbing a minute to post i feel greedy wrong,anger,0
3,i am ever feeling nostalgic about the fireplac...,love,3
4,i am feeling grouchy,anger,0


In [24]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

train_data['Message'] = train_data['Message'].apply(preprocessor)
test_data['Message'] = test_data['Message'].apply(preprocessor)
val_data['Message'] = val_data['Message'].apply(preprocessor)

In [25]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [26]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\infra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
from nltk.corpus import stopwords
stop = stopwords.words("english")

In [29]:
X_train = train_data['Message'].values
y_train = train_data['labels'].values

X_test = test_data['Message'].values
y_test = test_data['labels'].values

In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

tfidf = TfidfVectorizer(strip_accents = None,
                       lowercase=False,
                       preprocessor=None)
lr = LogisticRegression(solver="liblinear",random_state=1)
param_grid = [{"vect__ngram_range":[(1,1)],
              "vect__stop_words":[stop,None],
              "clf__penalty":["l1","l2"],
              "vect__tokenizer":[tokenizer,tokenizer_porter],
              "clf__C":[1.0,10.0,100.0]},
             {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}]
pipe = Pipeline([("vect",tfidf),
                ("clf",lr)])
grid = GridSearchCV(pipe,param_grid,scoring="accuracy",cv=5,verbose=2,n_jobs=-1)
grid.fit(X_train,y_train.ravel())

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  4.0min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False)),
                                       ('clf',
                                        LogisticRegression(random_state=1,
                                                           solver='liblinear'))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [1.0, 10.0, 100.0],
                          'clf__penalty': ['l1', 'l2'],
                          'vect__ngram_range': [(1, 1)],
                          'vect__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've...
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've",
                                                "you'll", "you'd", 'your',
 

In [91]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
prediction = grid.predict(X_test)
print(classification_report(prediction,y_test))
print("accuracy: {}".format(accuracy_score(prediction,y_test)))

              precision    recall  f1-score   support

           0       0.89      0.90      0.90       274
           1       0.87      0.88      0.87       222
           2       0.92      0.92      0.92       697
           3       0.80      0.78      0.79       163
           4       0.94      0.95      0.94       577
           5       0.68      0.67      0.68        67

    accuracy                           0.90      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.90      0.90      0.90      2000

accuracy: 0.901


In [96]:
tfidf = TfidfVectorizer(ngram_range=(1,1),
                       stop_words=grid.best_params_['vect__stop_words'],
                       tokenizer=tokenizer,
                       lowercase=False,
                       strip_accents=None,
                       preprocessor=None,
                       use_idf=False)
lr = LogisticRegression(penalty="l1",random_state=1,solver="liblinear",
                       C=1.0)

pipeline = Pipeline([("vect",tfidf),
                    ("clf",lr)])
pipeline.fit(X_train,y_train.ravel())
prediction = pipeline.predict(X_test)
print(classification_report(prediction,y_test))
print("accuracy: {}".format(accuracy_score(prediction,y_test)))

              precision    recall  f1-score   support

           0       0.89      0.90      0.90       271
           1       0.86      0.89      0.87       215
           2       0.93      0.90      0.91       716
           3       0.79      0.77      0.78       163
           4       0.94      0.95      0.94       577
           5       0.65      0.74      0.69        58

    accuracy                           0.90      2000
   macro avg       0.84      0.86      0.85      2000
weighted avg       0.90      0.90      0.90      2000

accuracy: 0.8985


In [97]:
def pred(text):
    predicted_value = grid.predict(text)
    value = lb.inverse_transform([predicted_value])
    print("the message: ",text)
    return value[0]

In [94]:
print(lb.classes_)

['anger' 'fear' 'joy' 'love' 'sadness' 'surprise']


In [98]:
pred([val_data['Message'][0]])

the message:  ['im feeling quite sad and sorry for myself but ill snap out of it soon']


'sadness'

In [101]:
pred([val_data['Message'][2]])

the message:  ['i feel like a faithful servant']


'joy'

In [104]:
pred([val_data['Message'][3]])

the message:  ['i am just feeling cranky and blue']


'anger'

In [114]:
pred([val_data['Message'][np.random.randint(low=0,high=700)]])

the message:  ['i see wonderful godly parents taking care of their childrens i praise god even though i feel jealous']


  return f(**kwargs)


'anger'