In [2]:
import pandas as pd

df_train = pd.read_csv('train.txt',sep = ';', names = ['comment', 'emotion'])
df_train.head()

Unnamed: 0,comment,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
df_train.emotion.value_counts()

emotion
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [9]:
df_train['label'] = df_train.emotion.map({
    'joy':0,
    'sadness':1,
    'anger':2,
    'fear':3,
    'love':4,
    'surprise':5
})

df_train.head()

Unnamed: 0,comment,emotion,label
0,i didnt feel humiliated,sadness,1
1,i can go from feeling so hopeless to so damned...,sadness,1
2,im grabbing a minute to post i feel greedy wrong,anger,2
3,i am ever feeling nostalgic about the fireplac...,love,4
4,i am feeling grouchy,anger,2


In [11]:
from sklearn.model_selection import train_test_split


X_train,X_test,y_train,y_test = train_test_split(
    df_train.comment,
    df_train.label,
    test_size=0.2,
    random_state=2022,
    stratify=df_train.label
)


In [41]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer',CountVectorizer(ngram_range=(3,3))),
    ('model',RandomForestClassifier())
])

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.47      0.56      0.51      1072
           1       0.52      0.40      0.46       933
           2       0.47      0.18      0.26       432
           3       0.20      0.49      0.28       387
           4       0.52      0.13      0.20       261
           5       0.71      0.10      0.18       115

    accuracy                           0.40      3200
   macro avg       0.48      0.31      0.32      3200
weighted avg       0.47      0.40      0.40      3200



In [45]:
from sklearn.naive_bayes import MultinomialNB

clf = Pipeline([
    ('vectorizer',CountVectorizer(ngram_range=(1,2))),
    ('model',MultinomialNB())
])

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.62      0.96      0.76      1072
           1       0.67      0.91      0.77       933
           2       0.93      0.29      0.44       432
           3       0.89      0.29      0.44       387
           4       0.82      0.05      0.10       261
           5       0.83      0.04      0.08       115

    accuracy                           0.67      3200
   macro avg       0.80      0.42      0.43      3200
weighted avg       0.74      0.67      0.60      3200



In [46]:
clf = Pipeline([
    ('vectorizer',CountVectorizer(ngram_range=(1,2))),
    ('model',RandomForestClassifier())
])

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.96      0.87      1072
           1       0.92      0.89      0.90       933
           2       0.90      0.80      0.84       432
           3       0.88      0.79      0.83       387
           4       0.88      0.67      0.76       261
           5       0.87      0.63      0.73       115

    accuracy                           0.86      3200
   macro avg       0.87      0.79      0.82      3200
weighted avg       0.87      0.86      0.86      3200



In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

clf = Pipeline([
    ('vectorizer',TfidfVectorizer()),
    ('model',RandomForestClassifier())
])

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.93      0.87      1072
           1       0.90      0.88      0.89       933
           2       0.87      0.80      0.84       432
           3       0.83      0.82      0.82       387
           4       0.86      0.70      0.77       261
           5       0.87      0.67      0.75       115

    accuracy                           0.86      3200
   macro avg       0.86      0.80      0.82      3200
weighted avg       0.86      0.86      0.85      3200



In [49]:
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 


#use this utility function to get the preprocessed text data
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [51]:
df_train['preprocessed_comment'] = df_train.comment.apply(preprocess)

In [52]:
X_train,X_test,y_train,y_test = train_test_split(
    df_train.preprocessed_comment,
    df_train.label,
    test_size=0.2,
    random_state=2022,
    stratify=df_train.label
)


In [53]:
clf = Pipeline([
    ('vectorizer',CountVectorizer(ngram_range=(1,2))),
    ('model',RandomForestClassifier())
])

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.91      0.90      0.91      1072
           1       0.88      0.92      0.90       933
           2       0.85      0.86      0.85       432
           3       0.87      0.83      0.85       387
           4       0.81      0.77      0.79       261
           5       0.85      0.72      0.78       115

    accuracy                           0.88      3200
   macro avg       0.86      0.84      0.85      3200
weighted avg       0.88      0.88      0.88      3200



In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

clf = Pipeline([
    ('vectorizer',TfidfVectorizer()),
    ('model',RandomForestClassifier())
])

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1072
           1       0.90      0.88      0.89       933
           2       0.86      0.81      0.83       432
           3       0.80      0.82      0.81       387
           4       0.80      0.67      0.73       261
           5       0.80      0.66      0.72       115

    accuracy                           0.85      3200
   macro avg       0.83      0.79      0.81      3200
weighted avg       0.85      0.85      0.85      3200



In [1]:
t = "i am afraid"

p = pd.Series(t)

pred = clf.predict(p)
print(pred)

NameError: name 'pd' is not defined