In [2]:
#import pandas library
import pandas as pd

# Load the dataset file "Emotion_classify_Data.csv" into the dataframe `df`
df = pd.read_csv("Emotion_classify_Data.csv")

# Report the dataframe dimensions as (rows, columns)
print(df.shape)

# Preview the first 5 rows (5 is the default for head)
df.head()

(5937, 2)


Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [3]:
# Count how many comments carry each Emotion label
df.Emotion.value_counts()

Emotion
anger    2000
joy      2000
fear     1937
Name: count, dtype: int64

In [4]:
# Add the new column "Emotion_num", which encodes each Emotion label as a unique integer:
# anger --> 0, joy --> 1, fear --> 2
EMOTION_TO_NUM = {
    "anger": 0,
    "joy": 1,
    "fear": 2,
}

df["Emotion_num"] = df["Emotion"].map(EMOTION_TO_NUM)

# Series.map leaves NaN for any label that is missing from the mapping.
# Fail fast here instead of letting NaN targets reach the stratified split / classifier.
unmapped_labels = set(df.loc[df["Emotion_num"].isna(), "Emotion"])
assert not unmapped_labels, f"Unmapped Emotion labels: {unmapped_labels}"

# Check the result by previewing the top 5 rows
df.head(5)

Unnamed: 0,Comment,Emotion,Emotion_num
0,i seriously hate one subject to death but now ...,fear,2
1,im so full of life i feel appalled,anger,0
2,i sit here to write i start to dig out my feel...,fear,2
3,ive been really angry with r and i feel like a...,joy,1
4,i feel suspicious if there is no one outside l...,fear,2


## Modelling without Pre-processing Text data


In [5]:
#import train-test split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set.
# Note: random_state=2024 makes the split repeatable, and stratifying on the
# label keeps the class proportions the same in the train and test parts.

X_train, X_test, y_train, y_test = train_test_split(
    df["Comment"],
    df["Emotion_num"],
    stratify=df["Emotion_num"],
    test_size=0.2,
    random_state=2024,
)

In [6]:
# Print the shapes of X_train and X_test

for label, split in (("X_train: ", X_train), ("X_test: ", X_test)):
    print(label, split.shape)

X_train:  (4749,)
X_test:  (1188,)


### Attempt 1 :

Using the sklearn pipeline module, create a classification pipeline to classify the data.
### Note:

* using CountVectorizer with only trigrams.
* use `RandomForest` as the classifier.
* print the classification report.

In [7]:
#import CountVectorizer, RandomForest, pipeline, classification_report from sklearn 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#1. create a pipeline object
clf = Pipeline([
    # Step 1: convert each comment into trigram (3-word sequence) count features
    ("CountVectorizer", CountVectorizer(ngram_range=(3, 3))),
    # Step 2: apply RandomForest classification; random_state is fixed so that
    # re-running the cell reproduces the same model and report
    ("RandomForestClassifier", RandomForestClassifier(random_state=2024)),
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)

#3. get the predictions for X_test and store them in y_pred
y_pred = clf.predict(X_test)

#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.25      0.34       400
           1       0.59      0.23      0.33       400
           2       0.37      0.80      0.51       388

    accuracy                           0.43      1188
   macro avg       0.50      0.43      0.39      1188
weighted avg       0.50      0.43      0.39      1188




### Attempt 2 :
Using the sklearn pipeline module, create a classification pipeline to classify the data.

### Note:

* using CountVectorizer with both unigram and bigrams.
* use `Multinomial Naive Bayes` as the classifier.
* print the classification report.

In [9]:
#import MultinomialNB from sklearn
from sklearn.naive_bayes import MultinomialNB

#1. build the pipeline: unigram + bigram counts feeding Multinomial Naive Bayes
clf = Pipeline(steps=[
    ("CountVectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("Multi Nomila", MultinomialNB()),
])

#2. train on the training split
clf.fit(X_train, y_train)

#3. predict the labels of the test split and keep them in y_pred
y_pred = clf.predict(X_test)

#4. print the classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.88      0.87       400
           1       0.86      0.85      0.86       400
           2       0.86      0.84      0.85       388

    accuracy                           0.86      1188
   macro avg       0.86      0.86      0.86      1188
weighted avg       0.86      0.86      0.86      1188



### Attempt 3 :

Using the sklearn pipeline module, create a classification pipeline to classify the data.
### Note:

* using CountVectorizer with both unigram and Bigrams.
* use `RandomForest` as the classifier.
* print the classification report.

In [14]:
#1. create a pipeline object

clf = Pipeline([
    # unigram + bigram count features
    ("CountVectorizer", CountVectorizer(ngram_range=(1, 2))),
    # random_state is fixed so that re-running the cell reproduces the same report
    ("RandomForest", RandomForestClassifier(random_state=2024)),
])


#2. fit with X_train and y_train

clf.fit(X_train, y_train)

#3. get the predictions for X_test and store them in y_pred

y_pred = clf.predict(X_test)

#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.90      0.92       400
           1       0.85      0.95      0.90       400
           2       0.95      0.86      0.91       388

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



### Attempt 4 :

Using the sklearn pipeline module, create a classification pipeline to classify the data.
### Note:

* using TF-IDF vectorizer for Pre-processing the text.
* use RandomForest as the classifier.
* print the classification report.

In [17]:
#import TfidfVectorizer from sklearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

#1. create a pipeline object
clf = Pipeline([
    # TF-IDF weighted features with the vectorizer's default settings
    ("TfidfVectorizer", TfidfVectorizer()),
    # random_state is fixed so that re-running the cell reproduces the same report
    ("RandomForest", RandomForestClassifier(random_state=2024)),
])


#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store them in y_pred
y_pred = clf.predict(X_test)

#4. print the classification report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91       400
           1       0.86      0.93      0.89       400
           2       0.91      0.88      0.89       388

    accuracy                           0.90      1188
   macro avg       0.90      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188



#### Use text pre-processing to remove stop words and punctuation, and apply lemmatization


In [20]:
import spacy

# Load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm")

# Utility function that converts raw text into its preprocessed form
def preprocess(text):
    """Return `text` with stop words and punctuation removed and the rest lemmatized.

    The text is parsed with the module-level `nlp` object; tokens flagged as
    stop words or punctuation are dropped, and the lemmas of the remaining
    tokens are joined with single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [21]:
# Build the "preprocessed_comment" column by running every comment through preprocess.
# Each comment is parsed individually, so this takes a while — please be patient.

df["preprocessed_comment"] = df.Comment.map(preprocess)

#### Build a model with pre processed text

In [23]:
#Do the 'train-test' splitting with a test size of 20%, a random state of 2024, and stratified sampling
#Note: Use the "preprocessed_comment" column

from sklearn.model_selection import train_test_split

# Same 80/20 stratified split as before, but on the cleaned text column
X_train, X_test, y_train, y_test = train_test_split(
    df["preprocessed_comment"],
    df["Emotion_num"],
    stratify=df["Emotion_num"],
    test_size=0.2,
    random_state=2024,
)

### Let's check the scores with our best model till now

Random Forest
### Attempt 1 :

Using the sklearn pipeline module, create a classification pipeline to classify the data.
### Note:

* using `CountVectorizer` with both unigrams and bigrams.
* use `RandomForest` as the classifier.
* print the classification report.

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

#1. create a pipeline object
clf = Pipeline([
    # unigram + bigram count features built from the preprocessed comments
    ("CountVectorizer", CountVectorizer(ngram_range=(1, 2))),
    # random_state is fixed so that re-running the cell reproduces the same report
    ("Random Forest", RandomForestClassifier(random_state=2024)),
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)

#3. get the predictions for X_test and store them in y_pred
y_pred = clf.predict(X_test)

#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.93      0.92       400
           1       0.91      0.94      0.92       400
           2       0.95      0.90      0.93       388

    accuracy                           0.92      1188
   macro avg       0.92      0.92      0.92      1188
weighted avg       0.92      0.92      0.92      1188



### Attempt 2 :

Using the sklearn pipeline module, create a classification pipeline to classify the data.
### Note:

* using TF-IDF vectorizer for pre-processing the text.
* use RandomForest as the classifier.
* print the classification report.

### Attempt 2 :

Using the sklearn pipeline module, create a classification pipeline to classify the data.
### Note:

* using `TF-IDF vectorizer` for pre-processing the text.
* use `RandomForest` as the classifier.
* print the classification report.

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

#1. create a pipeline object
clf = Pipeline([
    # TF-IDF weighted features built from the preprocessed comments
    ("TfidfVectorizer", TfidfVectorizer()),
    # random_state is fixed so that re-running the cell reproduces the same report
    ("Random Forest", RandomForestClassifier(random_state=2024)),
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)

#3. get the predictions for X_test and store them in y_pred
y_pred = clf.predict(X_test)

#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.92      0.92       400
           1       0.92      0.94      0.93       400
           2       0.93      0.93      0.93       388

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188

