In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the training set: a header-less, ';'-delimited text file whose two
# fields are given the column names 'text' and 'emotion' here.
df = pd.read_csv('train.txt', sep=';', header=None, names=['text', 'emotion'])

In [109]:
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger


In [110]:
df.isnull().sum()

text       0
emotion    0
dtype: int64

In [111]:
df['emotion'].value_counts()

emotion
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [112]:
emotions = df['emotion'].unique()
emotions

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [113]:
emotions_numbers = {}

In [114]:
# Give every emotion label an integer code equal to its position in `emotions`.
emotions_numbers.update({label: code for code, label in enumerate(emotions)})

# Swap the string labels in the frame for their integer codes.
df['emotion'] = df['emotion'].map(emotions_numbers)

In [115]:
emotions_numbers

{'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}

In [116]:
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


***Text Preprocessing:***

***1.** lowercasing:*

In [117]:
# df['text'] = df['text'].apply(lambda x : x.lower())

In [118]:
df['text'] = df['text'].str.lower()

***2.** remove Punctuation:*
<br>*(, $ # @ % ^ & )*

In [119]:
import string

# Built once instead of on every call: maps each character of
# string.punctuation to None so that str.translate() deletes it.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
    """Return `txt` with every character in `string.punctuation` removed.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        `txt` without ASCII punctuation. Every other character (letters,
        digits, whitespace, non-ASCII symbols) is left untouched.
    """
    return txt.translate(_PUNCT_TABLE)

In [120]:
df['text'] = df['text'].apply(remove_punc)

***3.** remove Numbers:*

In [121]:
def remove_numbers(txt):
    """Return `txt` with every digit character removed.

    A character is dropped when `str.isdigit()` is true for it; all other
    characters are kept in their original order.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        `txt` without digit characters (an empty string stays empty).
    """
    # A single join is linear in len(txt); the previous character-by-character
    # `new = new + i` concatenation could degrade to quadratic time.
    return ''.join(ch for ch in txt if not ch.isdigit())

df['text'] = df['text'].apply(remove_numbers)

*remove URLs / links*
<br>*remove HTML tags*

***4.** remove Emojis & Special chr:*

In [122]:
def remove_emojis(txt):
    """Return `txt` with every non-ASCII character removed.

    Only characters for which `str.isascii()` is true are kept. This drops
    emojis, but also any other non-ASCII character such as accented letters.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        The ASCII-only characters of `txt`, in their original order.
    """
    # A single join is linear in len(txt); the previous character-by-character
    # `new += i` concatenation could degrade to quadratic time.
    return ''.join(ch for ch in txt if ch.isascii())

df['text'] = df['text'].apply(remove_emojis)

***5.** remove Stopwords:*

In [123]:
import nltk

In [124]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [125]:
# Fetch the NLTK resources used by the stop-word removal step below.
nltk.download('punkt') #used for tokenization
nltk.download('stopwords') # stop-word lists read through nltk.corpus.stopwords

# NOTE(review): presumably needed by word_tokenize on newer NLTK releases — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arrma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arrma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\arrma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [126]:
# Build the English stop-word set used by the stop-word removal function.
# The corpus reader is reached through `nltk.corpus` instead of the bare
# imported name `stopwords`, because this cell rebinds `stopwords` to the
# resulting set: with `stopwords.words(...)` a second run of the cell failed
# with AttributeError ('set' object has no attribute 'words').
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [127]:
len(stopwords)

198

In [128]:
# Text of the row labelled 1 before stop-word removal (one .loc lookup by row label and column).
df.loc[1, 'text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [129]:
def remove(txt):
    """Strip stop words from `txt`.

    The text is split into tokens with `word_tokenize`; tokens that appear in
    the module-level `stopwords` collection are discarded and the remaining
    tokens are joined back together with single spaces.
    """
    kept_tokens = [token for token in word_tokenize(txt) if token not in stopwords]
    return ' '.join(kept_tokens)

In [130]:
df['text'] = df['text'].apply(remove)

In [131]:
# Same row after stop-word removal (one .loc lookup by row label and column).
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

***

### ***Feature Extraction/vectorization:***

***

***One hot encoding:***
<br>*- rarely used in practice: the size of the encoding differs from text to text, which makes it nearly impossible to work with in ML*

In [132]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus for the demo. It is defined in this cell so the cell runs on a
# fresh kernel: `documents` used to be created only by a later cell, so
# Restart & Run All raised NameError here.
documents = [
    "I love pizza",
    "Pizza is the best",
    "Pasta is great"
]

# binary=True records presence/absence (1/0) of each vocabulary word
# instead of how many times it occurs.
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(documents)

print('Vocabulary:', vectorizer.get_feature_names_out())
print('\nOne-Hot Matrix:\n', X.toarray())

Vocabulary: ['best' 'great' 'is' 'love' 'pasta' 'pizza' 'the']

One-Hot Matrix:
 [[0 0 0 1 0 1 0]
 [1 0 1 0 0 1 1]
 [0 1 1 0 1 0 0]]


***

***Bag Of Words:***
<br>*fixed-size vectors, because every document is encoded against one global vocabulary; this makes it usable with ML models*

In [133]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short documents.
documents = [
    "I love pizza",
    "Pizza is the best",
    "Pasta is great"
]

# Default settings: one column per vocabulary word, values are word counts.
vectorizer = CountVectorizer()

# Learn the vocabulary from the corpus and encode the corpus in one step.
X = vectorizer.fit_transform(documents)

print('Vocabulary:', vectorizer.get_feature_names_out())
# toarray() converts the result to a dense array so the whole matrix is printed.
print('\nBow Matrix:\n', X.toarray())

Vocabulary: ['best' 'great' 'is' 'love' 'pasta' 'pizza' 'the']

Bow Matrix:
 [[0 0 0 1 0 1 0]
 [1 0 1 0 0 1 1]
 [0 1 1 0 1 0 0]]


***Cons:***
* *Sparse matrix (most entries are zero)*
* *OOV (out-of-vocabulary) words cannot be represented*
* *Word order is lost*
* *Semantic meaning is not captured (though still better than one-hot encoding)*

##### ***Note:*** *Bag of Words is a word-level one-hot encoding where values represent word frequency instead of binary presence.*

***N-grams:***
<br>*contiguous sequences of n words (pairs for n = 2, triples for n = 3); the example below extracts both*

In [134]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short documents.
documents = [
    "I love pizza",
    "Pizza is the best",
    "Pasta is great"
]

# ngram_range=(2,3): the vocabulary is made of 2-word and 3-word sequences
# (no single words), as the printed vocabulary shows.
vectorizer = CountVectorizer( ngram_range=(2,3) ) #ngrams

# Learn the n-gram vocabulary from the corpus and encode the corpus in one step.
X = vectorizer.fit_transform(documents)

print('Vocabulary:', vectorizer.get_feature_names_out())
# Each column is one n-gram; each row is one document.
print('\nBow Matrix:\n', X.toarray())

Vocabulary: ['is great' 'is the' 'is the best' 'love pizza' 'pasta is'
 'pasta is great' 'pizza is' 'pizza is the' 'the best']

Bow Matrix:
 [[0 0 0 1 0 0 0 0 0]
 [0 1 1 0 0 0 1 1 1]
 [1 0 0 0 1 1 0 0 0]]


***

##### ***TF - IDF (Term Frequency – Inverse Document Frequency) Vectorizer:*** *TF-IDF is a statistical measure used to evaluate how important a word is to a document in a collection.*

***
***Term Frequency (TF):***
<br>***TF** = count of term in document / total terms in document*

*** 
***Inverse Document Frequency (IDF):***
* IDF(t) = log( **N** / **DF(t)** )

*where:*<br>
**N** *→ Total number of documents*
<br>***DF*** **(t)** *→ Number of documents containing term* t

***
*IDF (with **scikit-learn smoothing**):*
* IDF(t) = log( (1 + **N**) / (1 + **DF(t)**) ) + 1

*where:*<br>
**N** *→ Total number of documents*
<br>***DF*** **(t)** *→ Number of documents containing term* t

***Why smoothing?***<br>
* *Prevents division by zero*
* *Ensures every term has a non-zero weight*
* *Produces more stable results*
***
***Key Properties:***<br>
- *High TF → important in the document*  
- *Low DF → rare across documents*
- *Penalizes common words*  
- *Produces fixed-size vectors based on vocabulary*

***Matrix Shape:***<br>
$(\text{number of documents},\ \text{vocabulary size})$

***Use Cases:***<br>
- *Text classification*  
- *Information retrieval*  
- *Search engines*  
- *NLP feature extraction*

In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: three short documents.
documents = [
    "I love pizza",
    "Pizza is the best",
    "Pasta is great"
]

# TfidfVectorizer with its default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the corpus and encode the corpus in one step.
X = vectorizer.fit_transform(documents)

print('Vocabulary:', vectorizer.get_feature_names_out())
# The matrix holds TF-IDF weights rather than raw counts, so it is labelled
# accordingly (the label used to read 'Bow Matrix', carried over from the
# bag-of-words cell).
print('\nTF-IDF Matrix:\n', X.toarray())

Vocabulary: ['best' 'great' 'is' 'love' 'pasta' 'pizza' 'the']

Bow Matrix:
 [[0.         0.         0.         0.79596054 0.         0.60534851
  0.        ]
 [0.5628291  0.         0.42804604 0.         0.         0.42804604
  0.5628291 ]
 [0.         0.62276601 0.4736296  0.         0.62276601 0.
  0.        ]]


***

In [None]:
df.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [138]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state=42 fixes the split.
# NOTE(review): the emotion classes are imbalanced (see the value_counts output
# earlier in the notebook) and this split is not stratified — consider
# stratify=df['emotion']; TODO confirm whether that is wanted.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['emotion'], test_size=0.20, random_state=42)

In [140]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words features: the vocabulary is learned from the training texts
# only and then reused, unchanged, to encode the test texts.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Multinomial Naive Bayes on the word counts (fit() returns the fitted estimator).
nb_model = MultinomialNB().fit(X_train_bow, y_train)

# Predict the held-out texts and report the accuracy.
pred_bow = nb_model.predict(X_test_bow)
print(accuracy_score(y_test, pred_bow))

0.7678125


In [141]:
pred_bow

array([0, 5, 0, ..., 5, 5, 0])

In [142]:
y_test

8756     0
4660     5
6095     0
304      5
8241     0
        ..
15578    5
5746     5
6395     5
7624     5
15245    0
Name: emotion, Length: 3200, dtype: int64

In [143]:
# TF-IDF features: fitted on the training texts only, then reused to encode
# the test texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


# A second Naive Bayes model, trained on the TF-IDF features instead of raw counts.
nb2_model = MultinomialNB()
nb2_model.fit(X_train_tfidf,y_train)

In [144]:
y_pred = nb2_model.predict(X_test_tfidf)

In [145]:
print(accuracy_score(y_test, y_pred))

0.6609375


In [146]:
from sklearn.linear_model import LogisticRegression

In [147]:
# Logistic-regression classifier; max_iter=1000 sets the solver's iteration cap.
logistic_model = LogisticRegression(max_iter=1000)

In [148]:
logistic_model.fit(X_train_tfidf,y_train)

In [149]:
log_pred = logistic_model.predict(X_test_tfidf)

In [150]:
print(accuracy_score(y_test,log_pred ))

0.8615625
