In [1]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

import warnings
# NOTE(review): with no category/module argument this hides every warning raised after this
# point, deprecation notices included — consider narrowing the filter.
warnings.filterwarnings("ignore")

/kaggle/input/emotions-dataset-for-nlp/val.txt
/kaggle/input/emotions-dataset-for-nlp/test.txt
/kaggle/input/emotions-dataset-for-nlp/train.txt


In [2]:
# The three splits are parsed the same way: no header row, ';' as the field separator,
# and the two fields named Text and Emotion.
_read_opts = {"sep": ';', "header": None, "names": ["Text", "Emotion"]}

train_df = pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/train.txt", **_read_opts)
test_df = pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/test.txt", **_read_opts)
val_df = pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/val.txt", **_read_opts)

In [3]:
# Number of training rows per emotion label, to see how balanced the classes are.
train_df["Emotion"].value_counts()

Emotion
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [4]:
# Peek at the first five training rows to confirm both columns parsed as expected.
train_df.head(5)

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [5]:
# Column dtypes, non-null counts and memory footprint of the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     16000 non-null  object
 1   Emotion  16000 non-null  object
dtypes: object(2)
memory usage: 250.1+ KB


In [6]:
# Summary of the two string columns (count / unique / top / freq). A `unique` value below
# `count` for Text means some sentences occur more than once.
train_df.describe()

Unnamed: 0,Text,Emotion
count,16000,16000
unique,15969,6
top,im still not sure why reilly feels the need to...,joy
freq,2,5362


In [7]:
# Count rows that repeat an earlier row in every column (same Text and same Emotion).
print(f"Duplicated rows on train.txt: {train_df.duplicated().sum()}")

Duplicated rows on train.txt: 1


In [8]:
# Drop rows that exactly repeat an earlier (Text, Emotion) pair. The result is rebound
# rather than mutated in place, and ignore_index=True renumbers the rows 0..n-1 so the
# index has no gap and stays positionally aligned with anything built from this frame.
# NOTE(review): the recorded describe() output lists 15,969 unique texts for 16,000 rows,
# so sentences repeated with *different* labels are still present after this call.
train_df = train_df.drop_duplicates(ignore_index=True)

In [9]:
# Missing-value count per column in the training split.
train_df.isna().sum()

Text       0
Emotion    0
dtype: int64

In [10]:
# Number of test rows per emotion label, for comparison with the training distribution.
test_df["Emotion"].value_counts()

Emotion
joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: count, dtype: int64

In [11]:
# Peek at the first five test rows.
test_df.head(5)

Unnamed: 0,Text,Emotion
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


In [12]:
# Column dtypes, non-null counts and memory footprint of the test split.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     2000 non-null   object
 1   Emotion  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [13]:
# Summary of the two string columns of the test split (count / unique / top / freq).
test_df.describe()

Unnamed: 0,Text,Emotion
count,2000,2000
unique,2000,6
top,i feel all weird when i have to meet w people ...,joy
freq,1,695


In [14]:
# Count test rows that repeat an earlier row in every column (same Text and same Emotion).
print(f"Duplicated rows on test.txt: {test_df.duplicated().sum()}")

Duplicated rows on test.txt: 0


In [15]:
# Missing-value count per column in the test split.
test_df.isna().sum()

Text       0
Emotion    0
dtype: int64

In [16]:
# Number of validation rows per emotion label.
val_df["Emotion"].value_counts()

Emotion
joy         704
sadness     550
anger       275
fear        212
love        178
surprise     81
Name: count, dtype: int64

In [17]:
# Peek at the first five validation rows.
val_df.head(5)

Unnamed: 0,Text,Emotion
0,im feeling quite sad and sorry for myself but ...,sadness
1,i feel like i am still looking at a blank canv...,sadness
2,i feel like a faithful servant,love
3,i am just feeling cranky and blue,anger
4,i can have for a treat or if i am feeling festive,joy


In [18]:
# Column dtypes, non-null counts and memory footprint of the validation split.
val_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     2000 non-null   object
 1   Emotion  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [19]:
# Summary of the two string columns of the validation split (count / unique / top / freq).
val_df.describe()

Unnamed: 0,Text,Emotion
count,2000,2000
unique,1998,6
top,i feel so tortured by it,joy
freq,2,704


In [20]:
# Count validation rows that repeat an earlier row in every column (Text and Emotion).
# NOTE(review): the recorded describe() output lists 1,998 unique texts for 2,000 rows, so
# a result of 0 here implies the repeated sentences carry different labels.
print(f"Duplicated rows on val.txt: {val_df.duplicated().sum()}")

Duplicated rows on val.txt: 0


In [21]:
# Missing-value count per column in the validation split.
val_df.isna().sum()

Text       0
Emotion    0
dtype: int64

In [22]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string

!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

def get_text_length(text):
    """Return the number of tokens that ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return len(tokens)

lemma = WordNetLemmatizer()
stemmer = PorterStemmer()
eng_stopwords = stopwords.words('english')
# Set view of the stopword list: preprocess() tests membership once per token, and a list
# lookup would rescan the whole list every time. The list itself is kept for other users.
_eng_stopword_set = frozenset(eng_stopwords)


def preprocess(text):
    """Tokenize ``text`` and reduce it to a list of normalized content-word stems.

    Steps: ``word_tokenize`` the text, lowercase each token, keep only purely alphabetic
    tokens that are not English stopwords, then lemmatize and Porter-stem each one.

    Args:
        text: Raw sentence as a string.

    Returns:
        list: The surviving tokens after lemmatization and stemming, in their original order.
    """
    # Lowercase before filtering so capitalized stopwords are caught as well.
    # NOTE(review): assumes NLTK's English stopword list is all lowercase and that
    # PorterStemmer lowercases its output by default — confirm against the NLTK docs.
    words = (word.lower() for word in word_tokenize(text))

    # isalpha() already rejects punctuation-only tokens (and anything containing digits or
    # symbols), so a separate string.punctuation check is unnecessary.
    kept = [word for word in words if word.isalpha() and word not in _eng_stopword_set]

    return [stemmer.stem(lemma.lemmatize(word)) for word in kept]

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

In [23]:
# Token count of each raw sentence, stored per split in a TextLength column.
train_df["TextLength"] = train_df["Text"].map(get_text_length)
test_df["TextLength"] = test_df["Text"].map(get_text_length)
val_df["TextLength"] = val_df["Text"].map(get_text_length)

In [24]:
# Run the preprocess() pipeline over every sentence; each cell of TokenizedText holds the
# list it returns.
train_df["TokenizedText"] = train_df["Text"].map(preprocess)
test_df["TokenizedText"] = test_df["Text"].map(preprocess)
val_df["TokenizedText"] = val_df["Text"].map(preprocess)

In [25]:
# Spot-check the preprocessed token lists of the training split (pandas truncates the
# middle rows of a long Series when printing).
print(train_df["TokenizedText"])

0                                    [didnt, feel, humili]
1        [go, feel, hopeless, damn, hope, around, someo...
2             [im, grab, minut, post, feel, greedi, wrong]
3        [ever, feel, nostalg, fireplac, know, still, p...
4                                          [feel, grouchi]
                               ...                        
15995    [brief, time, beanbag, said, anna, feel, like,...
15996    [turn, feel, pathet, still, wait, tabl, sub, t...
15997                         [feel, strong, good, overal]
15998                [feel, like, rude, comment, im, glad]
15999                   [know, lot, feel, stupid, portray]
Name: TokenizedText, Length: 15999, dtype: object


In [26]:
# Spot-check the preprocessed token lists of the test split.
print(test_df["TokenizedText"])

0           [im, feel, rather, rotten, im, ambiti, right]
1                         [im, updat, blog, feel, shitti]
2       [never, make, separ, ever, want, feel, like, a...
3       [left, bouquet, red, yellow, tulip, arm, feel,...
4                                [feel, littl, vain, one]
                              ...                        
1995    [keep, feel, like, someon, unkind, wrong, thin...
1996      [im, feel, littl, cranki, neg, doctor, appoint]
1997        [feel, use, peopl, give, great, feel, achiev]
1998    [im, feel, comfort, derbi, feel, though, start...
1999    [feel, weird, meet, w, peopl, text, like, dont...
Name: TokenizedText, Length: 2000, dtype: object


In [27]:
# Spot-check the preprocessed token lists of the validation split.
print(val_df["TokenizedText"])

0           [im, feel, quit, sad, sorri, ill, snap, soon]
1       [feel, like, still, look, blank, canva, blank,...
2                            [feel, like, faith, servant]
3                                    [feel, cranki, blue]
4                                   [treat, feel, festiv]
                              ...                        
1995    [im, ssa, examin, tomorrow, morn, im, quit, we...
1996    [constantli, worri, fight, natur, push, limit,...
1997           [feel, import, share, info, experi, thing]
1998    [truli, feel, passion, enough, someth, stay, t...
1999    [feel, like, wan, na, buy, cute, make, see, on...
Name: TokenizedText, Length: 2000, dtype: object


In [33]:
# With a numeric column present, describe() summarizes only that column by default, so
# this shows the distribution of TextLength (tokens per sentence) for the training split.
# NOTE(review): this cell's recorded execution count (33) is out of sequence with its
# neighbours, suggesting it was re-run later — re-check with Restart & Run All.
train_df.describe()

Unnamed: 0,TextLength
count,15999.0
mean,19.175761
std,10.992922
min,2.0
25%,11.0
50%,17.0
75%,25.0
max,66.0


In [29]:
# Distribution of TextLength (tokens per sentence) for the test split.
test_df.describe()

Unnamed: 0,TextLength
count,2000.0
mean,19.161
std,11.015432
min,3.0
25%,10.75
50%,17.0
75%,26.0
max,61.0


In [30]:
# Distribution of TextLength (tokens per sentence) for the validation split.
val_df.describe()

Unnamed: 0,TextLength
count,2000.0
mean,18.877
std,10.818058
min,2.0
25%,10.0
50%,17.0
75%,25.0
max,61.0


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Glue each token list back into one space-separated string per row; these strings are
# what gets passed to the vectorizer below.
train_df["PreprocessedText"] = train_df["TokenizedText"].map(" ".join)
test_df["PreprocessedText"] = test_df["TokenizedText"].map(" ".join)
val_df["PreprocessedText"] = val_df["TokenizedText"].map(" ".join)

# Fit the vectorizer on the training split only; the test and validation splits are then
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
train_df_matrix = vectorizer.fit_transform(train_df["PreprocessedText"])
test_df_matrix = vectorizer.transform(test_df["PreprocessedText"])
val_df_matrix = vectorizer.transform(val_df["PreprocessedText"])