In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the tweet dataset from a local parquet file (path is relative to the
# notebook's working directory).
df = pd.read_parquet("twit.parquet")
# Preview the first rows: a free-text `text` column and an integer `label` column.
df.head()

Unnamed: 0,text,label
0,i feel awful about it too because it s my job ...,0
1,im alone i feel awful,0
2,ive probably mentioned this before but i reall...,1
3,i was feeling a little low few days back,0
4,i beleive that i am much more sensitive to oth...,2


Preprocessing

In [None]:
# Fetch NLTK's stopword corpus (reported as already up-to-date when present) and
# print the English stopword list that the stemming step filters against.
nltk.download('stopwords')
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Shared Porter stemmer instance; the `stemming` function reads it as the global `ps`.
ps = PorterStemmer()

In [None]:
#stemming
# Cache for the English stopword set, filled on first use by `_english_stopwords`.
_STOPWORD_CACHE = {}

def _english_stopwords():
  """Return NLTK's English stopwords as a frozenset, loading the corpus only once."""
  if 'english' not in _STOPWORD_CACHE:
    _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOPWORD_CACHE['english']

def stemming(dataframe):
  """Normalise one text string: strip non-letters, lowercase, drop stopwords, stem.

  Args:
    dataframe: a single text string (despite the name, not a DataFrame); the
      notebook calls this once per row through `Series.apply`.

  Returns:
    The remaining Porter-stemmed tokens joined with single spaces.
  """
  # Looking the stopwords up once per call in a cached set replaces the previous
  # `stopwords.words('english')` call per word, which rebuilt and scanned a list
  # for every token of every row.
  stop_words = _english_stopwords()

  # Every character that is not an ASCII letter (digits, punctuation, apostrophes)
  # becomes a space, so contractions split into separate tokens.
  letters_only = re.sub('[^a-zA-Z]', ' ', dataframe)
  tokens = letters_only.lower().split()

  # Stopwords are matched against the raw lowercase token; only survivors are stemmed.
  stemmed_tokens = [ps.stem(word) for word in tokens if word not in stop_words]

  return ' '.join(stemmed_tokens)

In [None]:
# Run `stemming` over every row and overwrite the `text` column with the result.
# NOTE: the raw text is replaced in place, so re-running this cell stems text that
# is already stemmed; reload `df` first if the cell has to be repeated.
df['text'] = df['text'].apply(stemming)
df.head()

Unnamed: 0,text,label
0,feel aw job get posit succeed happen,0
1,im alon feel aw,0
2,ive probabl mention realli feel proud actual k...,1
3,feel littl low day back,0
4,beleiv much sensit peopl feel tend compassion,2


In [None]:
#split the dataset into training set, validation set, test set (70/15/15)
# 30% is held out first, then that hold-out is halved into validation and test.
# Both splits are stratified on `label` and use a fixed random_state.
train, temp =  train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])
valid , test =  train_test_split(temp, test_size=0.5, random_state=42, stratify=temp['label'])
train.head()

Unnamed: 0,text,label
200374,look forward feel delight,1
34801,feel purpos understand thought sweet empath,1
222662,feel like im alway put fake side impress peopl...,0
304942,feel worth support,2
83570,find daunt feel soon chang wish rise challeng ...,1


In [None]:
#split a frame into its feature column (X) and its target column (y)
def split_data(dataframe):
  """Return the `text` and `label` columns of `dataframe` as an `(X, y)` pair.

  Each element is whatever the column's `.values` attribute yields.
  """
  features, targets = (dataframe[column].values for column in ('text', 'label'))
  return features, targets

In [None]:
# Pull the text (X) and label (y) arrays out of each of the three splits.
X_train,y_train = split_data(train)
X_valid,y_valid= split_data(valid)
X_test,y_test= split_data(test)

In [None]:
#per-class sample count for a label array (y)
def sampling_label_count(dataframe):
  """Print one `name: count` line for each of the six emotion classes.

  Class ids 0..5 map, in order, to sadness, joy, love, anger, fear and surprise.
  The count for a class is the number of entries of `dataframe` equal to its id.
  Nothing is returned.
  """
  emotion_names = ("sadness", "joy", "love", "anger", "fear", "surprise")

  for class_id, emotion in enumerate(emotion_names):
    occurrences = np.sum(class_id == dataframe)
    print(f"{emotion}: {occurrences!s}")

In [None]:
# Class balance of the training labels.
# NOTE(review): the saved output shows identical counts for all six classes, which
# is what y_train looks like after random oversampling rather than straight after a
# stratified split — presumably this cell was last executed out of order; confirm
# with Restart Kernel -> Run All.
sampling_label_count(y_train)

sadness: 98747
joy: 98747
love: 98747
anger: 98747
fear: 98747
surprise: 98747


In [None]:
# Peek at the stemmed training text before it is vectorised.
train['text'].head()

Unnamed: 0,text
200374,look forward feel delight
34801,feel purpos understand thought sweet empath
222662,feel like im alway put fake side impress peopl...
304942,feel worth support
83570,find daunt feel soon chang wish rise challeng ...


In [None]:
#convert text data to numerical data
# The TF-IDF vectorizer is fitted on the training text only; validation and test
# are transformed with that already-fitted vectorizer.
# NOTE: X_train / X_valid / X_test are rebound here from raw text to the vectorizer's
# output, so re-running this cell alone would feed matrices back into the
# vectorizer; re-run the split_data cell first.
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_valid = vectorizer.transform(X_valid)
X_test = vectorizer.transform(X_test)

In [None]:
# Print the three feature matrices (summary line plus coordinate/value entries).
# NOTE(review): the saved output reports 592482 rows for X_train, i.e. 6 x 98747,
# which matches the oversampled training set — this output appears to come from a
# run made after the scaling/oversampling cell; confirm with Restart & Run All.
print(X_train)
print(X_valid)
print(X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5362533 stored elements and shape (592482, 43408)>
  Coords	Values
  (0, 22187)	10.527847717367738
  (0, 13826)	34.24017567575555
  (0, 12966)	2.865523355497212
  (0, 9307)	30.196274635417613
  (1, 12966)	2.008055190503955
  (1, 30196)	31.378880873910216
  (1, 39903)	13.064243403044136
  (1, 38199)	9.543606432448572
  (1, 37015)	11.78432904718738
  (1, 11614)	103.43981624483834
  (2, 12966)	1.9500091326619982
  (2, 21793)	2.4604333447407427
  (2, 18024)	2.4570654416292093
  (2, 1138)	7.250215348804874
  (2, 30227)	11.575535258376624
  (2, 12690)	15.852877024627771
  (2, 34278)	20.27195190307652
  (2, 18171)	15.241077077725343
  (2, 28274)	5.463496584686445
  (2, 22812)	4.944411960433657
  (2, 14051)	7.922192415824949
  (3, 12966)	3.5385933679718162
  (3, 42560)	41.95251146915286
  (3, 36843)	16.831735728774312
  (4, 12966)	1.3920381827362034
  :	:
  (592478, 21793)	3.068162569304981
  (592478, 42300)	18.622398515844324
  (59

In [None]:
#standardise the features and (optionally) balance the classes by random oversampling
def scale_dataset(X_dataframe,y_dataframe, scaler=None, fit=True, oversample=True, random_state=42):
  """Scale a feature matrix and, optionally, oversample it to equal class counts.

  Args:
    X_dataframe: feature matrix.
    y_dataframe: labels aligned with the rows of `X_dataframe`.
    scaler: optional scaler object to use. Pass the same instance for the training
      call (`fit=True`) and for validation/test calls (`fit=False`) so every split
      is scaled with the statistics learned from the training data. When omitted,
      a fresh `StandardScaler(with_mean=False)` is created, as before.
    fit: when True (default) the scaler is fitted on `X_dataframe` before
      transforming; when False an already-fitted `scaler` is only applied.
    oversample: when True (default) classes are balanced with `RandomOverSampler`.
      Set to False for validation/test data, which should not be resampled.
    random_state: seed for the oversampler so repeated runs give the same sample.

  Returns:
    `(X, y)` after scaling and, if requested, oversampling.

  Raises:
    ValueError: if `fit=False` is requested without supplying a `scaler`.
  """
  X = X_dataframe
  y = y_dataframe

  if scaler is None:
    if not fit:
      raise ValueError("fit=False requires an already-fitted scaler")
    # with_mean=False scales by the standard deviation only (no centring).
    scaler = StandardScaler(with_mean=False)

  X = scaler.fit_transform(X) if fit else scaler.transform(X)

  if oversample:
    ros = RandomOverSampler(random_state=random_state)
    X, y = ros.fit_resample(X, y)

  return X, y

In [None]:
# Standardise and oversample the training split only.
# NOTE(review): in the cells shown, X_valid and X_test are never passed through a
# scaler, so the model is trained on rescaled features but evaluated on the raw
# vectorizer output — confirm whether that is intended.
X_train, y_train = scale_dataset(X_train,y_train)

Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [None]:
#train the data with naive bayes
# fit_prior=False makes the classifier use a uniform class prior.
# NOTE(review): alpha=350000 is an unusually large smoothing value with no recorded
# justification — presumably hand-tuned; confirm and consider a named constant.
nb_model = MultinomialNB(alpha=350000, fit_prior=False)
nb_model.fit(X_train, y_train)

In [None]:
#check
# Predict labels for the held-out test split.
# NOTE(review): X_test is the direct output of vectorizer.transform; the scaling
# applied to X_train by scale_dataset is not applied to it — confirm this is intended.
y_pred = nb_model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 and overall accuracy on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.74      0.82     18178
           1       0.93      0.68      0.78     21160
           2       0.47      0.87      0.61      5183
           3       0.78      0.82      0.80      8598
           4       0.71      0.82      0.76      7157
           5       0.36      0.95      0.52      2246

    accuracy                           0.76     62522
   macro avg       0.70      0.81      0.72     62522
weighted avg       0.83      0.76      0.77     62522

