In [46]:
import nltk
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from nltk.stem import LancasterStemmer, PorterStemmer
from customtkinter import *

In [47]:
df =pd.read_csv('archive_16/emotion-labels-train.csv')
df

Unnamed: 0,text,label
0,Just got back from seeing @GaryDelaney in Burs...,joy
1,Oh dear an evening of absolute hilarity I don'...,joy
2,Been waiting all week for this game ❤️❤️❤️ #ch...,joy
3,"@gardiner_love : Thank you so much, Gloria! Yo...",joy
4,I feel so blessed to work with the family that...,joy
...,...,...
3608,@VivienLloyd Thank you so much! Just home - st...,sadness
3609,Just put the winter duvet on ☃️❄️🌬☔️,sadness
3610,@SilkInSide @TommyJoeRatliff that's so pretty!...,sadness
3611,@BluesfestByron second artist announcement loo...,sadness


In [48]:
df.isna().sum()

text     0
label    0
dtype: int64

In [49]:
df.duplicated().sum()

np.int64(0)

In [50]:
mappings = {
    'joy': 0, 'fear': 1, 'anger': 2, 'sadness': 3
}
df['label'] = df['label'].map(mappings)

In [51]:
df.head()

Unnamed: 0,text,label
0,Just got back from seeing @GaryDelaney in Burs...,0
1,Oh dear an evening of absolute hilarity I don'...,0
2,Been waiting all week for this game ❤️❤️❤️ #ch...,0
3,"@gardiner_love : Thank you so much, Gloria! Yo...",0
4,I feel so blessed to work with the family that...,0


In [52]:
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    text = text.lower()
    text = re.sub(r'@\w+','',text)
    text = re.sub(r'http\S+','',text)
    text = text.replace('Ã¢â‚¬Â¦', '...')
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'\s+',' ',text)
    text = re.sub(r'\d+','',text)
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return (stemmed_tokens) #' '.join

In [53]:
df['text'] = df['text'].apply(preprocess)

KeyboardInterrupt: 

In [9]:
df.head()

Unnamed: 0,text,label
0,"[got, back, see, burslem, amaz, face, still, h...",0
1,"[oh, dear, even, absolut, hilar, dont, think, ...",0
2,"[wait, week, game, cheer, friday]",0
3,"[thank, much, gloria, your, sweet, thought, ma...",0
4,"[feel, bless, work, famili, nanni, noth, love,...",0


In [10]:
def tf(dox):
    tf_dict = {}
    for term in dox:
        if term in tf_dict:
            tf_dict[term] += 1
        else:
            tf_dict[term] = 1
            
    for term in tf_dict:
        tf_dict[term] = tf_dict[term] / len(dox)
    return tf_dict

In [11]:
term_freq = df['text'].apply(tf)

In [12]:
df['tf val'] = term_freq
term_freq

0       {'got': 0.09090909090909091, 'back': 0.0909090...
1       {'oh': 0.09090909090909091, 'dear': 0.09090909...
2       {'wait': 0.2, 'week': 0.2, 'game': 0.2, 'cheer...
3       {'thank': 0.1, 'much': 0.1, 'gloria': 0.1, 'yo...
4       {'feel': 0.09090909090909091, 'bless': 0.09090...
                              ...                        
3608    {'thank': 0.1, 'much': 0.1, 'home': 0.1, 'stun...
3609    {'put': 0.3333333333333333, 'winter': 0.333333...
3610    {'that': 0.1, 'pretti': 0.1, 'love': 0.1, 'sky...
3611    {'second': 0.1111111111111111, 'artist': 0.111...
3612    {'liter': 0.05555555555555555, 'eat': 0.055555...
Name: text, Length: 3613, dtype: object

In [13]:
# Check TF result
index = 90

print('%20s' % "term", "\t", "TF\n")
for key in term_freq[index]:
    print('%20s' % key, "\t", term_freq[index][key])

                term 	 TF

               thank 	 0.125
              disney 	 0.125
               theme 	 0.125
              episod 	 0.125
                 let 	 0.125
              discov 	 0.125
                amaz 	 0.125
              hilari 	 0.125


In [14]:
def calc_df(tf):
    count = {}
    for dox in tf:
        for term in dox:
            if term in count:
                count[term] += 1
            else:
                count[term] = 1
    return count

dict_freq = calc_df(df['tf val'])

In [15]:
dict_freq

{'got': 88,
 'back': 77,
 'see': 89,
 'burslem': 2,
 'amaz': 74,
 'face': 45,
 'still': 92,
 'hurt': 17,
 'laugh': 41,
 'much': 68,
 'hilari': 39,
 'oh': 34,
 'dear': 9,
 'even': 77,
 'absolut': 24,
 'hilar': 16,
 'dont': 194,
 'think': 105,
 'long': 34,
 'time': 119,
 'wait': 46,
 'week': 50,
 'game': 37,
 'cheer': 79,
 'friday': 9,
 'thank': 64,
 'gloria': 1,
 'your': 52,
 'sweet': 15,
 'thought': 41,
 'made': 35,
 'day': 142,
 'joy': 32,
 'love': 130,
 'feel': 116,
 'bless': 16,
 'work': 75,
 'famili': 14,
 'nanni': 1,
 'noth': 26,
 'amp': 137,
 'appreci': 5,
 'make': 144,
 'smile': 69,
 'today': 89,
 'reach': 4,
 'subscrib': 3,
 'yt': 2,
 'goodday': 6,
 'good': 104,
 'morn': 28,
 'happi': 90,
 'first': 53,
 'fall': 14,
 'let': 84,
 'awesom': 7,
 'autumnmemori': 2,
 'annabailey': 2,
 'laughter': 31,
 'bridgetjonesbabi': 2,
 'best': 48,
 'thing': 65,
 'ive': 51,
 'seen': 20,
 'age': 15,
 'funni': 24,
 'miss': 22,
 'bridget': 2,
 'teammark': 2,
 'holiday': 5,
 'could': 45,
 'get': 222

In [16]:
df.head()

Unnamed: 0,text,label,tf val
0,"[got, back, see, burslem, amaz, face, still, h...",0,"{'got': 0.09090909090909091, 'back': 0.0909090..."
1,"[oh, dear, even, absolut, hilar, dont, think, ...",0,"{'oh': 0.09090909090909091, 'dear': 0.09090909..."
2,"[wait, week, game, cheer, friday]",0,"{'wait': 0.2, 'week': 0.2, 'game': 0.2, 'cheer..."
3,"[thank, much, gloria, your, sweet, thought, ma...",0,"{'thank': 0.1, 'much': 0.1, 'gloria': 0.1, 'yo..."
4,"[feel, bless, work, famili, nanni, noth, love,...",0,"{'feel': 0.09090909090909091, 'bless': 0.09090..."


In [17]:
n_dox = len(df)

def calc_idf(__n_dox, __dict_freq):
    idf_dict = {}
    for term in __dict_freq:
        idf_dict[term] = np.log10(__n_dox / (__dict_freq[term] + 1))
    return idf_dict

In [18]:
inverse_df = calc_idf(n_dox,dict_freq)

In [19]:
df.head()

Unnamed: 0,text,label,tf val
0,"[got, back, see, burslem, amaz, face, still, h...",0,"{'got': 0.09090909090909091, 'back': 0.0909090..."
1,"[oh, dear, even, absolut, hilar, dont, think, ...",0,"{'oh': 0.09090909090909091, 'dear': 0.09090909..."
2,"[wait, week, game, cheer, friday]",0,"{'wait': 0.2, 'week': 0.2, 'game': 0.2, 'cheer..."
3,"[thank, much, gloria, your, sweet, thought, ma...",0,"{'thank': 0.1, 'much': 0.1, 'gloria': 0.1, 'yo..."
4,"[feel, bless, work, famili, nanni, noth, love,...",0,"{'feel': 0.09090909090909091, 'bless': 0.09090..."


In [20]:
inverse_df

{'got': np.float64(1.6084779549231094),
 'back': np.float64(1.6657733588775419),
 'see': np.float64(1.6036254521286974),
 'burslem': np.float64(3.08074670684836),
 'amaz': np.float64(1.6828066981763221),
 'face': np.float64(1.8951101298864481),
 'still': np.float64(1.5893850130140872),
 'hurt': np.float64(2.302595456464716),
 'laugh': np.float64(1.9346186711701217),
 'much': np.float64(1.719018870830767),
 'hilari': np.float64(1.95580797024006),
 'oh': np.float64(2.0137999172177468),
 'dear': np.float64(2.5578679615680224),
 'even': np.float64(1.6657733588775419),
 'absolut': np.float64(2.1599279528959845),
 'hilar': np.float64(2.3274190401897483),
 'dont': np.float64(1.2678333502055044),
 'think': np.float64(1.532562096303252),
 'long': np.float64(2.0137999172177468),
 'time': np.float64(1.4786867155203973),
 'wait': np.float64(1.8857701036323047),
 'week': np.float64(1.850297785470086),
 'game': np.float64(1.978084364951212),
 'cheer': np.float64(1.6547779745760787),
 'friday': np.fl

In [21]:
def tfidf(tf):
    tfidf_dict = {}
    for term in tf:
        tfidf_dict[term] = tf[term] * inverse_df[term]
    return tfidf_dict

df['tf idf'] = df['tf val'].apply(tfidf)

In [22]:
index = 50
print('%20s' % "term", "\t", '%10s' % "TF", "\t", '%20s' % "TF-IDF\n")
for key in df['tf idf'][index]:
    print('%20s' % key, "\t", df["tf val"][index][key] ,"\t" , df["tf idf"][index][key])

                term 	         TF 	              TF-IDF

               thank 	 0.2 	 0.34899092098503337
             balloon 	 0.2 	 0.616149341369672
               today 	 0.2 	 0.3207250904257395
               smile 	 0.2 	 0.3425539843107531
             goodday 	 0.2 	 0.5425539843107531


In [23]:
print(df.iloc[2,0])
print(df.iloc[2,2])
print(df.iloc[2,3])

['wait', 'week', 'game', 'cheer', 'friday']
{'wait': 0.2, 'week': 0.2, 'game': 0.2, 'cheer': 0.2, 'friday': 0.2}
{'wait': np.float64(0.377154020726461), 'week': np.float64(0.3700595570940172), 'game': np.float64(0.39561687299024245), 'cheer': np.float64(0.33095559491521576), 'friday': np.float64(0.5115735923136046)}


In [24]:
df.head()

Unnamed: 0,text,label,tf val,tf idf
0,"[got, back, see, burslem, amaz, face, still, h...",0,"{'got': 0.09090909090909091, 'back': 0.0909090...","{'got': 0.1462252686293736, 'back': 0.15143394..."
1,"[oh, dear, even, absolut, hilar, dont, think, ...",0,"{'oh': 0.09090909090909091, 'dear': 0.09090909...","{'oh': 0.1830727197470679, 'dear': 0.232533451..."
2,"[wait, week, game, cheer, friday]",0,"{'wait': 0.2, 'week': 0.2, 'game': 0.2, 'cheer...","{'wait': 0.377154020726461, 'week': 0.37005955..."
3,"[thank, much, gloria, your, sweet, thought, ma...",0,"{'thank': 0.1, 'much': 0.1, 'gloria': 0.1, 'yo...","{'thank': 0.17449546049251669, 'much': 0.17190..."
4,"[feel, bless, work, famili, nanni, noth, love,...",0,"{'feel': 0.09090909090909091, 'bless': 0.09090...","{'feel': 0.13542564543835095, 'bless': 0.21158..."


In [25]:
sorted_dictfreq = sorted(dict_freq.items(), key= lambda kv: kv[1], reverse=True)[:n_dox]

unique_term = [item[0] for item in sorted_dictfreq]

def tfidf_vectorizer(__tfidf):
    tfidf_vect =  [0.0] * len(unique_term)
    
    for i,term in enumerate(unique_term):
        if term in __tfidf:
            tfidf_vect[i] = __tfidf[term]
    return tfidf_vect

In [26]:
df['tf idf vector'] = df['tf idf'].apply(tfidf_vectorizer)

print("print first row matrix TF_IDF_Vec Series\n")
print(df['tf idf vector'][0])

print("\nmatrix size : ", len(df['tf idf vector'][0]))

print first row matrix TF_IDF_Vec Series

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.float64(0.1444895466376443), 0.0, 0.0, 0.0, np.float64(0.14578413201169976), 0.0, np.float64(0.1462252686293736), 0.0, 0.0, 0.0, 0.0, 0.0, np.float64(0.15143394171614016), 0.0, 0.0, 0.0, 0.0, 0.0, np.float64(0.15298242710693838), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.float64(0.156274442802797), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.float64(0.1722827390805862), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.float64(0.17587442465182926), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.float64(0.1778007245672782), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

In [27]:
vector_list = np.array(df['tf idf vector'].to_list())

vector_sum = vector_list.sum(axis=0)

data = []

for col, term in enumerate(unique_term):
    data.append([term, vector_sum[col]])
    
vector_rank = pd.DataFrame(data,columns=['term','rank'])
vector_rank.sort_values('rank',ascending=False)

Unnamed: 0,term,rank
0,im,43.790527
15,live,32.091926
1,like,30.350883
2,get,29.056407
3,dont,28.937560
...,...,...
3481,helpless,0.191579
3586,funnydont,0.191579
3552,apj,0.180935
3551,bunk,0.180935


In [28]:
df.head()

Unnamed: 0,text,label,tf val,tf idf,tf idf vector
0,"[got, back, see, burslem, amaz, face, still, h...",0,"{'got': 0.09090909090909091, 'back': 0.0909090...","{'got': 0.1462252686293736, 'back': 0.15143394...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[oh, dear, even, absolut, hilar, dont, think, ...",0,"{'oh': 0.09090909090909091, 'dear': 0.09090909...","{'oh': 0.1830727197470679, 'dear': 0.232533451...","[0.0, 0.0, 0.0, 0.1152575772914095, 0.0, 0.0, ..."
2,"[wait, week, game, cheer, friday]",0,"{'wait': 0.2, 'week': 0.2, 'game': 0.2, 'cheer...","{'wait': 0.377154020726461, 'week': 0.37005955...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[thank, much, gloria, your, sweet, thought, ma...",0,"{'thank': 0.1, 'much': 0.1, 'gloria': 0.1, 'yo...","{'thank': 0.17449546049251669, 'much': 0.17190...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1402531924102..."
4,"[feel, bless, work, famili, nanni, noth, love,...",0,"{'feel': 0.09090909090909091, 'bless': 0.09090...","{'feel': 0.13542564543835095, 'bless': 0.21158...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.12695454175754978,..."


In [29]:
df['label'].value_counts()

label
1    1147
2     857
0     823
3     786
Name: count, dtype: int64

In [30]:
major = df[df['label'] == 1]
mid1 = df[df['label'] == 2]
mid2 = df[df['label'] == 0]
minor = df[df['label'] == 3]
somple_size = round(3612 / 4)

major_scaled = major.sample(somple_size, replace=True)
mid1_scaled = mid1.sample(somple_size,replace=True)
mid2_scaled = mid2.sample(somple_size,replace=True)
minor_scaled = minor.sample(somple_size,replace=True)

dfs = pd.concat([major_scaled,minor_scaled,mid1_scaled,mid2_scaled], ignore_index=True)

df = dfs.sample(frac=1,random_state=42).reset_index(drop=True)

In [31]:
df['label'].value_counts()

label
1    903
2    903
0    903
3    903
Name: count, dtype: int64

In [32]:
df.head()

Unnamed: 0,text,label,tf val,tf idf,tf idf vector
0,"[think, your, good, go, alreadi, dont, worri]",1,"{'think': 0.14285714285714285, 'your': 0.14285...","{'think': 0.218937442329036, 'your': 0.2619417...","[0.0, 0.0, 0.0, 0.18111905002935774, 0.1969844..."
1,"[ive, want, salti, fri, mcdonald, sinc, yester...",2,"{'ive': 0.07692307692307693, 'want': 0.0769230...","{'ive': 0.14168189368717102, 'want': 0.1172649...","[0.08347011632698538, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,"[week, massacr, theatr, pert, first, one, thin...",1,"{'week': 0.1111111111111111, 'massacr': 0.1111...","{'week': 0.20558864283000955, 'massacr': 0.342...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.15583688..."
3,"[think, must, scare, cowork, im, eat, like, ra...",2,"{'think': 0.09090909090909091, 'must': 0.09090...","{'think': 0.1393238269366593, 'must': 0.205167...","[0.09864650111371, 0.10739009131922822, 0.0, 0..."
4,"[hospit, elouis, water, gone, panic, labour, l...",1,"{'hospit': 0.125, 'elouis': 0.125, 'water': 0....","{'hospit': 0.385093338356045, 'elouis': 0.3850...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [33]:
x = df['tf idf vector']
y =df['label']

In [34]:
def strat(x,y,train_size=0.8):
    np.random.seed(42)
    
    yidx= {}
    for i,label in enumerate(y):
        if label not in yidx:
            yidx[label] = [i]
        else:
            yidx[label].append(i)
            
    tr_idx = []
    te_idx = []
    
    for label,i in yidx.items():
        idx = np.random.permutation(i)
        train = max(1,int(len(idx) * train_size))
        tr_idx.extend(idx[:train])
        te_idx.extend(idx[train:])
        
    xtr = x.iloc[tr_idx].reset_index(drop=True)
    xte = x.iloc[te_idx].reset_index(drop=True)
    ytr = y.iloc[tr_idx].reset_index(drop=True)
    yte = y.iloc[te_idx].reset_index(drop=True)

    return xtr,xte,ytr,yte

In [35]:
xtr,xte,ytr,yte = strat(x,y,train_size=0.8)

In [36]:
# xtrain = xtr.to_numpy()
# ytrain = ytr.to_numpy()
# xtest = xte.to_numpy()
# ytest = yte.to_numpy()

# xtr = np.array(xtr, dtype=float)
# xte = np.array(xte, dtype=float)
# ytr = np.array(ytr, dtype=float)
# yte = np.array(yte, dtype=float)

xtr = xtr.values
xte = xte.values


In [37]:
type(xtr)

numpy.ndarray

In [38]:
xtr_flat = np.array([np.array(row) for row in xtr])
xte_flat = np.array([np.array(row) for row in xte])

In [39]:
xtr_flat

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.08639736, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [40]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [41]:
svm = LinearSVC(random_state=42)
svm.fit(xtr_flat,ytr)

In [42]:
ypredsvm = svm.predict(xte_flat)

In [43]:
print(f"acc = {accuracy_score(yte,ypredsvm)}")
print("\nClassification Report:\n", classification_report(yte, ypredsvm))

acc = 0.9406077348066298

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96       181
           1       0.92      0.95      0.94       181
           2       0.98      0.93      0.95       181
           3       0.91      0.92      0.92       181

    accuracy                           0.94       724
   macro avg       0.94      0.94      0.94       724
weighted avg       0.94      0.94      0.94       724



In [56]:
def gui_call():
    baseplate = CTk()
    baseplate.title('Emotion Detector')
    baseplate.geometry('630x350')
    set_appearance_mode('dark')

    svm_model = svm

    def prep_gui():
        try:
            input_raw = input_data.get()
            if not input_raw.strip():
                label_output.configure(text='Masukkan dulu inputnya', text_color='red')
                return
            series_input = pd.Series(input_raw)
            gui_preprocessed = series_input.apply(preprocess)
            gui_tf = gui_preprocessed.apply(tf)
            gui_tfidf = gui_tf.apply(tfidf)
            gui_vectored = gui_tfidf.apply(tfidf_vectorizer)
            gui_array = np.array(gui_vectored.tolist())
            gui_predict = svm_model.predict(gui_array)
            
            #'joy': 0, 'fear': 1, 'anger': 2, 'sadness': 3
            if gui_predict[0] == 0:
                label_output.configure(text='Prediksi: Senang', text_color='yellow')
            elif gui_predict[0] == 1:
                label_output.configure(text='Prediksi: Takut', text_color='white')
            elif gui_predict[0] == 2:
                label_output.configure(text='Prediksi: Marah', text_color='red')
            else:
                label_output.configure(text='Prediksi: Sedih', text_color='blue')
        
        except Exception as e:
            label_output.configure(text=f'Error: {str(e)}', text_color='red')

    label_title = CTkLabel(master=baseplate, text='Pendeteksi Emosi Komentar', font=('Times New Roman', 20))
    label_title.place(relx=0.37, rely=0)

    label_input = CTkLabel(master=baseplate, text='Masukkan Komentar:', font=('Times New Roman', 14))
    label_input.place(relx=0.005, rely=0.21)
    
    input_data = CTkEntry(master=baseplate, placeholder_text='Masukkan Komentar di sini...', width=300, height=35)
    input_data.place(relx=0.20, rely=0.2)
    
    button = CTkButton(master=baseplate, text='Enter', command=prep_gui, height=35, width=85)
    button.place(relx=0.68, rely=0.2)
    
    label_output = CTkLabel(master=baseplate, text='', font=('Times New Roman', 15))
    label_output.place(relx=0.40, rely=0.35)

    baseplate.mainloop()

In [57]:
gui_call()