In [7]:
import pandas as pd 
import numpy as np
import nltk

In [8]:
import json

with open('datasets/emotion_twitter_data.json') as fopen:
    myfile = json.load(fopen)

In [9]:
data_anger = pd.DataFrame(myfile['anger'], columns = ['Text'])
data_anger['Emotion'] = 0
print(data_anger.head())
print(len(data_anger))

                                                Text  Emotion
0  Hidup ni jgn terlalu nk mendongak ke atas, nan...        0
1             @AyekKamal yer lah sbb sombong mmg lah        0
2  Ni pukul berapa tah nak sampai ukm. Tetiba jal...        0
3  Jenis-jenis orang stalking di media sosial:\n-...        0
4  Aku ada motor racing ,\naku bawa ronda ,\nawek...        0
108813


In [10]:
data_fear = pd.DataFrame(myfile['fear'], columns = ['Text'])
data_fear['Emotion'] = 1
print(data_fear.head())
print(len(data_fear))

                                                Text  Emotion
0  mau nonton annabelle tunggu partner setiaku av...        1
1  Banyak orang yang masih ragu dan merasa takut ...        1
2  @sshazazul Takut aaaaaaa sebab dok baca belaka...        1
3                 Mau tidur takut diketawain bantal         1
4                 @enanuars Takut nak percaya nan :)        1
20316


In [11]:
data_happy = pd.DataFrame(myfile['happy'], columns = ['Text'])
data_happy['Emotion'] = 2
print(data_happy.head())
print(len(data_happy))

                                                Text  Emotion
0  @kompascom Bapa saya suka pake Oppo..saya suka...        2
1  Pak prabowo itu vibesnya kebun binatang banget...        2
2                    @SyedSaddiq Happy fasting, yb!!        2
3  Ya Allah happy nya air asia ade sale in few da...        2
4  Happy Gawai &amp; Hari Raya Puasa 2019 https:/...        2
30962


In [12]:
data_love = pd.DataFrame(myfile['love'], columns = ['Text'])
data_love['Emotion'] = 3
print(data_love.head())
print(len(data_love))

                                                Text  Emotion
0                     Hi, Baby baru bangun Baby emo.        3
1                      Kenapa suami orang handsome ?        3
2  Alhamdulillah landed sudah di malaysia.. sumpa...        3
3  Aku tak rindu kau tapi asal kau selalu ade dal...        3
4  @indomymenfess pada saat ngga sengaja ketemu k...        3
20783


In [13]:
data_sadness = pd.DataFrame(myfile['sadness'], columns = ['Text'])
data_sadness['Emotion'] = 4
print(data_sadness.head())
print(len(data_sadness))

                                                Text  Emotion
0  ternyata kl lg sdih bisa ngasilin makanan enak...        4
1                                  Kekasih bayangan.        4
2                                  kecewa...........        4
3  Senin, 22 April 2019 kita memperingati hari Bu...        4
4       aku sedih ni tak ada siapa nak hiburkan ke ?        4
26468


In [14]:
data_surprise = pd.DataFrame(myfile['surprise'], columns = ['Text'])
data_surprise['Emotion'] = 5
print(data_surprise.head())
print(len(data_surprise))

                                                Text  Emotion
0                             Hilang nyawaku aku tgk        5
1  @ShoutOut3Sub Miki yang sedang tidak fokus pun...        5
2  Aku syak lecturer aku ni suka buat surprise bi...        5
3                                 Terkejut terheran2        5
4     Nak surprise boyfriend tapi maaing2 jauh zzzz         5
13107


In [15]:
# Combine both dataframes into one master dataframe
data = pd.concat([data_anger, data_fear, data_happy, data_love, data_sadness, data_surprise], ignore_index = True)
print(data)
print(len(data))

                                                     Text  Emotion
0       Hidup ni jgn terlalu nk mendongak ke atas, nan...        0
1                  @AyekKamal yer lah sbb sombong mmg lah        0
2       Ni pukul berapa tah nak sampai ukm. Tetiba jal...        0
3       Jenis-jenis orang stalking di media sosial:\n-...        0
4       Aku ada motor racing ,\naku bawa ronda ,\nawek...        0
...                                                   ...      ...
220444  Tokti pun terkejut Aafiyah pandai sebut nama d...        5
220445  Tokti pun terkejut Aafiyah pandai sebut nama d...        5
220446  Tokti pun terkejut Aafiyah pandai sebut nama d...        5
220447  perlis negeri pertama clear covid takdelah ter...        5
220448  Tokti pun terkejut Aafiyah pandai sebut nama d...        5

[220449 rows x 2 columns]
220449


In [48]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from collections import Counter
import string
import re

# download required library from nltk
# nltk.download('stopwords')
# nltk.download('punkt')

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

stop_words_ind = list(stopwords.words('indonesian'))
stop_words_eng = list(stopwords.words('english'))
stop_words_custom = ['kau', 'yg', 'mcm', 'gak', 'nak', 'ni', 'tu', 'la', 'je', 'kat', 'ya', 'dgn', 'tau', 'org', 'rt', 'aja', 'nk', 'dah',
                        'orang', 'sy', 'ga', 'kalo', 'kena']
stop_words = np.unique(stop_words_ind+stop_words_eng+stop_words_custom)

def text_preprocessing(text):

    # remove numbers
    text = re.sub(r'\d+', '', text)
    # remove links
    text = re.sub('http[s]?://\S+', '', text)

    # tokennization
    tokens = word_tokenize(text)

    # lemmetization and remove punctuation
    words = []
    for token in tokens:
        if token not in string.punctuation:
            temp = stemmer.stem(token)
            words.append(temp)

    # remove stopwords
    cleaned = []
    for word in words:
        if word not in stop_words:
            cleaned.append(word)

    # traverse in the string     
    complete_sentence = ' '.join([str(word) for word in cleaned])
    
    return complete_sentence

def most_used_words(data,str_input):

    words_list = []
    temp = []
    for index, row in data.iterrows():
        tokens = str(row[str_input]).split() 
        for word in tokens:
            words_list.append(word)

    common_words = Counter(words_list).most_common(50)
    for key, value in common_words:
        temp.append({'name': key, 'value': value})

    return temp

In [49]:
text_cleaning = lambda x: text_preprocessing(x)
data['Cleaned_Text'] = pd.DataFrame(data['Text'].apply(text_cleaning))
data['Cleaned_Text'].head()

0                    hidup jgn dongak jatuh padan muka
1                        ayekkamal yer sbb sombong mmg
2              tah ukm tetiba jalan tutup pulak jalan 
3    jenis stalking media sosial pakai akun palsu p...
4    motor racing bawa ronda awek lu bonceng dar da...
Name: Cleaned_Text, dtype: object

In [50]:
# get common words in data
word_list = most_used_words(data,'Cleaned_Text')
word_list

[{'name': 'happy', 'value': 22757},
 {'name': 'bodoh', 'value': 20963},
 {'name': 'sakit', 'value': 19251},
 {'name': 'takut', 'value': 18280},
 {'name': 'hati', 'value': 15708},
 {'name': 'kecewa', 'value': 14304},
 {'name': 'malas', 'value': 12463},
 {'name': 'komunis', 'value': 11711},
 {'name': 'mati', 'value': 11681},
 {'name': 'rindu', 'value': 10302},
 {'name': 'suka', 'value': 10252},
 {'name': 'kejut', 'value': 10140},
 {'name': 'tinggal', 'value': 9795},
 {'name': 'sayang', 'value': 9316},
 {'name': 'cinta', 'value': 9047},
 {'name': 'sedih', 'value': 8195},
 {'name': 'marah', 'value': 8163},
 {'name': 'kes', 'value': 7949},
 {'name': 'tengok', 'value': 7867},
 {'name': 'benci', 'value': 7614},
 {'name': 'kapitalis', 'value': 7401},
 {'name': 'jatuh', 'value': 7400},
 {'name': 'amp', 'value': 7327},
 {'name': 'muka', 'value': 7180},
 {'name': 'dukacita', 'value': 7098},
 {'name': 'pergi', 'value': 6880},
 {'name': 'cakap', 'value': 6707},
 {'name': 'rumah', 'value': 6695},
 {

In [51]:
# SPLIT TRAINING & TESTING DATA
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['Cleaned_Text'],data['Emotion'],test_size=0.2,shuffle=True, random_state=42)
print(X_train.shape, y_train.shape)

(176359,) (176359,)


In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC 
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

def sentiment_pipeline(data_train_input,data_train_target,model_type):
    # Classifier selection
    if model_type == "linear":
        classifier = LinearSVC()
    elif model_type == "logistic":
        classifier = LogisticRegression(max_iter=1000)
    elif model_type == "sgd":
        classifier = SGDClassifier()
    elif model_type == "naive_bayes":
        classifier = MultinomialNB()
    elif model_type == "xgboost":
        classifier = XGBClassifier(use_label_encoder=False,eta=0.1,gamma=0.3, n_estimators=100, learning_rate=0.5, min_child_weight=5, 
        max_depth=5, colsample_bytree=0.7,objective="multi:softmax", eval_metric="mlogloss",verbosity=0)

    tfidf = TfidfVectorizer()

    # Pipeline setup
    clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])

    model = clf.fit(data_train_input,data_train_target)

    return model

def sentiment_model_predict(model,data_test_input,data_test_target):
    data_prediction=model.predict(data_test_input)
    conf_matrix = confusion_matrix(data_test_target,data_prediction)
    acc_score = accuracy_score(data_test_target, data_prediction)
    pre_score = precision_score(data_test_target, data_prediction, average="macro")
    re_score = recall_score(data_test_target, data_prediction, average="macro")
    f_score = f1_score(data_test_target, data_prediction, average="macro")

    print("Accuracy : "+str(round(acc_score*100,2)))
    print("Precision : "+str(round(pre_score*100,2)))
    print("Recall : "+str(round(re_score*100,2)))
    print("F1-Score :"+str(round(f_score*100,2)))
    print(conf_matrix)

In [53]:
# Support Vector Classification
svm_model = sentiment_pipeline(X_train, y_train, 'linear')
sentiment_model_predict(svm_model,X_test,y_test)

Accuracy : 98.13
Precision : 97.78
Recall : 97.78
F1-Score :97.78
[[21411    60    50    73    68    60]
 [   55  3987     3     8     5     7]
 [   31     7  6019    16    11     5]
 [   93    10    19  4072    20     7]
 [  108    12    14    16  5260     3]
 [   42     5     7     5     5  2516]]


In [54]:
# Logistic Regression
lr_model = sentiment_pipeline(X_train, y_train, 'logistic')
sentiment_model_predict(lr_model,X_test,y_test)

Accuracy : 97.9
Precision : 97.55
Recall : 97.4
F1-Score :97.47
[[21394    66    54    79    65    64]
 [   65  3970     6     9     6     9]
 [   37     8  6009    17    14     4]
 [   99    12    18  4064    18    10]
 [  128    13    14    23  5232     3]
 [   63    10     5     6     3  2493]]


In [55]:
# Stochastic Gradient Descent
sgd_model = sentiment_pipeline(X_train, y_train, 'sgd')
sentiment_model_predict(sgd_model,X_test,y_test)

Accuracy : 97.08
Precision : 96.63
Recall : 96.54
F1-Score :96.58
[[21241   160    73    98    75    75]
 [   68  3970     5     8     4    10]
 [   80    12  5961    22     9     5]
 [  144    33    25  3993    14    12]
 [  176    32    12    22  5166     5]
 [   88     9     2     7     1  2473]]


In [56]:
# Multinomial Naive Bayes
nb_model = sentiment_pipeline(X_train, y_train, 'naive_bayes')
sentiment_model_predict(nb_model,X_test,y_test)

Accuracy : 73.33
Precision : 92.05
Recall : 54.58
F1-Score :64.32
[[21679     1    11     8    21     2]
 [ 2691  1335    24     3    11     1]
 [ 2758     4  3295     8    24     0]
 [ 2021     5   125  2040    27     3]
 [ 2312     5    35     8  3052     1]
 [ 1588     3    52     2     6   929]]


In [57]:
# Xgboost
xg_model = sentiment_pipeline(X_train, y_train, 'xgboost')
sentiment_model_predict(xg_model,X_test,y_test)

Accuracy : 98.53
Precision : 98.14
Recall : 98.22
F1-Score :98.18
[[21466    50    45    57    46    58]
 [   51  3991     3     4     9     7]
 [   28     9  6026     9    12     5]
 [   63     6    16  4104    29     3]
 [   33    13    10    12  5342     3]
 [   50     4     6     2     4  2514]]


In [58]:
import joblib

# save model
joblib_file_svm = "joblib_SVM_Model.pkl"  
joblib.dump(svm_model, joblib_file_svm)

joblib_file_xg = "joblib_XGB_Model.pkl"  
joblib.dump(xg_model, joblib_file_xg)

joblib_file_lr = "joblib_LR_Model.pkl"  
joblib.dump(lr_model, joblib_file_lr)

joblib_file_sgd = "joblib_SGD_Model.pkl"  
joblib.dump(sgd_model, joblib_file_sgd)

['joblib_SVM_Model.pkl']

In [60]:
# load model
joblib_SVM_model = joblib.load(joblib_file_svm)
sentiment_model_predict(joblib_SVM_model,X_test,y_test)

joblib_XGB_model = joblib.load(joblib_file_xg)
sentiment_model_predict(joblib_XGB_model,X_test,y_test)

joblib_LR_model = joblib.load(joblib_file_lr)
sentiment_model_predict(joblib_LR_model,X_test,y_test)

joblib_SGD_model = joblib.load(joblib_file_sgd)
sentiment_model_predict(joblib_SGD_model,X_test,y_test)

Accuracy : 98.13
Precision : 97.78
Recall : 97.78
F1-Score :97.78
[[21411    60    50    73    68    60]
 [   55  3987     3     8     5     7]
 [   31     7  6019    16    11     5]
 [   93    10    19  4072    20     7]
 [  108    12    14    16  5260     3]
 [   42     5     7     5     5  2516]]


In [62]:
# gembira: 'Seronok dapat balik kampung tahun depan'
# takut: 'Cuak do malam ni'
# marah: 'Awat bodo sgt perangai. Tkde otak ka?'
# cinta: 'Ahhh rindunya awek aku. Lama tk jumpa'
# sedih: 'Sedih la asyik habis stock ja barang ni'
# terkejut: 'Tiba2 lecturer buat surprise quiz harini'

test = {'Text': [
    'Seronok dapat balik kampung tahun depan',
    'Cuak do malam ni',
    'Awat bodo sgt perangai. Tkde otak ka?',
    'Ahhh rindunya awek aku. Lama tk jumpa',
    'Sedih la asyik habis stock ja barang ni',
    'Tiba2 lecturer buat surprise quiz harini',
]}
check_data = pd.DataFrame(test)
# clean text
text_cleaning = lambda x: text_preprocessing(x)
check_data['Cleaned_Text'] = pd.DataFrame(check_data['Text'].apply(text_cleaning))
check_data

Unnamed: 0,Text,Cleaned_Text
0,Seronok dapat balik kampung tahun depan,seronok kampung
1,Cuak do malam ni,cuak malam
2,Awat bodo sgt perangai. Tkde otak ka?,awat bodo sgt perangai tkde otak ka
3,Ahhh rindunya awek aku. Lama tk jumpa,ahhh rindu awek tk jumpa
4,Sedih la asyik habis stock ja barang ni,sedih asyik habis stock ja barang
5,Tiba2 lecturer buat surprise quiz harini,lecturer surprise quiz harini


In [63]:
# Predict
pred_data=joblib_SVM_model.predict(check_data['Cleaned_Text'])
check_data['Predicted'] = pred_data
check_data
# if pred_data[0] == 0:
#     print('Marah')
# elif pred_data[0] == 1:
#     print('Takut')
# elif pred_data[0] == 2:
#     print('Gembira')
# elif pred_data[0] == 3:
#     print('Cinta')
# elif pred_data[0] == 4:
#     print('Sedih')
# elif pred_data[0] == 5:
#     print('Terkejut')

Unnamed: 0,Text,Cleaned_Text,Predicted
0,Seronok dapat balik kampung tahun depan,seronok kampung,2
1,Cuak do malam ni,cuak malam,1
2,Awat bodo sgt perangai. Tkde otak ka?,awat bodo sgt perangai tkde otak ka,0
3,Ahhh rindunya awek aku. Lama tk jumpa,ahhh rindu awek tk jumpa,3
4,Sedih la asyik habis stock ja barang ni,sedih asyik habis stock ja barang,4
5,Tiba2 lecturer buat surprise quiz harini,lecturer surprise quiz harini,5


In [64]:
# Predict
pred_data=joblib_XGB_model.predict(check_data['Cleaned_Text'])
check_data['Predicted'] = pred_data
check_data

Unnamed: 0,Text,Cleaned_Text,Predicted
0,Seronok dapat balik kampung tahun depan,seronok kampung,2
1,Cuak do malam ni,cuak malam,1
2,Awat bodo sgt perangai. Tkde otak ka?,awat bodo sgt perangai tkde otak ka,0
3,Ahhh rindunya awek aku. Lama tk jumpa,ahhh rindu awek tk jumpa,3
4,Sedih la asyik habis stock ja barang ni,sedih asyik habis stock ja barang,4
5,Tiba2 lecturer buat surprise quiz harini,lecturer surprise quiz harini,5
