In [2]:
import pandas as pd
import numpy as np
from models.MKNN import ModifiedKNN
import neattext.functions as nfx
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
# Load the raw crawl, keeping only the 'date' and 'text' columns.
# NOTE(review): latin1 can decode any byte sequence without raising, so a UTF-8
# file would load silently with mangled non-ASCII characters — confirm the
# real encoding of the crawl file.
df = pd.read_csv('Twitter_fresh/twitter_crawling.csv',encoding='latin1', usecols=['date','text'])
df.head()

# Preprocessing

In [None]:
def casefolding(Text):
    """Return ``Text`` converted to lowercase."""
    return Text.lower()

In [None]:
# Lowercase every tweet; the 'text' column is overwritten in place.
df['text'] = df['text'].apply(casefolding)
df.head()

In [None]:
def punc_clean(Text):
    """Pass ``Text`` through a fixed sequence of neattext cleaners.

    The cleaners run in this order, each receiving the previous one's output:
    ``remove_urls``, ``remove_punctuations``, ``remove_emojis``,
    ``remove_special_characters``, ``remove_numbers``.
    """
    cleaning_steps = (
        nfx.remove_urls,
        nfx.remove_punctuations,
        nfx.remove_emojis,
        nfx.remove_special_characters,
        nfx.remove_numbers,
    )
    for step in cleaning_steps:
        Text = step(Text)
    return Text

In [None]:
# Run punc_clean on every tweet; the 'text' column is overwritten in place.
df['text'] = df['text'].apply(punc_clean)
df.head()

In [None]:
def word_tokenize_wrapper(Text):
    """Tokenize ``Text`` with NLTK's ``word_tokenize`` and return its result."""
    tokens = word_tokenize(Text)
    return tokens

In [None]:
# Tokenize every tweet; from here on each 'text' cell holds the tokenizer's
# output instead of a plain string.
df['text'] = df['text'].apply(word_tokenize_wrapper)
df.head()

In [None]:
# Slang dictionaries keyed by CSV path, so each file is parsed once instead of
# once per tweet. Re-running this cell resets the cache.
_SLANG_DICT_CACHE = {}


def _load_slang_dict(path):
    """Return the slang -> replacement mapping stored in the CSV at ``path``."""
    if path not in _SLANG_DICT_CACHE:
        word_dict = pd.read_csv(path)
        mapping = {}
        # Column 0 is the lookup key and column 1 its replacement. When a key
        # appears more than once, the first occurrence wins.
        for slang, replacement in zip(word_dict.iloc[:, 0], word_dict.iloc[:, 1]):
            mapping.setdefault(slang, replacement)
        _SLANG_DICT_CACHE[path] = mapping
    return _SLANG_DICT_CACHE[path]


def word_norm(tweets, dict_path='data/indonesia_slangWords.csv'):
    """Replace slang tokens with the normalized form from a lookup CSV.

    Parameters
    ----------
    tweets : iterable of str
        Tokens of a single tweet.
    dict_path : str, optional
        CSV whose first column is the slang term and whose second column is
        its replacement. Defaults to the project's slang dictionary.

    Returns
    -------
    list
        One entry per input token: the replacement when the token is in the
        dictionary, otherwise the token unchanged.
    """
    norm_word_dict = _load_slang_dict(dict_path)
    return [norm_word_dict.get(term, term) for term in tweets]

In [None]:
# Normalize slang tokens in every tweet via word_norm; each 'text' cell
# becomes the list that word_norm returns.
df['text'] = df['text'].apply(word_norm)
df.head()

In [None]:
# Stop-word sets keyed by the extra CSV path, so the NLTK lists and the CSV
# are loaded once instead of once per tweet. Re-running this cell resets it.
_STOPWORD_CACHE = {}


def _load_stopwords(path):
    """Return the combined stop-word set (NLTK lists plus the CSV at ``path``)."""
    if path not in _STOPWORD_CACHE:
        # The second positional argument of `stopwords.words` is
        # `ignore_lines_startswith`, not another language, so each language
        # needs its own call.
        words = set(stopwords.words('indonesian'))
        words.update(stopwords.words('english'))
        # Read every line as a word. `header=None` keeps the first line, and
        # `keep_default_na=False` stops words such as "null" or "nan" from
        # being parsed as missing values.
        extra = pd.read_csv(path, header=None, dtype=str, keep_default_na=False)
        words.update(w.strip().lower() for w in extra.iloc[:, 0] if w.strip())
        _STOPWORD_CACHE[path] = words
    return _STOPWORD_CACHE[path]


def remove_stopword(Text, stopword_path='data/stopwordbahasa.csv'):
    """Drop stop words from a tokenized tweet.

    Parameters
    ----------
    Text : iterable of str
        Tokens of a single tweet.
    stopword_path : str, optional
        One-word-per-line CSV with additional stop words. Defaults to the
        project's Indonesian list.

    Returns
    -------
    list of str
        Tokens whose lowercase form is not a stop word. Tokens are re-split on
        whitespace first, so a token containing spaces yields several entries.
    """
    stopW = _load_stopwords(stopword_path)
    remove_sw = ' '.join(Text)
    return [word for word in remove_sw.split() if word.lower() not in stopW]

In [None]:
# Filter stop words out of every tweet via remove_stopword; each 'text' cell
# becomes the list that remove_stopword returns.
df['text'] = df['text'].apply(remove_stopword)
df.head()

In [None]:
# Shared stemmer instance, created on first use so it is not rebuilt for
# every tweet. Re-running this cell resets it.
_STEMMER = None


def _get_stemmer():
    """Return the shared Sastrawi stemmer, creating it on first use."""
    global _STEMMER
    if _STEMMER is None:
        _STEMMER = StemmerFactory().create_stemmer()
    return _STEMMER


def indo_stem(Text):
    """Stem every token of a tweet and return them as one string.

    Parameters
    ----------
    Text : iterable of str
        Tokens of a single tweet.

    Returns
    -------
    str
        The stemmed tokens separated by exactly one space, with no leading or
        trailing whitespace. An empty input yields an empty string.
    """
    stemmer = _get_stemmer()
    # Join once. Appending an extra " " per token and then joining on " "
    # left triple-space gaps and trailing blanks in the output.
    return " ".join(stemmer.stem(w) for w in Text)

In [None]:
# Stem every tweet via indo_stem; each 'text' cell becomes a single string
# again, since indo_stem returns a joined string.
df['text'] = df['text'].apply(indo_stem)
df.head()

# Sentiment

In [3]:
from sklearn.model_selection import KFold
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import json

In [35]:
def Sentiment_analysis():
    """Label every cleaned tweet as "Positive" or "Negative" with VADER.

    Two VADER analyzers are stripped of their default lexicon and loaded with
    custom lexicons read as JSON from ``data/lexicon_sentimen_negatif.txt``
    and ``data/lexicon_sentimen_positif.txt``. A tweet is "Positive" when the
    sum of both compound scores is strictly greater than 0; a sum of 0 or
    less is labelled "Negative".

    Side effects
    ------------
    Writes one label per line to ``output/Sentiment-result.txt``.

    Returns
    -------
    pandas.DataFrame
        The cleaned tweets with an added ``Sentiment`` column.
    """
    # Forward slashes work on every platform. The previous backslash path
    # contained invalid escape sequences and only resolved on Windows.
    df = pd.read_csv('Twitter_fresh/Clean Data/Twitter_clean.csv', encoding='utf-8')

    # VADER analyzers that will carry the custom lexicons.
    sia1A, sia1B = SentimentIntensityAnalyzer(), SentimentIntensityAnalyzer()
    # Drop VADER's default lexicon.
    sia1A.lexicon.clear()
    sia1B.lexicon.clear()

    # Read the custom lexicons (JSON objects mapping term -> score) and make
    # sure the file handles are closed.
    with open('data/lexicon_sentimen_negatif.txt', 'r') as neg_file:
        insetNeg = json.load(neg_file)
    with open('data/lexicon_sentimen_positif.txt', 'r') as pos_file:
        insetPos = json.load(pos_file)

    # Load the custom lexicons into the emptied analyzers.
    sia1A.lexicon.update(insetNeg)
    sia1B.lexicon.update(insetPos)

    def is_positive_inset(Text: str) -> bool:
        """Return True when the summed compound scores are above zero."""
        return sia1A.polarity_scores(Text)["compound"] + sia1B.polarity_scores(Text)["compound"] > 0

    labels = [
        "Positive" if is_positive_inset(tweet) else "Negative"
        for tweet in df['text'].to_list()
    ]

    # Keep writing the label file, which is an output artefact of this step.
    with open('output/Sentiment-result.txt', 'w+') as f:
        f.writelines(label + "\n" for label in labels)

    # Attach the labels directly instead of parsing the file back in.
    return df.assign(Sentiment=labels)

In [36]:
# Label the cleaned tweets; the returned frame carries a 'Sentiment' column.
sentiment_result = Sentiment_analysis()

In [37]:
# Class balance of the lexicon-based labels.
sentiment_result['Sentiment'].value_counts()

Negative    1444
Positive     556
Name: Sentiment, dtype: int64

In [39]:
# Persist the labelled tweets (without the index column).
sentiment_result.to_csv('output/Sentiment_result.csv', index=False)

# Models Algorithm

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import f1_score, recall_score
from heapq import nsmallest as nMin

In [46]:
# K-fold cross-validation of ModifiedKNN on TF-IDF features.
# After the loop, `pred`, `jarak`, `neigbor_index`, `X_test` and the two dump
# files hold the values of the LAST fold only, because every fold overwrites
# them; the per-fold metrics are accumulated in `acc`, `pr`, `rc` and `f1`.
k_value = 1 # 1 - 25
X = sentiment_result['text'].values
y = sentiment_result['Sentiment'].values
fold_i = 1
fold_n = 5 # 3 5 7 10
sum_accuracy = 0
kfold = KFold(fold_n, shuffle=True, random_state=42)
enc = LabelEncoder()
fol = []
acc, rc, pr, f1 = [], [], [], []

for train_index, test_index in kfold.split(X):
    fol.append(fold_i)
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]

    # Dump this fold's test texts and training labels. The `with` blocks
    # close (and flush) the files so later cells read complete data; the
    # handles were previously left open.
    with open('output/ResultX.txt', 'w') as svf:
        sv_text = '\n'.join(str(item) for item in X_test).replace("   "," ")
        svf.write(sv_text)
    with open('output/y_train.txt', 'w') as svY:
        svY.write('\n'.join(str(item) for item in y_train))

    # TF-IDF: fit the vocabulary on the training split only, then reuse it
    # for the test split.
    tf = TfidfVectorizer(decode_error="replace")
    X_train = tf.fit_transform(X_train)
    X_test = tf.transform(X_test)

    # Encode the string labels as integers.
    y_train = enc.fit_transform(y_train)
    y_test = enc.transform(y_test)

    # Algorithm
    clf = ModifiedKNN(k_value)
    clf.fit(X_train, y_train)
    pred, jarak = clf.predict(X_test)
    neigbor_index = clf.get_neigbors(X_test)

    # Metrics, expressed as percentages.
    #cm = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)*100
    precision = precision_score(y_test, pred)*100
    recall = recall_score(y_test, pred)*100
    f1_scores = f1_score(y_test, pred)*100
    #plot_conf_metrics(y_test, pred)

    sum_accuracy += accuracy

    fold_i += 1
    acc.append(accuracy)
    pr.append(precision)
    rc.append(recall)
    f1.append(f1_scores)

0 = neigbor: [0]
0 = neigbor: [0]
1 = neigbor: [1]
0 = neigbor: [0]
0 = neigbor: [1]
1 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [0]
1 = neigbor: [1]
0 = neigbor: [0]
0 = neigbor: [1]
0 = neigbor: [0]
0 = neigbor: [1]
0 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [1]
0 = neigbor: [0]
1 = neigbor: [1]
1 = neigbor: [1]
0 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [0]
1 = neigbor: [1]
0 = neigbor: [0]
1 = neigbor: [1]
1 = neigbor: [0]
1 = neigbor: [1]
0 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [1]
0 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [1]
0 = neigbor: [0]
0 = neigbor: [0]
1 = neigbor: [1]
0 = neigbor: [0]
1 = neigbor: [1]
0 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [1]
1 = neigbor: [1]
0 = neigbor: [1]
0 = neigbor: [0]
0 = neigbor: [1]
0 = neigbor: [0]
1 = neigbor: [0]
0 = neigbor: [0]
0 = neigbor: [0]
1 = neigbor: [1]
0 = neigbor: [0]
0 = neigbor: [

In [47]:
# Dump the outputs of the most recent predict/get_neigbors call, one entry
# per line, so the next cell can rebuild a table from them.
with open("output/MKNN_prediction.txt", "w") as f:
    mknn_predited_label ='\n'.join(str(item) for item in pred)
    f.write(mknn_predited_label)
with open('output/jarak_ttg.txt', 'w') as g:
    # For each row of `jarak`, keep only its k_value smallest values (as floats).
    # Note that this rebinds the notebook-level name `jarak`.
    jarak = [nMin(k_value,map(float,i)) for i in jarak]
    mknn_distance = '\n'.join(str(ls) for ls in jarak)
    g.write(mknn_distance)
with open('output/index_ttg.txt', 'w') as j:
    j.write('\n'.join(str(a) for a in neigbor_index))

In [48]:
# Rebuild a per-tweet table from the dumped prediction files; rows are
# aligned by line number (the default integer index).
knn_pred = pd.read_csv('output/MKNN_prediction.txt', names=['Sentiment'])
jarak_pred = pd.read_csv('output/jarak_ttg.txt', names=['Distance'], sep='\t')
text_test = pd.read_csv('output/ResultX.txt', names=['text'])
# Each line is the string form of one neighbour list. Use a tab separator
# (as for the distance file) so a list such as "[1, 2]" stays in one column.
# With the default comma separator it would split into several fields and
# the extra leading fields would become the index, breaking the join below.
index_pred = pd.read_csv('output/index_ttg.txt', names=['Neigbor'], sep='\t')
text_test = text_test.join(knn_pred)
text_test = text_test.join(jarak_pred)
text_test = text_test.join(index_pred)
# Map the encoded prediction back to a label: 1 -> 'Positive', anything
# else -> 'Negative'.
text_test['Sentiment'] = text_test['Sentiment'].apply(lambda x: 'Positive' if x == 1 else 'Negative')
text_test = text_test.dropna()

In [49]:
# Preview the first rows of the rebuilt prediction table.
text_test.head(11)

Unnamed: 0,text,Sentiment,Distance,Neigbor
0,selesai watching money heist korea ending gant...,Negative,[1.1079990385537086],[270]
1,namatin money heist season kangen nairobi,Negative,[1.005394169933547],[1546]
2,tema money heist iya sayanga,Positive,[0.9585774097246824],[7]
3,sebel banget deh nonton film film bagus netfli...,Negative,[0.24177405417761336],[1095]
4,norak banget nonton money heist spanyol seru b...,Positive,[0.8553832625540789],[1337]
5,scene cctv polisi series money heist,Negative,[1.0964759255118195],[294]
6,inget ending film money heist keruk bagiin rak...,Negative,[1.0536712127723509e-08],[32]
7,senin baca review money heist versi korea pena...,Positive,[1.0536712127723509e-08],[15]
8,chara peama akun ursula corbero tokyo money he...,Positive,[1.1198043418750985],[1426]
9,rio money heist korea park bogum encem,Negative,[1.177037673434913],[1252]


In [50]:
# Test features of the most recent fold, side by side with the read-back predictions.
new_frame = pd.DataFrame(X_test).join(knn_pred)

# Aggregate the per-fold accuracies collected during cross-validation.
avg_acc = sum_accuracy/fold_n
maxs, mins = max(acc), min(acc)
res_df = pd.DataFrame({
    'iterasi': fol,
    'Accuracy': acc,
    'Precison': pr,
    'Recall': rc,
    'f1 score': f1,
})
# Fold numbers are 1-based, hence the +1 on the list position.
print("Avearge accuracy : ", "%.4f" % avg_acc + '%')
print("Max Score : ", maxs, "in Fold : ", acc.index(maxs) + 1)
print("Min Score : ", mins, "in Fold : ", acc.index(mins) + 1)

Avearge accuracy :  77.1000%
Max Score :  78.75 in Fold :  1
Min Score :  74.5 in Fold :  2


In [51]:
# Per-fold metric table built in the summary cell.
res_df.head(11)

Unnamed: 0,K Fold,Accuracy,Precison,Recall,f1 score
0,1,78.75,61.445783,49.038462,54.545455
1,2,74.5,58.441558,39.130435,46.875
2,3,77.25,63.095238,46.902655,53.807107
3,4,77.25,65.853659,46.153846,54.271357
4,5,77.75,58.823529,56.074766,57.416268


In [None]:
import matplotlib.pyplot as plot

In [None]:
# Line chart of the four per-fold metrics stored in res_df, one line per metric.
res_df[['Accuracy', 'Precison', 'Recall', 'f1 score']].plot.line(title="Akurasi tiap Fold")
plot.show(block=True)