In [1]:
import pandas as pd
import numpy as np
from models.MKNN import ModifiedKNN
import neattext.functions as nfx
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
# Load the crawled tweets, keeping only the 'date' and 'text' columns.
# The file is decoded as latin1.
df = pd.read_csv('Twitter_fresh/twitter_crawling.csv',encoding='latin1', usecols=['date','text'])
df.head()

# Preprocessing

In [None]:
def casefolding(Text):
    """Return the given string converted to lower case."""
    return Text.lower()

In [None]:
# Lower-case every tweet (overwrites df['text'] in place), then preview.
df['text'] = df['text'].apply(casefolding)
df.head()

In [None]:
def punc_clean(Text):
    """Strip noise from a tweet using neattext's remover functions.

    The removers run in a fixed order: URLs, punctuation, emojis,
    special characters, then numbers. Each one receives the output of the
    previous step, and the final string is returned.
    """
    cleaners = (
        nfx.remove_urls,
        nfx.remove_punctuations,
        nfx.remove_emojis,
        nfx.remove_special_characters,
        nfx.remove_numbers,
    )
    for clean in cleaners:
        Text = clean(Text)
    return Text

In [None]:
# Remove URLs, punctuation, emojis, special characters and numbers from every
# tweet (overwrites df['text'] in place), then preview.
df['text'] = df['text'].apply(punc_clean)
df.head()

In [None]:
def word_tokenize_wrapper(Text):
    """Tokenize a tweet string with NLTK's word_tokenize and return the tokens."""
    tokens = word_tokenize(Text)
    return tokens

In [None]:
# Tokenize every tweet; from here on df['text'] holds token sequences
# instead of plain strings.
df['text'] = df['text'].apply(word_tokenize_wrapper)
df.head()

In [None]:
# Parsed slang dictionaries keyed by CSV path, so the file is read once per
# path instead of once per tweet.
_SLANG_DICT_CACHE = {}


def _load_slang_dict(dict_path):
    """Return the {slang: normalized} mapping stored in the CSV at dict_path.

    The first CSV column is the slang term and the second its replacement.
    When a slang term occurs more than once, the first occurrence wins.
    """
    if dict_path not in _SLANG_DICT_CACHE:
        word_dict = pd.read_csv(dict_path)
        mapping = {}
        # Explicit positional access (.iloc) avoids the deprecated row[0] lookup.
        for slang, normalized in zip(word_dict.iloc[:, 0], word_dict.iloc[:, 1]):
            mapping.setdefault(slang, normalized)
        _SLANG_DICT_CACHE[dict_path] = mapping
    return _SLANG_DICT_CACHE[dict_path]


def word_norm(tweets, dict_path='data/indonesia_slangWords.csv'):
    """Replace slang tokens with their normalized form.

    Args:
        tweets: iterable of tokens belonging to one tweet.
        dict_path: CSV file whose first two columns map slang -> normalized
            word. Defaults to the project's Indonesian slang dictionary.

    Returns:
        list: the tokens in their original order, with every token found in
        the dictionary replaced and all other tokens left untouched.
    """
    norm_word_dict = _load_slang_dict(dict_path)
    return [norm_word_dict.get(term, term) for term in tweets]

In [None]:
# Normalize slang tokens in every tweet (overwrites df['text'] in place),
# then preview.
df['text'] = df['text'].apply(word_norm)
df.head()

In [None]:
# Combined stopword set, built lazily on first use and then reused.
_STOPWORD_SET = None


def _load_stopwords():
    """Build (once) the set of Indonesian + English + custom stopwords."""
    global _STOPWORD_SET
    if _STOPWORD_SET is None:
        # stopwords.words() takes one corpus per call; a second positional
        # argument is NOT another language, so the lists are merged explicitly.
        combined = set(stopwords.words('indonesian'))
        combined.update(stopwords.words('english'))
        # header=None so the first line is treated as a word, not as a header.
        # keep_default_na=False keeps words such as "na" as strings.
        extra = pd.read_csv('data/stopwordbahasa.csv', header=None,
                            keep_default_na=False)
        # Iterate over the column VALUES; iterating the DataFrame itself would
        # only yield its column labels.
        combined.update(
            word for word in extra.iloc[:, 0].astype(str).str.strip().str.lower()
            if word
        )
        _STOPWORD_SET = frozenset(combined)
    return _STOPWORD_SET


def remove_stopword(Text):
    """Drop stopwords from a tokenized tweet.

    Args:
        Text: iterable of string tokens belonging to one tweet.

    Returns:
        list: the tokens (re-split on whitespace) whose lower-cased form is
        not in the NLTK Indonesian/English stopword lists or in
        'data/stopwordbahasa.csv'.
    """
    stopW = _load_stopwords()
    remove_sw = ' '.join(Text)
    return [word for word in remove_sw.split() if word.lower() not in stopW]

In [None]:
# Remove stopwords from every tokenized tweet (overwrites df['text'] in place),
# then preview.
df['text'] = df['text'].apply(remove_stopword)
df.head()

In [None]:
# Sastrawi stemmer instance, created lazily on first use and then reused.
_INDO_STEMMER = None


def _get_indo_stemmer():
    """Return a shared Sastrawi stemmer, building it on the first call only."""
    global _INDO_STEMMER
    if _INDO_STEMMER is None:
        _INDO_STEMMER = StemmerFactory().create_stemmer()
    return _INDO_STEMMER


def indo_stem(Text):
    """Stem every token of a tweet and join the stems into one string.

    Args:
        Text: iterable of string tokens belonging to one tweet.

    Returns:
        str: the stemmed tokens separated by single spaces, with no trailing
        whitespace.
    """
    stemmer = _get_indo_stemmer()
    return " ".join(stemmer.stem(w) for w in Text)

In [None]:
# Stem every tweet; indo_stem joins the stems, so df['text'] holds plain
# strings again after this cell.
df['text'] = df['text'].apply(indo_stem)
df.head()

# Sentiment

In [2]:
from sklearn.model_selection import KFold
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import json

In [6]:
def Sentiment_analysis():
    """Label every cleaned tweet as "Positive" or "Negative" with custom lexicons.

    Reads 'Twitter_fresh/Clean Data/Twitter_clean_2000.csv' and scores the
    'text' column with two VADER analyzers. Their default lexicons are replaced
    by the JSON lexicons in 'data/lexicon_sentimen_negatif.txt' and
    'data/lexicon_sentimen_positif.txt'. A tweet is "Positive" when the sum of
    both compound scores is >= 0.05; every other tweet is "Negative".

    Side effects:
        Writes one label per line to 'output/Sentiment-result.txt'.

    Returns:
        pandas.DataFrame: the cleaned tweets plus a 'Sentiment' column.
    """
    df = pd.read_csv('Twitter_fresh/Clean Data/Twitter_clean_2000.csv', encoding='utf-8')

    def load_lexicon(path):
        # Context manager guarantees the lexicon file handle is closed.
        with open(path, 'r', encoding='utf-8') as lexicon_file:
            return json.load(lexicon_file)

    # VADER analyzers whose default lexicon is swapped for the custom ones.
    sia1A, sia1B = SentimentIntensityAnalyzer(), SentimentIntensityAnalyzer()
    sia1A.lexicon.clear()
    sia1B.lexicon.clear()
    sia1A.lexicon.update(load_lexicon('data/lexicon_sentimen_negatif.txt'))
    sia1B.lexicon.update(load_lexicon('data/lexicon_sentimen_positif.txt'))

    def is_positive_inset(Text: str) -> bool:
        # Combine the negative- and positive-lexicon compound scores.
        combined = (sia1A.polarity_scores(Text)["compound"]
                    + sia1B.polarity_scores(Text)["compound"])
        return combined >= 0.05

    labels = [
        "Positive" if is_positive_inset(tweet) else "Negative"
        for tweet in df['text'].to_list()
    ]

    # Keep the on-disk label dump for inspection.
    with open('output/Sentiment-result.txt', 'w') as f:
        f.writelines(label + "\n" for label in labels)

    # Attach labels directly instead of re-reading the file just written.
    return df.assign(Sentiment=labels)

In [7]:
# Run the lexicon-based labelling; the result feeds every modelling cell below.
sentiment_result = Sentiment_analysis()

In [8]:
# Class balance of the generated labels.
sentiment_result['Sentiment'].value_counts()

Negative    1560
Positive     440
Name: Sentiment, dtype: int64

In [10]:
# Persist the labelled tweets (without the index column).
sentiment_result.to_csv('output/Sentiment_result.csv', index=False)

# Modified K-NN (MKNN) model

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import f1_score, recall_score
from heapq import nsmallest as nMin
from tqdm import tqdm
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
def TFIDF_word_weight(vect, word_weight):
    """Expand a CSR TF-IDF matrix into a dense per-document weight table.

    Args:
        vect: fitted vectorizer exposing get_feature_names_out().
        word_weight: matrix exposing the CSR attributes data/indices/indptr.

    Returns:
        pandas.DataFrame: one row per document and one column per term that
        has a stored weight. Columns appear in order of first occurrence, and
        terms absent from a document are filled with 0.
    """
    vocabulary = np.array(vect.get_feature_names_out())
    values = word_weight.data
    columns = word_weight.indices
    row_ptr = word_weight.indptr

    per_doc_weights = []
    for doc_id in range(word_weight.shape[0]):
        start, stop = row_ptr[doc_id], row_ptr[doc_id + 1]
        # Stored entries of this row: term names paired with their weights.
        terms = vocabulary[columns[start:stop]]
        per_doc_weights.append(dict(zip(terms, values[start:stop])))

    return pd.DataFrame(per_doc_weights).fillna(0)

In [None]:
# Vectorizer for the whole-corpus TF-IDF export below.
# NOTE(review): the name `tf` is rebound inside the cross-validation loops.
tf = TfidfVectorizer(decode_error="replace")

In [None]:
# Fit TF-IDF on the full labelled corpus. `Xfeatures` is first the raw text
# column and is then rebound to the fitted TF-IDF matrix.
Xfeatures = sentiment_result.text
Xfeatures = tf.fit_transform(Xfeatures)
# Tabular view of the per-document term weights.
df_tfidf = TFIDF_word_weight(tf, Xfeatures)

In [None]:
# Export the TF-IDF weight table (without the index column).
df_tfidf.to_csv('output/tfidf_res.csv', index=False)

In [None]:
def jarak_euc(x,y):
    """Return the pairwise Euclidean distances between the rows of x and y.

    Thin wrapper around sklearn's euclidean_distances ("jarak" = distance).
    """
    distances = euclidean_distances(x, y)
    return distances

In [None]:
# Pairwise Euclidean distances of every document against every other one.
# NOTE(review): despite its name, `X_train` holds a distance matrix here; the
# name is rebound to a training split inside the cross-validation loops.
X_train = jarak_euc(Xfeatures,Xfeatures)
print(X_train)

In [None]:
# Export the distance matrix computed above to 'euc.csv' (no index column).
dd = pd.DataFrame(X_train)
dd.to_csv('euc.csv', index=False)

In [None]:
# K-fold cross-validation of the Modified K-NN classifier on TF-IDF features.
# Later cells read `pred`, `jarak`, `neigbor_index` and `X_test`, which hold
# the values of the LAST fold once the loop finishes.
k_value = 9 # 1 - 25
X = sentiment_result['text'].values
y = sentiment_result['Sentiment'].values
fold_i = 1
fold_n = 10 # 3 5 7 10
sum_accuracy = 0
kfold = KFold(fold_n, shuffle=True, random_state=42)
enc = LabelEncoder()
fol = []
acc, rc, pr, f1 = [], [], [], []

# `total` lets tqdm render a real progress bar; split() is a generator.
for train_index, test_index in tqdm(kfold.split(X), total=fold_n):
    fol.append(fold_i)
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]

    # Dump this fold's test text and training labels. Each fold overwrites
    # the previous one. Context managers flush and close the files, so later
    # cells that read them back see complete content.
    with open('output/ResultX.txt', 'w') as svf:
        sv_text = '\n'.join(str(item) for item in X_test).replace("   "," ")
        svf.write(sv_text)
    with open('output/y_train.txt', 'w') as svY:
        svY.write('\n'.join(str(item) for item in y_train))

    # TF-IDF is fitted on the training fold only, then applied to the test fold.
    tf = TfidfVectorizer(decode_error="replace")
    X_train = tf.fit_transform(X_train)
    X_test = tf.transform(X_test)

    y_train = enc.fit_transform(y_train)
    y_test = enc.transform(y_test)

    # Algorithm
    clf = ModifiedKNN(k_value)
    clf.fit(X_train, y_train)
    pred, jarak = clf.predict(X_test)
    neigbor_index = clf.get_neigbors(X_test)

    # Fold metrics, expressed as percentages.
    accuracy = accuracy_score(y_test, pred)*100
    precision = precision_score(y_test, pred)*100
    recall = recall_score(y_test, pred)*100
    f1_scores = f1_score(y_test, pred)*100

    sum_accuracy += accuracy
    # Back to the original string labels for the export cells below.
    pred = enc.inverse_transform(pred)

    fold_i += 1
    acc.append(accuracy)
    pr.append(precision)
    rc.append(recall)
    f1.append(f1_scores)

In [None]:
# Persist the artefacts of the most recent cross-validation fold: `pred`,
# `jarak` and `neigbor_index` are whatever the loop assigned last.
with open("output/MKNN_prediction.txt", "w") as f:
    mknn_predited_label ='\n'.join(str(item) for item in pred)
    f.write(mknn_predited_label)
with open('output/jarak_ttg.txt', 'w') as g:
    # Keep only the k_value smallest distances per test sample.
    # NOTE: this rebinds `jarak`, so the raw distances are lost after this cell.
    jarak = [nMin(k_value,map(float,i)) for i in jarak]
    mknn_distance = '\n'.join(str(ls) for ls in jarak)
    g.write(mknn_distance)
with open('output/index_ttg.txt', 'w') as j:
    j.write('\n'.join(str(a) for a in neigbor_index))

In [None]:
# Read the exported files back and align them row by row (index join) with
# the test-fold text.
knn_pred = pd.read_csv('output/MKNN_prediction.txt', names=['Sentiment'])
# sep='\t' keeps each bracketed distance list in a single column.
jarak_pred = pd.read_csv('output/jarak_ttg.txt', names=['Distance'], sep='\t')
text_test = pd.read_csv('output/ResultX.txt', names=['text'])
# NOTE(review): this uses the default ',' separator. If each line of
# index_ttg.txt contains commas, the row will not parse into one 'Neigbor'
# column - confirm the format written by get_neigbors.
index_pred = pd.read_csv('output/index_ttg.txt', names=['Neigbor'])
text_test = text_test.join(knn_pred)
text_test = text_test.join(jarak_pred)
text_test = text_test.join(index_pred)
#text_test['Sentiment'] = text_test['Sentiment'].apply(lambda x: 'Positive' if x == 1 else 'Negative')
# Drop rows where any of the joined columns is missing.
text_test = text_test.dropna()

In [None]:
# Preview the test-fold text together with its predictions.
text_test.head()

In [None]:
# NOTE(review): `X_test` was last rebound to the TF-IDF-transformed test fold,
# and `new_frame` is not used by any cell visible here - confirm it is needed.
new_frame = pd.DataFrame(X_test)
new_frame = new_frame.join(knn_pred)

# Summary over all folds: mean accuracy plus the best and worst fold.
avg_acc = sum_accuracy/fold_n
maxs = max(acc)
mins = min(acc)
# Per-fold metric table (values are percentages).
res_df = pd.DataFrame({'iterasi':fol, 'Accuracy': acc, 'Precison':pr, 'Recall':rc, 'f1 score':f1})
print("Avearge accuracy : ", str("%.4f" % avg_acc)+'%')
# acc.index() returns the first matching position; +1 converts it to a fold number.
print("Max Score : ",str(maxs),"in Fold : ", str(acc.index(maxs)+1))
print("Min Score : ",str(mins), "in Fold : ", str(acc.index(mins)+1))

In [None]:
# Show the per-fold metrics (up to 11 rows).
res_df.head(11)

In [None]:
import matplotlib.pyplot as plot

In [None]:
# Line chart of the four metrics across folds; the x axis is the row index
# of res_df.
res_df[['Accuracy', 'Precison', 'Recall', 'f1 score']].plot.line(title="Akurasi tiap Fold")
plot.show(block=True)

# Algorithm K-NN

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import f1_score, recall_score
from sklearn.neighbors import KNeighborsClassifier

In [127]:
# K-fold cross-validation of scikit-learn's K-NN classifier on TF-IDF
# features. Later cells read `pred` and `X_test`, which hold the values of
# the LAST fold once the loop finishes.
k_value = 9 # 1 - 25
X = sentiment_result['text'].values
y = sentiment_result['Sentiment'].values
fold_i = 1
fold_n = 10 # 3 5 7 10
sum_accuracy = 0
kfold = KFold(fold_n, shuffle=True, random_state=42)
enc = LabelEncoder()
fol = []
acc, rc, pr, f1 = [], [], [], []

for train_index, test_index in kfold.split(X):
    fol.append(fold_i)
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]

    # Dump this fold's test text and training labels. Each fold overwrites
    # the previous one. Context managers flush and close the files, so later
    # cells that read them back see complete content.
    with open('output/ResultX.txt', 'w') as svf:
        sv_text = '\n'.join(str(item) for item in X_test).replace("   "," ")
        svf.write(sv_text)
    with open('output/y_train.txt', 'w') as svY:
        svY.write('\n'.join(str(item) for item in y_train))

    # TF-IDF is fitted on the training fold only, then applied to the test fold.
    tf = TfidfVectorizer(decode_error="replace")
    X_train = tf.fit_transform(X_train)
    X_test = tf.transform(X_test)

    y_train = enc.fit_transform(y_train)
    y_test = enc.transform(y_test)

    # Algorithm
    clf = KNeighborsClassifier(k_value)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # Fold metrics, expressed as percentages.
    accuracy = accuracy_score(y_test, pred)*100
    precision = precision_score(y_test, pred)*100
    recall = recall_score(y_test, pred)*100
    f1_scores = f1_score(y_test, pred)*100

    sum_accuracy += accuracy
    # Back to the original string labels for the export cells below.
    pred = enc.inverse_transform(pred)

    fold_i += 1
    acc.append(accuracy)
    pr.append(precision)
    rc.append(recall)
    f1.append(f1_scores)

In [128]:
# Persist the predictions of the most recent K-NN fold.
# NOTE(review): this writes K-NN predictions to 'MKNN_prediction.txt', the
# same file the Modified K-NN section writes, so those results get overwritten.
# Confirm this is intended, or give the K-NN output its own file name.
with open("output/MKNN_prediction.txt", "w") as f:
    mknn_predited_label ='\n'.join(str(item) for item in pred)
    f.write(mknn_predited_label)

In [129]:
# Read the exported predictions and test-fold text back, then align them row
# by row (index join).
knn_pred = pd.read_csv('output/MKNN_prediction.txt', names=['Sentiment'])
text_test = pd.read_csv('output/ResultX.txt', names=['text'])
text_test = text_test.join(knn_pred)
#text_test['Sentiment'] = text_test['Sentiment'].apply(lambda x: 'Positive' if x == 1 else 'Negative')
# Drop rows where either the text or the prediction is missing.
text_test = text_test.dropna()

In [130]:
# Preview the test-fold text together with its K-NN prediction.
text_test.head()

Unnamed: 0,text,Sentiment
0,tema money heist iya sayanga,Negative
1,sebel banget deh nonton film film bagus netfli...,Negative
2,norak banget nonton money heist spanyol seru b...,Positive
3,scene cctv polisi series money heist,Negative
4,senin baca review money heist versi korea pena...,Negative


In [131]:
# NOTE(review): `X_test` was last rebound to the TF-IDF-transformed test fold,
# and `new_frame` is not used by any cell visible here - confirm it is needed.
new_frame = pd.DataFrame(X_test)
new_frame = new_frame.join(knn_pred)

# Summary over all folds: mean accuracy plus the best and worst fold.
avg_acc = sum_accuracy/fold_n
maxs = max(acc)
mins = min(acc)
# Per-fold metric table (values are percentages).
res_df = pd.DataFrame({'iterasi':fol, 'Accuracy': acc, 'Precison':pr, 'Recall':rc, 'f1 score':f1})
print("Avearge accuracy : ", str("%.4f" % avg_acc)+'%')
# acc.index() returns the first matching position; +1 converts it to a fold number.
print("Max Score : ",str(maxs),"in Fold : ", str(acc.index(maxs)+1))
print("Min Score : ",str(mins), "in Fold : ", str(acc.index(mins)+1))

Avearge accuracy :  79.0500%
Max Score :  85.0 in Fold :  2
Min Score :  75.0 in Fold :  1


In [132]:
# Save the per-fold K-NN metrics to 'hs.txt' (CSV content despite the
# extension), then display up to 11 rows.
res_df.to_csv('hs.txt', index=False)
res_df.head(11)

Unnamed: 0,iterasi,Accuracy,Precison,Recall,f1 score
0,1,75.0,55.555556,28.301887,37.5
1,2,85.0,78.378378,56.862745,65.909091
2,3,78.0,80.0,28.571429,42.105263
3,4,80.5,83.333333,42.372881,56.179775
4,5,79.5,68.965517,38.461538,49.382716
5,6,76.0,74.074074,32.786885,45.454545
6,7,80.0,80.0,41.37931,54.545455
7,8,75.5,64.705882,37.288136,47.311828
8,9,84.0,90.322581,49.122807,63.636364
9,10,77.0,55.882353,38.0,45.238095
