# Data Test Preprocessing

In [1]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from nltk import word_tokenize, sent_tokenize
from nltk import FreqDist
from nltk.corpus import stopwords
from wordcloud import WordCloud, ImageColorGenerator
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

ind = stopwords.words('indonesian')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data_test = pd.read_csv('data/tweet_test.csv')
data_test['Datetime'] = pd.to_datetime(data_test['Datetime'])
data_test['Datetime'] = data_test['Datetime'].dt.date
data_test

Unnamed: 0,Datetime,Tweet Id,Text,Username
0,2022-04-02,1510366219278331905,mbanking bca error ya?,lostgameee
1,2022-04-02,1510293394329063424,Apa mbanking bca masih error?,cherrypunyarasa
2,2022-04-02,1510289897881411584,@mecathee Mbanking bca punyaku baik2 saja dari...,andontknoww
3,2022-04-02,1510274838211878917,@tommyxap Mbanking bca sama sopi bbrp org dari...,faithfullysour
4,2022-04-02,1510273928240918537,"Nasib gapunya kartu, cardless terus sih enak e...",ethernalski
...,...,...,...,...
709,2022-04-13,1514313433184817152,mbanking bca eror kahhhh,saudadee_p
710,2022-04-13,1514313050677194752,hadeuh mbanking bca erorr 🙂,Sanielyong
711,2022-04-13,1514312872628658178,"Mbanking bca maunye ape sih, mau depo kan gabisa",platinageserr
712,2022-04-13,1514312779104423936,Mbanking BCA error terussss. Heran gw @HaloBCA,Rahmaniya__


In [3]:
df_processed = data_test.copy()

In [4]:
# Lower

df_processed['Text'] = df_processed['Text'].apply(lambda x : x.lower())
df_processed.head()

Unnamed: 0,Datetime,Tweet Id,Text,Username
0,2022-04-02,1510366219278331905,mbanking bca error ya?,lostgameee
1,2022-04-02,1510293394329063424,apa mbanking bca masih error?,cherrypunyarasa
2,2022-04-02,1510289897881411584,@mecathee mbanking bca punyaku baik2 saja dari...,andontknoww
3,2022-04-02,1510274838211878917,@tommyxap mbanking bca sama sopi bbrp org dari...,faithfullysour
4,2022-04-02,1510273928240918537,"nasib gapunya kartu, cardless terus sih enak e...",ethernalski


In [5]:
# mengubah kalimat menjadi kata

df_processed['Text'] = df_processed['Text'].apply(lambda x : word_tokenize(x))
df_processed['Text']

0                          [mbanking, bca, error, ya, ?]
1                  [apa, mbanking, bca, masih, error, ?]
2      [@, mecathee, mbanking, bca, punyaku, baik2, s...
3      [@, tommyxap, mbanking, bca, sama, sopi, bbrp,...
4      [nasib, gapunya, kartu, ,, cardless, terus, si...
                             ...                        
709                        [mbanking, bca, eror, kahhhh]
710                    [hadeuh, mbanking, bca, erorr, 🙂]
711    [mbanking, bca, maunye, ape, sih, ,, mau, depo...
712    [mbanking, bca, error, terussss, ., heran, gw,...
713    [mbanking, bca, kalo, lg, di, butuhin, knp, ga...
Name: Text, Length: 714, dtype: object

In [6]:
# Hapus stopwords indonesia

def removeStopWords(text):
    words = []
    for word in text:
        if word not in ind:
            words.append(word)
    return words

df_processed['Text'] = df_processed['Text'].apply(lambda x : removeStopWords(x))
df_processed['Text']

0                          [mbanking, bca, error, ya, ?]
1                              [mbanking, bca, error, ?]
2      [@, mecathee, mbanking, bca, punyaku, baik2, k...
3      [@, tommyxap, mbanking, bca, sopi, bbrp, org, ...
4      [nasib, gapunya, kartu, ,, cardless, sih, enak...
                             ...                        
709                        [mbanking, bca, eror, kahhhh]
710                    [hadeuh, mbanking, bca, erorr, 🙂]
711    [mbanking, bca, maunye, ape, sih, ,, depo, gab...
712    [mbanking, bca, error, terussss, ., heran, gw,...
713    [mbanking, bca, kalo, lg, butuhin, knp, ganggu...
Name: Text, Length: 714, dtype: object

In [7]:
def hapusSimbol(sentence): #tokenize first
    words = []
    for word in sentence:
        if word.isalpha():
            words.append(word)
    return words

df_processed['Text'] = df_processed['Text'].apply(lambda x : hapusSimbol(x))
df_processed['Text']

0                             [mbanking, bca, error, ya]
1                                 [mbanking, bca, error]
2            [mecathee, mbanking, bca, punyaku, kemarin]
3      [tommyxap, mbanking, bca, sopi, bbrp, org, kmr...
4      [nasib, gapunya, kartu, cardless, sih, enak, e...
                             ...                        
709                        [mbanking, bca, eror, kahhhh]
710                       [hadeuh, mbanking, bca, erorr]
711      [mbanking, bca, maunye, ape, sih, depo, gabisa]
712    [mbanking, bca, error, terussss, heran, gw, ha...
713    [mbanking, bca, kalo, lg, butuhin, knp, ganggu...
Name: Text, Length: 714, dtype: object

In [8]:
# Stemming

lemmatizer_obj = WordNetLemmatizer()
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemm(data):
    words = []
    for word in data:
        stm = stemmer.stem(word)
        words.append(stm)
    return words

df_processed['Text'] = df_processed['Text'].apply(lambda x: [stemmer.stem(y) for y in x])
df_processed['Text']

0                             [mbanking, bca, error, ya]
1                                 [mbanking, bca, error]
2              [mecathee, mbanking, bca, punya, kemarin]
3      [tommyxap, mbanking, bca, sopi, bbrp, org, kmr...
4      [nasib, gapunya, kartu, cardless, sih, enak, e...
                             ...                        
709                        [mbanking, bca, eror, kahhhh]
710                       [hadeuh, mbanking, bca, erorr]
711      [mbanking, bca, maunye, ape, sih, depo, gabisa]
712    [mbanking, bca, error, terussss, heran, gw, ha...
713    [mbanking, bca, kalo, lg, butuhin, knp, ganggu...
Name: Text, Length: 714, dtype: object

In [9]:
kata_perlu_dihapus = ['mbanking','bca', 'mobile', 'https', 'halobca']
# kata_perlu_dihapus = ['https', 'halobca']

def hapus_kata_kosong(sentence):
    words = []
    for word in sentence:
        if word not in kata_perlu_dihapus :
            words.append(word)
    return words

df_processed['Text'] = df_processed['Text'].apply(lambda x : hapus_kata_kosong(x))
df_processed['Text']

0                                            [error, ya]
1                                                [error]
2                             [mecathee, punya, kemarin]
3                [tommyxap, sopi, bbrp, org, kmrn, eror]
4      [nasib, gapunya, kartu, cardless, sih, enak, e...
                             ...                        
709                                       [eror, kahhhh]
710                                      [hadeuh, erorr]
711                     [maunye, ape, sih, depo, gabisa]
712                         [error, terussss, heran, gw]
713                 [kalo, lg, butuhin, knp, ganggu, si]
Name: Text, Length: 714, dtype: object

In [10]:
# menghapus kata duplikat pada 1 dokumen

df_processed['Text'] = [ set(w) for w in df_processed['Text']]
df_processed['Text']

0                                            {error, ya}
1                                                {error}
2                             {kemarin, mecathee, punya}
3                {sopi, bbrp, eror, kmrn, org, tommyxap}
4      {sih, cardless, enak, ganggu, telek, sebenerny...
                             ...                        
709                                       {kahhhh, eror}
710                                      {hadeuh, erorr}
711                     {ape, sih, maunye, depo, gabisa}
712                         {terussss, gw, error, heran}
713                 {ganggu, butuhin, si, knp, lg, kalo}
Name: Text, Length: 714, dtype: object

In [11]:
# Hapus kata kurang dari sama dengan 3 huruf

def hapusKata3huruf(sentence): #tokenize first
    words = []
    for word in sentence:
        if len(word) > 3 :
            words.append(word)
    return words

df_processed['Text'] = df_processed['Text'].apply(lambda x : hapusKata3huruf(x))
df_processed['Text']

0                                                [error]
1                                                [error]
2                             [kemarin, mecathee, punya]
3                     [sopi, bbrp, eror, kmrn, tommyxap]
4      [cardless, enak, ganggu, telek, sebenernya, ka...
                             ...                        
709                                       [kahhhh, eror]
710                                      [hadeuh, erorr]
711                               [maunye, depo, gabisa]
712                             [terussss, error, heran]
713                              [ganggu, butuhin, kalo]
Name: Text, Length: 714, dtype: object

In [12]:
# memeriksa data kosong akibat cleaning
dataKosong = 0
for i in df_processed['Text']:
    if len(i)==0:
        dataKosong +=1
print(dataKosong)

21


In [13]:
# menghapus data yang kosong akibat cleaning

for idx in range(len(df_processed['Text'])):
    if len(df_processed['Text'].loc[idx]) <= 1:
        df_processed = df_processed.drop(index=idx)
df_processed

Unnamed: 0,Datetime,Tweet Id,Text,Username
2,2022-04-02,1510289897881411584,"[kemarin, mecathee, punya]",andontknoww
3,2022-04-02,1510274838211878917,"[sopi, bbrp, eror, kmrn, tommyxap]",faithfullysour
4,2022-04-02,1510273928240918537,"[cardless, enak, ganggu, telek, sebenernya, ka...",ethernalski
5,2022-04-02,1510263968157552640,"[mulu, eror]",heraaaaah
6,2022-04-02,1510254577513566208,"[jadul, syariah, update, lagipula, tampilanya,...",WahyuAdhiPR
...,...,...,...,...
709,2022-04-13,1514313433184817152,"[kahhhh, eror]",saudadee_p
710,2022-04-13,1514313050677194752,"[hadeuh, erorr]",Sanielyong
711,2022-04-13,1514312872628658178,"[maunye, depo, gabisa]",platinageserr
712,2022-04-13,1514312779104423936,"[terussss, error, heran]",Rahmaniya__


In [14]:
df_processed.reset_index(inplace=True, drop=True)

In [15]:
# memeriksa data kosong akibat cleaning
dataKosong = 0
for i in df_processed['Text']:
    if len(i)==0:
        dataKosong +=1
print(dataKosong)

0


## Labellin Lexicon

In [16]:
# import kamus lexicon

df_lc = pd.read_csv('../Data baru/Lexicon/modified_full_lexicon.csv') 
df_lc

Unnamed: 0,word,weight,number_of_words
0,hai,3,1
1,merekam,2,1
2,ekstensif,3,1
3,paripurna,1,1
4,detail,2,1
...,...,...,...
10245,sontoloyo,-5,1
10246,tai,-5,1
10247,telek,-5,1
10248,tolol,-5,1


In [17]:
def hitung_polaritas(sentence):
    sentiment_score = 0
    for word in sentence:
        score = df_lc[df_lc['word']==word]['weight'].sum()
#         print(score)
        sentiment_score += score
    return sentiment_score

df_processed['sentiment_score'] = df_processed['Text'].apply(lambda x : hitung_polaritas(x))
df_processed

Unnamed: 0,Datetime,Tweet Id,Text,Username,sentiment_score
0,2022-04-02,1510289897881411584,"[kemarin, mecathee, punya]",andontknoww,3
1,2022-04-02,1510274838211878917,"[sopi, bbrp, eror, kmrn, tommyxap]",faithfullysour,-4
2,2022-04-02,1510273928240918537,"[cardless, enak, ganggu, telek, sebenernya, ka...",ethernalski,-8
3,2022-04-02,1510263968157552640,"[mulu, eror]",heraaaaah,-3
4,2022-04-02,1510254577513566208,"[jadul, syariah, update, lagipula, tampilanya,...",WahyuAdhiPR,0
...,...,...,...,...,...
562,2022-04-13,1514313433184817152,"[kahhhh, eror]",saudadee_p,-4
563,2022-04-13,1514313050677194752,"[hadeuh, erorr]",Sanielyong,0
564,2022-04-13,1514312872628658178,"[maunye, depo, gabisa]",platinageserr,0
565,2022-04-13,1514312779104423936,"[terussss, error, heran]",Rahmaniya__,-4


In [18]:
# klasifikasi dari sentimen score

def lc_classifier(score):
    if score >= 0:
        return 1
    elif score < 0:
        return -1

df_processed['class'] = df_processed['sentiment_score'].apply(lambda x : lc_classifier(x))
df_processed

Unnamed: 0,Datetime,Tweet Id,Text,Username,sentiment_score,class
0,2022-04-02,1510289897881411584,"[kemarin, mecathee, punya]",andontknoww,3,1
1,2022-04-02,1510274838211878917,"[sopi, bbrp, eror, kmrn, tommyxap]",faithfullysour,-4,-1
2,2022-04-02,1510273928240918537,"[cardless, enak, ganggu, telek, sebenernya, ka...",ethernalski,-8,-1
3,2022-04-02,1510263968157552640,"[mulu, eror]",heraaaaah,-3,-1
4,2022-04-02,1510254577513566208,"[jadul, syariah, update, lagipula, tampilanya,...",WahyuAdhiPR,0,1
...,...,...,...,...,...,...
562,2022-04-13,1514313433184817152,"[kahhhh, eror]",saudadee_p,-4,-1
563,2022-04-13,1514313050677194752,"[hadeuh, erorr]",Sanielyong,0,1
564,2022-04-13,1514312872628658178,"[maunye, depo, gabisa]",platinageserr,0,1
565,2022-04-13,1514312779104423936,"[terussss, error, heran]",Rahmaniya__,-4,-1


In [19]:
df_processed['class'].value_counts()

-1    363
 1    204
Name: class, dtype: int64

In [62]:
df_new = df_processed.copy()

In [38]:
df_new = pd.concat([df_processed[df_processed['sentiment_score'] > 0 ],
                   df_processed[df_processed['sentiment_score'] < -5 ]], axis=0)
df_new.reset_index(drop=True)

Unnamed: 0,Datetime,Tweet Id,Text,Username,sentiment_score,class
0,2022-04-02,1510289897881411584,"[kemarin, mecathee, punya]",andontknoww,3,1
1,2022-04-02,1510114192551936004,"[november, tanggal, tiket, jual, sesuai, justi...",NikenWP,5,1
2,2022-04-02,1510110820599951361,"[milik, dunia, bisik, baik, angin, golong, ren...",MoelyonovDjalil,1,1
3,2022-04-02,1510104201740783616,"[saudi, iran, persipura, ramadan, perintah, ke...",tribunpontianak,5,1
4,2022-04-02,1510094736094162951,"[tipu, maksimal, tiket, open, jastip, biodata,...",namjukoo,4,1
...,...,...,...,...,...,...
263,2022-04-13,1514316400974004224,"[ngeselin, error, yaaa]",icetealover19,-10,-1
264,2022-04-13,1514315989231423492,"[huhu, hadeeeeh, error, jelek, jaring]",lemylens,-7,-1
265,2022-04-13,1514315221870936065,"[ganggu, gagal, nyaman, depo, akibat, error]",agusolot999,-11,-1
266,2022-04-13,1514314629983670275,"[depo, error, gagal]",Riki82542518,-9,-1


## Mengelompokkan data berdasarkan hari

In [41]:
df_new.reset_index(drop=True, inplace=True)

In [42]:
df_new.head()

Unnamed: 0,Datetime,Tweet Id,Text,Username,sentiment_score,class
0,2022-04-02,1510289897881411584,"[kemarin, mecathee, punya]",andontknoww,3,1
1,2022-04-02,1510114192551936004,"[november, tanggal, tiket, jual, sesuai, justi...",NikenWP,5,1
2,2022-04-02,1510110820599951361,"[milik, dunia, bisik, baik, angin, golong, ren...",MoelyonovDjalil,1,1
3,2022-04-02,1510104201740783616,"[saudi, iran, persipura, ramadan, perintah, ke...",tribunpontianak,5,1
4,2022-04-02,1510094736094162951,"[tipu, maksimal, tiket, open, jastip, biodata,...",namjukoo,4,1


In [43]:
twt_day = df_new.groupby('Datetime').agg({'class': 'sum'}).rename(columns={'class': 'sentimen_score'}).reset_index()
def lc_classifier(score):
    if score >= 0:
        return 1
    elif score < 0:
        return 0

twt_day['sentimen'] = twt_day['sentimen_score'].apply(lambda x : lc_classifier(x))
twt_day

Unnamed: 0,Datetime,sentimen_score,sentimen
0,2022-04-02,-11,0
1,2022-04-03,13,1
2,2022-04-04,-3,0
3,2022-04-05,-11,0
4,2022-04-06,10,1
5,2022-04-07,2,1
6,2022-04-08,2,1
7,2022-04-09,-14,0
8,2022-04-10,23,1
9,2022-04-11,-2,0


# Testing Data 

In [44]:
# mengubah dari set/list ke text

df_new['Text'] = [ " ".join(data) for data in df_new['Text']]
df_new['Text']

0                                 kemarin mecathee punya
1      november tanggal tiket jual sesuai justiceworl...
2      milik dunia bisik baik angin golong rendah gru...
3      saudi iran persipura ramadan perintah keras ma...
4      tipu maksimal tiket open jastip biodata konser...
                             ...                        
263                                  ngeselin error yaaa
264                     huhu hadeeeeh error jelek jaring
265                ganggu gagal nyaman depo akibat error
266                                     depo error gagal
267      bisa wkwkwkwkwkkw listrik pulsa abis yeee error
Name: Text, Length: 268, dtype: object

In [45]:
df_new.head()

Unnamed: 0,Datetime,Tweet Id,Text,Username,sentiment_score,class
0,2022-04-02,1510289897881411584,kemarin mecathee punya,andontknoww,3,1
1,2022-04-02,1510114192551936004,november tanggal tiket jual sesuai justiceworl...,NikenWP,5,1
2,2022-04-02,1510110820599951361,milik dunia bisik baik angin golong rendah gru...,MoelyonovDjalil,1,1
3,2022-04-02,1510104201740783616,saudi iran persipura ramadan perintah keras ma...,tribunpontianak,5,1
4,2022-04-02,1510094736094162951,tipu maksimal tiket open jastip biodata konser...,namjukoo,4,1


In [46]:
feature_test = df_new['Text'].values.tolist()

In [47]:
len(feature_test)

268

## Load TFIDF

In [48]:
import pickle

# Load TFIDF Vectorizer
filename = 'pickles/tfidf_ngram2.sav'
tv = pickle.load(open(filename, 'rb'))

In [49]:
tfid_test = tv.transform(feature_test)
print(tfid_test.shape)

(268, 850)


## Load Model

In [50]:
from sklearn.metrics import confusion_matrix,roc_auc_score, accuracy_score, classification_report, recall_score, precision_score, f1_score

In [51]:
import pickle

# Load Model
filename = 'pickles/sentimen_svm_95.sav'
svc = pickle.load(open(filename, 'rb'))

In [52]:
y_predict = svc.predict(tfid_test)

print('SVM : ', accuracy_score(df_new['class'],y_predict))
print(confusion_matrix(df_new['class'],y_predict))
print()
print(classification_report(df_new['class'],y_predict))

SVM :  0.9626865671641791
[[132   5]
 [  5 126]]

              precision    recall  f1-score   support

          -1       0.96      0.96      0.96       137
           1       0.96      0.96      0.96       131

    accuracy                           0.96       268
   macro avg       0.96      0.96      0.96       268
weighted avg       0.96      0.96      0.96       268



In [53]:
df_new['sentimen_test'] = y_predict
df_new

Unnamed: 0,Datetime,Tweet Id,Text,Username,sentiment_score,class,sentimen_test
0,2022-04-02,1510289897881411584,kemarin mecathee punya,andontknoww,3,1,1
1,2022-04-02,1510114192551936004,november tanggal tiket jual sesuai justiceworl...,NikenWP,5,1,1
2,2022-04-02,1510110820599951361,milik dunia bisik baik angin golong rendah gru...,MoelyonovDjalil,1,1,1
3,2022-04-02,1510104201740783616,saudi iran persipura ramadan perintah keras ma...,tribunpontianak,5,1,1
4,2022-04-02,1510094736094162951,tipu maksimal tiket open jastip biodata konser...,namjukoo,4,1,1
...,...,...,...,...,...,...,...
263,2022-04-13,1514316400974004224,ngeselin error yaaa,icetealover19,-10,-1,-1
264,2022-04-13,1514315989231423492,huhu hadeeeeh error jelek jaring,lemylens,-7,-1,-1
265,2022-04-13,1514315221870936065,ganggu gagal nyaman depo akibat error,agusolot999,-11,-1,-1
266,2022-04-13,1514314629983670275,depo error gagal,Riki82542518,-9,-1,-1


# HASIL TEST SENTIMEN

In [54]:
test_sentimen = df_new.groupby('Datetime').agg({'sentimen_test': 'sum'}).rename(columns={'sentimen_test': 'sentimen_score'}).reset_index()
def lc_classifier(score):
    if score >= 0:
        return 1
    elif score < 0:
        return 0

test_sentimen['sentimen'] = test_sentimen['sentimen_score'].apply(lambda x : lc_classifier(x))
test_sentimen

Unnamed: 0,Datetime,sentimen_score,sentimen
0,2022-04-02,-9,0
1,2022-04-03,15,1
2,2022-04-04,-3,0
3,2022-04-05,-11,0
4,2022-04-06,6,1
5,2022-04-07,2,1
6,2022-04-08,2,1
7,2022-04-09,-14,0
8,2022-04-10,23,1
9,2022-04-11,-2,0


# SENTIMEN LEXICON (AWAL)

In [55]:
lxc_sentimen = df_new.groupby('Datetime').agg({'class': 'sum'}).rename(columns={'class': 'sentimen_score'}).reset_index()
def lc_classifier(score):
    if score >= 0:
        return 1
    elif score < 0:
        return 0

lxc_sentimen['sentimen'] = lxc_sentimen['sentimen_score'].apply(lambda x : lc_classifier(x))
lxc_sentimen

Unnamed: 0,Datetime,sentimen_score,sentimen
0,2022-04-02,-11,0
1,2022-04-03,13,1
2,2022-04-04,-3,0
3,2022-04-05,-11,0
4,2022-04-06,10,1
5,2022-04-07,2,1
6,2022-04-08,2,1
7,2022-04-09,-14,0
8,2022-04-10,23,1
9,2022-04-11,-2,0


In [56]:
print('SVM : ', accuracy_score(lxc_sentimen['sentimen'],test_sentimen['sentimen']))

SVM :  1.0


# Testing gabungan data sentimen & saham

In [57]:
data_test = pd.read_csv('data/testing_data.csv')
data_test['Date'] = pd.to_datetime(data_test['Date'])
data_test

Unnamed: 0,Date,Close,label,open_Equal,open_Positive,open_Negative,High_Equal,High_Positive,High_Negative,Low_Equal,Low_Negative,Low_Positive
0,2022-04-02,7900.0,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,2022-04-03,7900.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,2022-04-04,7900.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,2022-04-05,7900.0,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,2022-04-06,7750.0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
5,2022-04-07,7750.0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
6,2022-04-08,7850.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7,2022-04-09,7808.333333,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
8,2022-04-10,7766.666667,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
9,2022-04-11,7725.0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [61]:
data_test['sentimen'] = test_sentimen['sentimen'][:-1]
data_test

Unnamed: 0,Date,Close,label,open_Equal,open_Positive,open_Negative,High_Equal,High_Positive,High_Negative,Low_Equal,Low_Negative,Low_Positive,sentimen
0,2022-04-02,7900.0,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
1,2022-04-03,7900.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
2,2022-04-04,7900.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
3,2022-04-05,7900.0,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
4,2022-04-06,7750.0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
5,2022-04-07,7750.0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1
6,2022-04-08,7850.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
7,2022-04-09,7808.333333,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0
8,2022-04-10,7766.666667,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
9,2022-04-11,7725.0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0


In [62]:
data_test

Unnamed: 0,Date,Close,label,open_Equal,open_Positive,open_Negative,High_Equal,High_Positive,High_Negative,Low_Equal,Low_Negative,Low_Positive,sentimen
0,2022-04-02,7900.0,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
1,2022-04-03,7900.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
2,2022-04-04,7900.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
3,2022-04-05,7900.0,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
4,2022-04-06,7750.0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
5,2022-04-07,7750.0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1
6,2022-04-08,7850.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
7,2022-04-09,7808.333333,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0
8,2022-04-10,7766.666667,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
9,2022-04-11,7725.0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0


In [63]:
import pickle

# Load Model gabungan
filename = 'pickles/movement_model_knn78.sav'
knn = pickle.load(open(filename, 'rb'))

In [75]:
X = data_test[['sentimen', 'Close', 'open_Positive', 'open_Equal', 
               'open_Negative','High_Positive', 'High_Equal', 'High_Negative', 'Low_Equal',
               'Low_Negative', 'Low_Positive']]
y = data_test['label']

In [76]:
X['sentimen'] = X['sentimen'].astype('float')
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   sentimen       13 non-null     float64
 1   Close          13 non-null     float64
 2   open_Positive  13 non-null     float64
 3   open_Equal     13 non-null     float64
 4   open_Negative  13 non-null     float64
 5   High_Positive  13 non-null     float64
 6   High_Equal     13 non-null     float64
 7   High_Negative  13 non-null     float64
 8   Low_Equal      13 non-null     float64
 9   Low_Negative   13 non-null     float64
 10  Low_Positive   13 non-null     float64
dtypes: float64(11)
memory usage: 1.2 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sentimen'] = X['sentimen'].astype('float')


In [77]:
y

0     0
1     1
2     1
3     1
4     0
5     1
6     1
7     0
8     0
9     0
10    1
11    1
12    0
Name: label, dtype: int64

In [78]:
y_predict

array(['Raise', 'Raise', 'Raise', 'Raise', 'Raise', 'Raise', 'Raise',
       'Raise', 'Raise', 'Raise', 'Raise', 'Raise', 'Raise'], dtype=object)

In [79]:
y_predict = knn.predict(X)
y_pred = pd.Series(y_predict)
y_pred = y_pred.apply(lambda x: 1 if x == 'Raise' else 0 )

print('KNN : ', accuracy_score(y,y_pred))
print(confusion_matrix(y,y_pred))
print()
print(classification_report(y,y_pred))

KNN :  0.6153846153846154
[[6 0]
 [5 2]]

              precision    recall  f1-score   support

           0       0.55      1.00      0.71         6
           1       1.00      0.29      0.44         7

    accuracy                           0.62        13
   macro avg       0.77      0.64      0.58        13
weighted avg       0.79      0.62      0.57        13



In [80]:
from sklearn.metrics import roc_auc_score
y_proba = knn.predict_proba(X)[:,1]
y_proba

array([0.  , 0.  , 0.  , 0.  , 0.25, 0.25, 0.5 , 0.5 , 0.25, 0.25, 0.75,
       0.75, 0.5 ])

In [81]:
print('auc', roc_auc_score(y,y_proba))

auc 0.49999999999999994


hasil testing sentimen per document = 94%

Hasil testing sentimen setelah di kelompok per hari mendapatkan akurasi = 100%

hasil testing gabungan data saham dan sentimen = 61%

to do list:
- perhatikan urutan kolom input untuk testing harus sama dengan kolom ketika training
- 