<a href="https://colab.research.google.com/github/B21-CAP0133/verify-android-app/blob/master/ML-dir/04_VERIFY_Build_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from matplotlib import pyplot as plt
import seaborn as sns
import pickle
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score

In [4]:
df = pd.read_csv('verify_dataset_clean.csv')
df.sample(5)

Unnamed: 0,Judul,Label
2594,google translate jokowi thu khang thi phu,1
1622,kursi gbk rusak parah pasca laga indonesia vs ...,0
3053,sdh legal maksiat,1
4208,jelas menko polhukam isu politik kini,1
2618,hebat polisi malaysia,1


In [5]:
x_train_text, x_test_text, y_train, y_test = train_test_split(df['Judul'], df['Label'], test_size=0.1)

In [6]:
transformer = TfidfVectorizer(
    ngram_range = (1, 2), 
    max_features = None,
    max_df = 0.5,
    use_idf = True,
    norm = 'l2'
    )

In [19]:
x_train = transformer.fit_transform(x_train_text)
x_test = transformer.transform(x_test_text)

In [20]:
x_train.shape, x_test.shape

((6549, 41464), (728, 41464))

In [9]:
len(y_train), len(y_test)

(6549, 728)

In [10]:
verify_logreg = LogisticRegression(
    C = 774.2636826811278, 
    solver='saga', 
    penalty = 'l1',
    max_iter = 5000
    )

In [11]:
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=17
    )

In [None]:
cv_results = cross_val_score(
    verify_logreg, 
    x_train, 
    y_train, 
    cv=skf, 
    scoring='accuracy'
    )

In [13]:
cv_results, cv_results.mean()

In [14]:
verify_logreg.fit(x_train, y_train)

LogisticRegression(C=774.2636826811278, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=5000, multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
test_preds = verify_logreg.predict(x_test)

In [22]:
dfP = pd.DataFrame(x_test_text, columns=['Judul'])
dfP['Predict'] = test_preds
dfP['True'] = y_test

In [23]:
dfP.head()

Unnamed: 0,Judul,Predict,True
4853,foto aliansi kerja panjang sudirmankuningan,1,1
845,yang tau di lengan kanan bagi atas seragam ten...,1,1
4058,umat islam new zealand bakar gereja,1,1
5515,menpora dukung batal kirim atlet china masters,1,0
4464,uang pecah rp 200 ribu resmi luncur,1,1


In [40]:
def my_metrics(y_true, y_pred):
    accuracy=accuracy_score(y_true, y_pred)
    precision=precision_score(y_true, y_pred,average='weighted')
    f1Score=f1_score(y_true, y_pred, average='weighted') 
    print("Accuracy  : {}".format(accuracy))
    print("Precision : {}".format(precision))
    print("f1Score : {}".format(f1Score))
    
    return accuracy, precision, f1Score

my_metrics(y_test, test_preds)

Accuracy  : 0.8104395604395604
Precision : 0.8099377215721315
f1Score : 0.8101323696853504


(0.8104395604395604, 0.8099377215721315, 0.8101323696853504)

In [25]:
confusion_matrix = pd.crosstab(dfP['True'], dfP['Predict'], rownames=['Actual'], colnames=['Predicted'])
print(confusion_matrix)
#sns.heatmap(confusion_matrix, annot=True)
#plt.show()

Predicted    0    1
Actual             
0          226   72
1           66  364


In [26]:
verify_logreg_model = 'verify_logreg_model.sav'
pickle.dump(verify_logreg, open(verify_logreg_model, 'wb'))

In [27]:
predict_proba = verify_logreg.predict_proba(x_test)

In [28]:
i = 0
hoax_prob_long = []
for hoax in predict_proba:
  hoax_prob_long.append(predict_proba[i,1]*100)
  i+=1

hoax_prob = []
for j in hoax_prob_long:
  hoax_prob.append('{:.1f}'.format(j))

In [29]:
dfP['Hoax Probaility'] = hoax_prob

In [30]:
dfP.head()

Unnamed: 0,Judul,Predict,True,Hoax Probaility
4853,foto aliansi kerja panjang sudirmankuningan,1,1,99.7
845,yang tau di lengan kanan bagi atas seragam ten...,1,1,99.8
4058,umat islam new zealand bakar gereja,1,1,99.6
5515,menpora dukung batal kirim atlet china masters,1,0,86.3
4464,uang pecah rp 200 ribu resmi luncur,1,1,99.1


In [31]:
dfP.loc[dfP['True'] == 1, 
       'True'] = 'HOAX'
dfP.loc[dfP['True'] == 0, 
       'True'] = 'BUKAN HOAX'

In [32]:
dfP_final = dfP.drop(columns=['Predict'])

In [33]:
dfP_final.columns = ['Judul', 'Actual', 'Hoax Probability']

dfP_final = dfP_final[['Judul', 'Hoax Probability', 'Actual']]

dfP_final.head()

Unnamed: 0,Judul,Hoax Probability,Actual
4853,foto aliansi kerja panjang sudirmankuningan,99.7,HOAX
845,yang tau di lengan kanan bagi atas seragam ten...,99.8,HOAX
4058,umat islam new zealand bakar gereja,99.6,HOAX
5515,menpora dukung batal kirim atlet china masters,86.3,BUKAN HOAX
4464,uang pecah rp 200 ribu resmi luncur,99.1,HOAX


In [34]:
dfP_final.to_csv('hasil_verify_logreg_model.csv', index=False)