In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from matplotlib import pyplot as plt
import seaborn as sns
import pickle
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score

In [2]:
df = pd.read_csv('verify_dataset_clean.csv')
df.sample(5)

Unnamed: 0,Judul,Label
1559,nomor porsi haji berangkat tahun 2018 hingga 2030,1
6585,aniaya polisi anggota dprd sumut 16 teman tangkap,0
4151,video detikdetik lahir 17 anak kembar,1
5418,iniesta sebut xavi pilih baik barcelona,0
5771,guardiola tak senang rashford absen mu vs man ...,0


In [3]:
x_train_text, x_test_text, y_train, y_test = train_test_split(df['Judul'], df['Label'], test_size=0.1)

In [4]:
vectorizer = CountVectorizer(
    ngram_range = (1, 2),
    decode_error = 'replace',
    max_df = 0.5,
    max_features = None
)

pickle.dump(vectorizer.vocabulary, open('fitur.pkl', 'wb'))

In [5]:
transformer = TfidfTransformer(
    norm = 'l2',
    use_idf = True
)

In [6]:
#loaded_vec = CountVectorizer(decode_error="replace",vocabulary=pickle.load(open("fitur.pkl", "rb")))

x_train = transformer.fit_transform(vectorizer.fit_transform(np.array(x_train_text)))
x_test = transformer.transform(vectorizer.transform(np.array(x_test_text)))

In [7]:
x_train.shape, x_test.shape

((6549, 41554), (728, 41554))

In [8]:
len(y_train), len(y_test)

(6549, 728)

In [9]:
verify_logreg = LogisticRegression(
    C = 774.2636826811278, 
    solver='saga', 
    penalty = 'l1',
    max_iter = 5000
    )

In [10]:
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=17
    )

In [11]:
cv_results = cross_val_score(
    verify_logreg, 
    x_train, 
    y_train, 
    cv=skf, 
    scoring='accuracy'
    )

In [12]:
cv_results, cv_results.mean()

(array([0.81603053, 0.81984733, 0.8129771 , 0.81832061, 0.82352941]),
 0.8181409968567579)

In [13]:
verify_logreg.fit(x_train, y_train)

LogisticRegression(C=774.2636826811278, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=5000, multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
test_preds = verify_logreg.predict(x_test)

In [15]:
dfP = pd.DataFrame(x_test_text, columns=['Judul'])
dfP['Predict'] = test_preds
dfP['True'] = y_test

In [16]:
dfP.head()

Unnamed: 0,Judul,Predict,True
5913,gempa magnitudo 58 sigi tak potensi tsunami,0,0
5164,32 rekening milik presiden joko widodo istri 2...,1,1
603,polisi ntb bantah ada kabar ampo kopang pasca ...,0,0
3992,foto formasi 2019gantipresiden jasmerahmob umm,1,1
2221,flyer jalan ganggu siap surat terang hasil rap...,1,1


In [17]:
def my_metrics(y_true, y_pred):
    accuracy=accuracy_score(y_true, y_pred)
    precision=precision_score(y_true, y_pred,average='weighted')
    f1Score=f1_score(y_true, y_pred, average='weighted') 
    print("Accuracy  : {}".format(accuracy))
    print("Precision : {}".format(precision))
    print("f1Score : {}".format(f1Score))
    
    return accuracy, precision, f1Score

my_metrics(y_test, test_preds)

Accuracy  : 0.8186813186813187
Precision : 0.8185166221858126
f1Score : 0.8179694165490385


(0.8186813186813187, 0.8185166221858126, 0.8179694165490385)

In [18]:
confusion_matrix = pd.crosstab(dfP['True'], dfP['Predict'], rownames=['Actual'], colnames=['Predicted'])
print(confusion_matrix)
#sns.heatmap(confusion_matrix, annot=True)
#plt.show()

Predicted    0    1
Actual             
0          247   76
1           56  349


In [19]:
verify_logreg_model = 'verify_logreg_model.sav'
pickle.dump(verify_logreg, open(verify_logreg_model, 'wb'))

In [20]:
predict_proba = verify_logreg.predict_proba(x_test)

In [21]:
i = 0
hoax_prob_long = []
for hoax in predict_proba:
  hoax_prob_long.append(predict_proba[i,1]*100)
  i+=1

hoax_prob = []
for j in hoax_prob_long:
  hoax_prob.append('{:.1f}'.format(j))

In [22]:
dfP['Hoax Probaility'] = hoax_prob

In [23]:
dfP.head()

Unnamed: 0,Judul,Predict,True,Hoax Probaility
5913,gempa magnitudo 58 sigi tak potensi tsunami,0,0,0.0
5164,32 rekening milik presiden joko widodo istri 2...,1,1,99.9
603,polisi ntb bantah ada kabar ampo kopang pasca ...,0,0,0.1
3992,foto formasi 2019gantipresiden jasmerahmob umm,1,1,100.0
2221,flyer jalan ganggu siap surat terang hasil rap...,1,1,71.6


In [24]:
dfP.loc[dfP['True'] == 1, 
       'True'] = 'HOAX'
dfP.loc[dfP['True'] == 0, 
       'True'] = 'BUKAN HOAX'

In [25]:
dfP_final = dfP.drop(columns=['Predict'])

In [26]:
dfP_final.columns = ['Judul', 'Actual', 'Hoax Probability']

dfP_final = dfP_final[['Judul', 'Hoax Probability', 'Actual']]

dfP_final.head()

Unnamed: 0,Judul,Hoax Probability,Actual
5913,gempa magnitudo 58 sigi tak potensi tsunami,0.0,BUKAN HOAX
5164,32 rekening milik presiden joko widodo istri 2...,99.9,HOAX
603,polisi ntb bantah ada kabar ampo kopang pasca ...,0.1,BUKAN HOAX
3992,foto formasi 2019gantipresiden jasmerahmob umm,100.0,HOAX
2221,flyer jalan ganggu siap surat terang hasil rap...,71.6,HOAX


In [27]:
dfP_final.to_csv('hasil_verify_logreg_model.csv', index=False)