# Fake News Detector

## Import Library

In [28]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

## Import Data

In [29]:
true = pd.read_csv('True.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
fake = pd.read_csv('Fake.csv', engine='python', encoding='utf-8', on_bad_lines='skip')

In [30]:
true['label'] = 1
fake['label'] = 0

# Data Preprocessing

## Data Integration

In [31]:
news = pd.concat([fake, true], axis=0)

In [32]:
news.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [33]:
news.tail()

Unnamed: 0,title,text,subject,date,label
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


## Data Cleaning

### Checking Null Values

In [34]:
news.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

### Dropping Unnecessary Column

In [35]:
news.drop(['title', 'subject', 'date'], axis=1)

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
21413,"LONDON (Reuters) - LexisNexis, a provider of l...",1
21414,MINSK (Reuters) - In the shadow of disused Sov...,1
21415,MOSCOW (Reuters) - Vatican Secretary of State ...,1


In [36]:
news

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


### Scramble Data

In [37]:
news = news.sample(frac=1)
news.reset_index(inplace=True)
news.drop(['index'], axis = 1, inplace=True)

In [38]:
news

Unnamed: 0,title,text,subject,date,label
0,Mexico arrests former high-ranking PRI officia...,MEXICO CITY (Reuters) - Mexican authorities on...,worldnews,"December 21, 2017",1
1,Israeli police resume interview of Netanyahu i...,JERUSALEM (Reuters) - Israeli police officers ...,worldnews,"November 19, 2017",1
2,“STOP BLAMING WHITE PEOPLE For Trump’s Win Las...,Stop blaming white people for Trumps win last ...,politics,"Nov 9, 2016",0
3,"Chances of 'no deal' Brexit not rising, says U...",LONDON (Reuters) - The chance that Britain lea...,worldnews,"October 16, 2017",1
4,"Texas, four other states sue over U.S. transge...","AUSTIN, Texas (Reuters) - Texas and four other...",politicsNews,"August 23, 2016",1
...,...,...,...,...,...
44893,"Trump defeated Clinton by 10,704 votes in Mich...",WASHINGTON (Reuters) - Republican President-el...,politicsNews,"November 23, 2016",1
44894,Kansas judge extends voting rights for those r...,(Reuters) - A Kansas judge extended voting rig...,politicsNews,"September 27, 2016",1
44895,AP Reports Hillary Clinton Has Reached The De...,Just one day before California and New Jersey ...,News,"June 6, 2016",0
44896,(VIDEO) WATCH OUR CHILDISH PRESIDENT TURN HIS ...,It s obvious that the Iraqi PM is trying to ge...,politics,"Jun 9, 2015",0


### WordOPT

In [39]:
def wordopt(text):
    # Convert into lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|\www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits
    text = re.sub(r'\d', '', text)

    # Remove newline characters
    text = re.sub(r'\n', ' ', text)

    return text


In [40]:
news['text'] = news['text'].apply(wordopt)

In [41]:
news['text']

0        mexico city reuters  mexican authorities on we...
1        jerusalem reuters  israeli police officers on ...
2        stop blaming white people for trumps win last ...
3        london reuters  the chance that britain leaves...
4        austin texas reuters  texas and four other sta...
                               ...                        
44893    washington reuters  republican presidentelect ...
44894    reuters  a kansas judge extended voting rights...
44895    just one day before california and new jersey ...
44896    it s obvious that the iraqi pm is trying to ge...
44897    remember when the national rifle association w...
Name: text, Length: 44898, dtype: object

## Stopwords and Stemming

In [42]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kalea\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [43]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [44]:
def preprocess_text(text):
    # Tokenisasi teks
    words = text.split()

    # Hapus stopwords dan stem kata
    processed_words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]

    # Gabungkan kembali kata yang telah diproses
    return ' '.join(processed_words)

In [45]:
X_stem = [preprocess_text(sentence) for sentence in news['text']]

In [46]:
def custom_tokenizer(text):
    return preprocess_text(text).split()

In [47]:
stem_tf_idf_vectorization = TfidfVectorizer()

In [48]:
x_stem_tf_idf = stem_tf_idf_vectorization.fit_transform(X_stem)

## Feature Extraction (TF-IDF)

In [49]:
x = news['text']
y = news['label']

### TF-IDF Vectorizer

In [50]:
tf_idf_vectorization = TfidfVectorizer()

In [51]:
x_tfidf = tf_idf_vectorization.fit_transform(x)

## Split Data

### Split Data: TF-IDF

In [52]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.3, random_state=10)

### Split Data: TF IDF & Stemming & Stopwords

In [53]:
x_train_stem, x_test_stem, y_train_stem, y_test_stem = train_test_split(x_stem_tf_idf, y, test_size=0.3, random_state=10)

# Model

## Logistic Regression Model

### Model Logistic Regression

In [54]:
tf_idf_LR = LogisticRegression()
stem_LR = LogisticRegression()

In [55]:
tf_idf_LR.fit(x_train, y_train)
stem_LR.fit(x_train_stem, y_train_stem)

### Model Evaluation

#### TF-IDF Score

In [56]:
tf_idf_train_pred_lr = tf_idf_LR.predict(x_train)
tf_idf_test_pred_lr = tf_idf_LR.predict(x_test)

In [94]:
tf_idf_train_score_LR = tf_idf_LR.score(x_train, y_train)
tf_idf_test_score_LR = tf_idf_LR.score(x_test, y_test)

0.9885671863400148

In [95]:
print(classification_report(y_train, tf_idf_train_pred_lr))
print(classification_report(y_test, tf_idf_test_pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16452
           1       0.99      0.99      0.99     14976

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7029
           1       0.99      0.99      0.99      6441

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [96]:
print(f"Logistic Regression TF IDF Train Score: {tf_idf_train_score_LR}")
print(f"Logistic Regression TF IDF Test Score: {tf_idf_test_score_LR}")

Logistic Regression TF IDF Train Score: 0.9926180476008655
Logistic Regression TF IDF Test Score: 0.9885671863400148


#### TF-IDF with Stemming and Stopwords Score

In [60]:
stem_train_pred_lr = stem_LR.predict(x_train_stem)
stem_test_pred_lr = stem_LR.predict(x_test_stem)

In [61]:
stem_train_score_LR = stem_LR.score(x_train_stem, y_train_stem)

In [62]:
stem_test_score_LR = stem_LR.score(x_test_stem, y_test_stem)

In [63]:
print(classification_report(y_train_stem, stem_train_pred_lr))
print(classification_report(y_test_stem, stem_test_pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16452
           1       0.99      0.99      0.99     14976

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7029
           1       0.98      0.99      0.98      6441

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [97]:
print(f"Stemming, Stopwords, TF IDF Train Score: {stem_train_score_LR}")
print(f"Stemming, Stopwords, TF IDF Test Score: {stem_test_score_LR}")

Stemming, Stopwords, TF IDF Train Score: 0.990804378261423
Stemming, Stopwords, TF IDF Test Score: 0.9851521900519673


### Logistic Regression Parameter Optimization

In [65]:
# Definisikan parameter grid
tf_idf_LR_param_grid = {'C': [0.1, 1.0, 10.0], 'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga'], 'max_iter': [1000, 5000]}

# Definisikan GridSearchCV
tf_idf_grid_search = GridSearchCV(estimator=tf_idf_LR, param_grid=tf_idf_LR_param_grid, cv=5, scoring='accuracy')

# Fit model
tf_idf_grid_search.fit(x_train, y_train)



In [67]:
# Definisikan parameter grid
stem_param_grid = {'C': [0.1, 1.0, 10.0], 'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga'], 'max_iter': [1000, 5000]}

# Definisikan GridSearchCV
stem_grid_search = GridSearchCV(estimator=stem_LR, param_grid=stem_param_grid, cv=5, scoring='accuracy')

# Fit model
stem_grid_search.fit(x_train_stem, y_train_stem)



In [68]:
# Hasil terbaik
print(f"Best parameters for TF IDF Logistic Regression: {tf_idf_grid_search.best_params_}")
print(f"Best parameters for Stemming and Stopwords Logistic Regression: {stem_grid_search.best_params_}")

Best parameters for TF IDF Logistic Regression: {'C': 10.0, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}
Best parameters for Stemming and Stopwords Logistic Regression: {'C': 10.0, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}


### Logistic Regression Model after Optimisasi

In [69]:
optimized_tf_idf_LR = LogisticRegression(C=10, penalty='l1', solver='saga', max_iter=1000)
optimized_stem_LR = LogisticRegression(C=10, penalty='l1', solver='saga', max_iter=1000)

In [70]:
optimized_tf_idf_LR.fit(x_train, y_train)
optimized_stem_LR.fit(x_train_stem, y_train_stem)



#### TF IDF

In [71]:
optimized_tf_idf_train_pred_LR = optimized_tf_idf_LR.predict(x_train)
optimized_tf_idf_test_pred_LR = optimized_tf_idf_LR.predict(x_test)

In [87]:
optimized_tf_idf_train_score_LR = optimized_tf_idf_LR.score(x_train, y_train)
optimized_tf_idf_test_score_LR = optimized_tf_idf_LR.score(x_test, y_test)

In [78]:
print(classification_report(y_train, optimized_tf_idf_train_pred_LR))
print(classification_report(y_test, optimized_tf_idf_test_pred_LR))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16452
           1       1.00      1.00      1.00     14976

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7029
           1       1.00      1.00      1.00      6441

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [100]:
print(f"Optimized TF IDF Train Score: {optimized_tf_idf_train_score_LR}")
print(f"Optimized TF IDF Test Score: {optimized_tf_idf_test_score_LR}")

Optimized TF IDF Train Score: 0.9997136311569301
Optimized TF IDF Test Score: 0.996807720861173


#### TF IDF with Stemming and Stopwords

In [85]:
optimized_stem_train_pred_LR = optimized_stem_LR.predict(x_train_stem)
optimized_stem_test_pred_LR = optimized_stem_LR.predict(x_test_stem)

In [86]:
optimized_stem_train_score_LR = optimized_stem_LR.score(x_train_stem, y_train_stem)
optimized_stem_test_score_LR = optimized_stem_LR.score(x_test_stem, y_test_stem)

In [89]:
print(classification_report(y_train_stem, optimized_stem_train_pred_LR))
print(classification_report(y_test_stem, optimized_stem_test_pred_LR))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16452
           1       1.00      1.00      1.00     14976

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7029
           1       1.00      1.00      1.00      6441

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [121]:
print(f"Logistic Regression Optimized Stemming, Stopwords, TF IDF Train Score: {optimized_stem_train_score_LR}")
print(f"Logistic Regression Optimized Stemming, Stopwords, TF IDF Test Score: {optimized_stem_test_score_LR}")

Logistic Regression Optimized Stemming, Stopwords, TF IDF Train Score: 0.9997772686776123
Logistic Regression Optimized Stemming, Stopwords, TF IDF Test Score: 0.99547141796585


### Score Differential

#### TF IDF

In [102]:
print(f"Logistic Regression TF IDF Train Score: {tf_idf_train_score_LR}")
print(f"Logistic Regression TF IDF Test Score: {tf_idf_test_score_LR}")
print(f"Optimized Logistic Regression TF IDF Train Score: {optimized_tf_idf_train_score_LR}")
print(f"Optimized Logistic Regression TF IDF Test Score: {optimized_tf_idf_test_score_LR}")

print("")

print(f"Perbedaan skor model setelah optimisasi pada train data: {optimized_tf_idf_train_score_LR - tf_idf_train_score_LR} = {((optimized_tf_idf_train_score_LR - tf_idf_train_score_LR) / tf_idf_train_score_LR) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada test data: {optimized_tf_idf_test_score_LR - tf_idf_test_score_LR}) = {((optimized_tf_idf_test_score_LR - tf_idf_test_score_LR) / tf_idf_test_score_LR) * 100:.2f}%")

Logistic Regression TF IDF Train Score: 0.9926180476008655
Logistic Regression TF IDF Test Score: 0.9885671863400148
Optimized Logistic Regression TF IDF Train Score: 0.9997136311569301
Optimized Logistic Regression TF IDF Test Score: 0.996807720861173

Perbedaan skor model setelah optimisasi pada train data: 0.007095583556064655 = 0.71%
Perbedaan skor model setelah optimisasi pada test data: 0.008240534521158138) = 0.83%


#### TF IDF with Stemming and Stopwords

In [103]:
print(f"Logistic Regression Stemming, Stopwords, TF IDF Train Score: {stem_train_score_LR}")
print(f"Logistic Regression Stemming, Stopwords, TF IDF Test Score: {stem_test_score_LR}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: {optimized_stem_train_score_LR}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: {optimized_stem_test_score_LR}")

print("")

print(f"Perbedaan skor model setelah optimisasi pada train data: {optimized_stem_train_score_LR - stem_train_score_LR} = {((optimized_stem_train_score_LR - stem_train_score_LR) / stem_train_score_LR) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada test data: {optimized_stem_test_score_LR - stem_test_score_LR}) = {((optimized_stem_test_score_LR - stem_test_score_LR) / stem_test_score_LR) * 100:.2f}%")

Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.990804378261423
Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.9851521900519673
Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.9997772686776123
Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.99547141796585

Perbedaan skor model setelah optimisasi pada train data: 0.008972890416189316 = 0.91%
Perbedaan skor model setelah optimisasi pada test data: 0.010319227913882734) = 1.05%


In [109]:
print(f"Optimized Logistic Regression TF IDF Train Score: {optimized_tf_idf_train_score_LR}")
print(f"Optimized Logistic Regression TF IDF Test Score: {optimized_tf_idf_test_score_LR}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: {optimized_stem_train_score_LR}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: {optimized_stem_test_score_LR}")

print("")

print(f"Perbedaan skor antara 2 model pada train data: {optimized_stem_train_score_LR - optimized_tf_idf_train_score_LR} = {((optimized_stem_train_score_LR - optimized_tf_idf_train_score_LR) / optimized_tf_idf_train_score_LR) * 100:.2f}%")
print(f"Perbedaan skor antara 2 model pada pada test data: {optimized_stem_test_score_LR - optimized_tf_idf_test_score_LR}) = {((optimized_stem_test_score_LR - optimized_tf_idf_test_score_LR) / optimized_tf_idf_test_score_LR) * 100:.2f}%")

Optimized Logistic Regression TF IDF Train Score: 0.9997136311569301
Optimized Logistic Regression TF IDF Test Score: 0.996807720861173
Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.9997772686776123
Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.99547141796585

Perbedaan skor antara 2 model pada train data: 6.363752068216932e-05 = 0.01%
Perbedaan skor antara 2 model pada pada test data: -0.0013363028953229383) = -0.13%


## Pembuatan Model Decision Tree Classifier

### Model Decision Tree Classifier

In [110]:
tf_idf_DTC = DecisionTreeClassifier()
stem_DTC = DecisionTreeClassifier()

In [111]:
tf_idf_DTC.fit(x_train, y_train)
stem_DTC.fit(x_train_stem, y_train_stem)

### Model Evaluation

#### TF IDF

In [112]:
tf_idf_train_pred_DTC = tf_idf_DTC.predict(x_train)
tf_idf_test_pred_DTC = tf_idf_DTC.predict(x_test)

In [113]:
tf_idf_train_score_DTC = tf_idf_DTC.score(x_train, y_train)
tf_idf_test_score_DTC = tf_idf_DTC.score(x_test, y_test)

In [114]:
print(classification_report(y_train, tf_idf_train_pred_DTC))
print(classification_report(y_test, tf_idf_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16452
           1       1.00      1.00      1.00     14976

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7029
           1       1.00      1.00      1.00      6441

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [116]:
print(f"Decision Tree Classifier TF IDF Train Score: {tf_idf_train_score_DTC}")
print(f"Decision Tree Classifier TF IDF Test Score: {tf_idf_test_score_DTC}")

Decision Tree Classifier TF IDF Train Score: 1.0
Decision Tree Classifier TF IDF Test Score: 0.9961395694135116


#### TF IDF with Stemming and Stopwords Score

In [117]:
stem_train_pred_DTC = stem_DTC.predict(x_train_stem)
stem_test_pred_DTC = stem_DTC.predict(x_test_stem)

In [118]:
stem_train_score_DTC = stem_DTC.score(x_train_stem, y_train_stem)
stem_test_score_DTC = stem_DTC.score(x_test_stem, y_test_stem)

In [119]:
print(classification_report(y_train_stem, stem_train_pred_DTC))
print(classification_report(y_test_stem, stem_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16452
           1       1.00      1.00      1.00     14976

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7029
           1       1.00      1.00      1.00      6441

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [120]:
print(f"Decision Tree Classifier Stemming, Stopwords, TF IDF Train Score: {stem_train_score_DTC}")
print(f"Decision Tree Classifier Stemming, Stopwords, TF IDF Test Score: {stem_test_score_DTC}")

Decision Tree Classifier Stemming, Stopwords, TF IDF Train Score: 1.0
Decision Tree Classifier Stemming, Stopwords, TF IDF Test Score: 0.9962138084632517


### Decision Tree Classifier Parameter Optimization

In [None]:
# Definisikan parameter grid
tf_idf_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Definisikan GridSearchCV
tf_idf_grid_search_dt = GridSearchCV(estimator=tf_idf_DTC, param_grid=tf_idf_param_grid, cv=5, scoring='accuracy')

# Fit model
tf_idf_grid_search_dt.fit(x_train, y_train)

In [None]:
stem_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Definisikan GridSearchCV
stem_grid_search_dt = GridSearchCV(estimator=stem_DTC, param_grid=stem_param_grid, cv=5, scoring='accuracy')

# Fit model
stem_grid_search_dt.fit(x_train_stem, y_train_stem)

In [None]:
print(f"Best parameters for TF IDF Decision Tree Classifier: {tf_idf_grid_search_dt.best_params_}")
print(f"Best parameters for Stem and Stopwords Decision Tree Classifier: {stem_grid_search_dt.best_params_}")

Best parameters for TF IDF Decision Tree Classifier: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best parameters for Stem and Stopwords Decision Tree Classifier: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}


### Decision Tree Clasifier Model after Optimisasi

In [106]:
optimized_tf_idf_DTC = DecisionTreeClassifier(max_depth=30, min_samples_leaf=2, min_samples_split=10)
optimized_stem_DTC = DecisionTreeClassifier(max_depth=None, min_samples_leaf=2, min_samples_split=10)

In [107]:
optimized_tf_idf_DTC.fit(x_train, y_train)
optimized_stem_DTC.fit(x_train_stem, y_train_stem)

#### TF IDF

In [108]:
optimized_tf_idf_train_pred_DTC = optimized_tf_idf_DTC.predict(x_train)
optimized_tf_idf_test_pred_DTC = optimized_tf_idf_DTC.predict(x_test)

In [109]:
optimized_tf_idf_train_score_DTC = optimized_tf_idf_DTC.score(x_train, y_train)

In [110]:
optimized_tf_idf_test_score_DTC = optimized_tf_idf_DTC.score(x_test, y_test)

In [111]:
print(classification_report(y_train, tf_idf_train_pred_lr))
print(classification_report(y_test, tf_idf_test_pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16487
           1       0.99      0.99      0.99     14941

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6994
           1       0.98      0.99      0.99      6476

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



#### TF IDF With Stemming & Stopwords

In [112]:
optimized_stem_train_pred_DTC = optimized_stem_DTC.predict(x_train_stem)
optimized_stem_test_pred_DTC = optimized_stem_DTC.predict(x_test_stem)

In [113]:
optimized_stem_train_score_DTC = optimized_stem_DTC.score(x_train_stem, y_train_stem)

In [114]:
optimized_stem_test_score_DTC = optimized_stem_DTC.score(x_test_stem, y_test_stem)

In [115]:
print(classification_report(y_train_stem, optimized_stem_train_pred_DTC))
print(classification_report(y_test_stem, optimized_stem_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16487
           1       1.00      1.00      1.00     14941

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6994
           1       1.00      1.00      1.00      6476

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



### Score Differential

#### TF IDF

In [118]:
print(f"Decision Tree Classifier TF IDF Train Score: {tf_idf_train_score_LR}")
print(f"Decision Tree Classifier TF IDF Test Score: {tf_idf_test_score_LR}")
print(f"Optimized Decision Tree Classifier TF IDF Train Score: {optimized_tf_idf_train_score_LR}")
print(f"Optimized Decision Tree Classifier TF IDF Test Score: {optimized_tf_idf_test_score_LR}")

print("")

print(f"Perbedaan skor model setelah optimisasi pada train data: {optimized_tf_idf_train_score_DTC - tf_idf_train_score_DTC} = {((optimized_tf_idf_train_score_DTC - tf_idf_train_score_DTC) / tf_idf_train_score_DTC) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada test data: {optimized_tf_idf_test_score_DTC - tf_idf_test_score_DTC}) = {((optimized_tf_idf_test_score_DTC - tf_idf_test_score_DTC) / tf_idf_test_score_DTC) * 100:.2f}%")

print(f"Perbedaan skor model setelah optimisasi pada Model Stemming, Stopwords, dan TF IDF  train data: {optimized_stem_train_score_DTC - stem_train_score_DTC} = {((optimized_stem_train_score_DTC - stem_train_score_DTC) / stem_train_score_DTC) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada Model Stemming, Stopwords, dan TF IDF  pada test data: {optimized_stem_test_score_DTC - stem_test_score_DTC}) = {((optimized_stem_test_score_DTC - stem_test_score_DTC) / stem_test_score_DTC) * 100:.2f}%")

Decision Tree Classifier TF IDF Train Score: 0.9933498790887108
Decision Tree Classifier TF IDF Test Score: 0.9878247958426132
Optimized Decision Tree Classifier TF IDF Train Score: 0.9999681812396589
Optimized Decision Tree Classifier TF IDF Test Score: 0.9976985894580549

Perbedaan skor model setelah optimisasi pada train data: -0.0015591192567138146 = -0.16%
Perbedaan skor model setelah optimisasi pada test data: -0.0008166295471417895) = -0.08%
Perbedaan skor model setelah optimisasi pada Model Stemming, Stopwords, dan TF IDF  train data: -0.0018454880997836876 = -0.18%
Perbedaan skor model setelah optimisasi pada Model Stemming, Stopwords, dan TF IDF  pada test data: -0.0007423904974015461) = -0.07%


#### TF IDF With Stemming & Stopwords

In [117]:
print(f"Logistic Regression TF IDF Train Score: {stem_train_score_LR}")
print(f"Logistic Regression TF IDF Test Score: {stem_test_score_LR}")
print(f"Optimized Logistic Regression TF IDF Train Score: {optimized_stem_train_score_LR}")
print(f"Optimized Logistic Regression TF IDF Test Score: {optimized_stem_test_score_LR}")

print("")

print(f"Perbedaan skor model setelah optimisasi pada train data: {optimized_stem_train_score_LR - stem_train_score_LR} = {((optimized_stem_train_score_LR - stem_train_score_LR) / stem_train_score_LR) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada test data: {optimized_stem_test_score_LR - stem_test_score_LR}) = {((optimized_stem_test_score_LR - stem_test_score_LR) / stem_test_score_LR) * 100:.2f}%")

Logistic Regression TF IDF Train Score: 0.991822578592338
Logistic Regression TF IDF Test Score: 0.9836674090571641
Optimized Logistic Regression TF IDF Train Score: 0.9995863561155658
Optimized Logistic Regression TF IDF Test Score: 0.9955456570155902

Perbedaan skor model setelah optimisasi pada train data: 0.007763777523227766 = 0.78%
Perbedaan skor model setelah optimisasi pada test data: 0.01187824795842607) = 1.21%


## Model Implementation

In [None]:
def output_label(n):
  if n==0:
    return "It Is Fake News"
  elif n==1:
    return "It Is Genuine News"


In [None]:
def manual_testing(news):
    # Corrected syntax for defining dictionary
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    new_x_test = new_def_test["text"].apply(wordopt)

    # Assuming 'vectorization' is your vectorizer object
    new_xv_test = vectorization.transform(new_x_test)

    # Model predictions
    pred_lr = LR.predict(new_xv_test)
    pred_dtc = DTC.predict(new_xv_test)
    pred_rfc = RFC.predict(new_xv_test)

    return "\nLR Prediction: {}\nDTC Prediction: {}\nRFC Prediction: {}".format(
        output_label(pred_lr[0]),
        output_label(pred_dtc[0]),
        output_label(pred_rfc[0])
    )


In [None]:
news_article = str(input())

KeyboardInterrupt: Interrupted by user

In [None]:
manual_testing(news_article)