# Fake News Detector

## Import Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

## Import Data

In [2]:
true = pd.read_csv('True.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
fake = pd.read_csv('Fake.csv', engine='python', encoding='utf-8', on_bad_lines='skip')

In [3]:
true['label'] = 1
fake['label'] = 0

# Data Preprocessing

## Data Integration

In [4]:
news = pd.concat([fake, true], axis=0)

In [5]:
news.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [6]:
news.tail()

Unnamed: 0,title,text,subject,date,label
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


## Data Cleaning

### Checking Null Values

In [7]:
news.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

### Dropping Unnecessary Column

In [8]:
news.drop(['title', 'subject', 'date'], axis=1)

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
21413,"LONDON (Reuters) - LexisNexis, a provider of l...",1
21414,MINSK (Reuters) - In the shadow of disused Sov...,1
21415,MOSCOW (Reuters) - Vatican Secretary of State ...,1


In [9]:
news

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


### Scramble Data

In [10]:
news = news.sample(frac=1)
news.reset_index(inplace=True)
news.drop(['index'], axis = 1, inplace=True)

In [11]:
news

Unnamed: 0,title,text,subject,date,label
0,Respected Forensic Pathologist Suggests Trump...,Hillary Clinton shocked America a few days ago...,News,"September 15, 2016",0
1,House leader McCarthy suggested Trump on Putin...,WASHINGTON (Reuters) - A leading Republican in...,politicsNews,"May 17, 2017",1
2,HOW REAGAN DEALT WITH RADICAL PROTESTERS At Be...,Reagan explains why the protesters got out of ...,left-news,"Mar 12, 2016",0
3,DUCK DYNASTY’S PHIL ROBERTSON Weighs In On Tru...,Phil Robertson of Duck Dynasty has endorsed Do...,politics,"May 24, 2016",0
4,Trevor Noah Roasts Jeb Bush’s Pathetically De...,"If Jeb Bush watched The Daily Show last night,...",News,"January 26, 2016",0
...,...,...,...,...,...
44893,China to prosecute former party boss of Chongqing,BEIJING (Reuters) - The former Communist Party...,worldnews,"September 29, 2017",1
44894,'Even more concerned' after May Brexit speech:...,BRUSSELS (Reuters) - The head of the European ...,worldnews,"September 22, 2017",1
44895,Key House Republican expects to see revised he...,WASHINGTON (Reuters) - The leader of a group o...,politicsNews,"April 4, 2017",1
44896,Arab League to hold emergency meeting on Jerus...,CAIRO (Reuters) - The Arab League is to hold a...,worldnews,"December 6, 2017",1


### WordOPT

In [12]:
def wordopt(text):
    # Convert into lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|\www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits
    text = re.sub(r'\d', '', text)

    # Remove newline characters
    text = re.sub(r'\n', ' ', text)

    return text


In [13]:
news['text'] = news['text'].apply(wordopt)

In [14]:
news['text']

0        hillary clinton shocked america a few days ago...
1        washington reuters  a leading republican in th...
2        reagan explains why the protesters got out of ...
3        phil robertson of duck dynasty has endorsed do...
4        if jeb bush watched the daily show last night ...
                               ...                        
44893    beijing reuters  the former communist party bo...
44894    brussels reuters  the head of the european par...
44895    washington reuters  the leader of a group of u...
44896    cairo reuters  the arab league is to hold an e...
44897    washington reuters  us president barack obama ...
Name: text, Length: 44898, dtype: object

## Stopwords and Stemming

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kalea\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [17]:
def preprocess_text(text):
    words = text.split()

    # Hapus stopwords dan stem kata
    processed_words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]

    # Gabungkan kembali kata yang telah diproses
    return ' '.join(processed_words)

In [18]:
X_stem = [preprocess_text(sentence) for sentence in news['text']]

In [19]:
def custom_tokenizer(text):
    return preprocess_text(text).split()

In [20]:
stem_tf_idf_vectorization = TfidfVectorizer()

In [21]:
x_stem_tf_idf = stem_tf_idf_vectorization.fit_transform(X_stem)

## Feature Extraction (TF-IDF)

In [22]:
x = news['text']
y = news['label']

### TF-IDF Vectorizer

In [23]:
tf_idf_vectorization = TfidfVectorizer()

In [24]:
x_tfidf = tf_idf_vectorization.fit_transform(x)

## Split Data

### Split Data: TF-IDF

In [25]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.3, random_state=10)

### Split Data: TF IDF & Stemming & Stopwords

In [26]:
x_train_stem, x_test_stem, y_train_stem, y_test_stem = train_test_split(x_stem_tf_idf, y, test_size=0.3, random_state=10)

# Model

## Logistic Regression Model

### Model Logistic Regression

In [27]:
tf_idf_LR = LogisticRegression()
stem_LR = LogisticRegression()

In [28]:
tf_idf_LR.fit(x_train, y_train)
stem_LR.fit(x_train_stem, y_train_stem)

### Model Evaluation

#### TF-IDF Score

In [29]:
tf_idf_train_pred_lr = tf_idf_LR.predict(x_train)
tf_idf_test_pred_lr = tf_idf_LR.predict(x_test)

In [30]:
tf_idf_train_score_LR = tf_idf_LR.score(x_train, y_train)
tf_idf_test_score_LR = tf_idf_LR.score(x_test, y_test)

In [31]:
print(classification_report(y_train, tf_idf_train_pred_lr))
print(classification_report(y_test, tf_idf_test_pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16464
           1       0.99      0.99      0.99     14964

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      7017
           1       0.98      0.99      0.99      6453

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [32]:
print(f"Logistic Regression TF IDF Train Score: {tf_idf_train_score_LR}")
print(f"Logistic Regression TF IDF Test Score: {tf_idf_test_score_LR}")

Logistic Regression TF IDF Train Score: 0.9930316914852997
Logistic Regression TF IDF Test Score: 0.9869339272457313


#### TF-IDF with Stemming and Stopwords Score

In [33]:
stem_train_pred_lr = stem_LR.predict(x_train_stem)
stem_test_pred_lr = stem_LR.predict(x_test_stem)

In [34]:
stem_train_score_LR = stem_LR.score(x_train_stem, y_train_stem)

In [35]:
stem_test_score_LR = stem_LR.score(x_test_stem, y_test_stem)

In [36]:
print(classification_report(y_train_stem, stem_train_pred_lr))
print(classification_report(y_test_stem, stem_test_pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16464
           1       0.99      0.99      0.99     14964

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      7017
           1       0.98      0.99      0.98      6453

    accuracy                           0.98     13470
   macro avg       0.98      0.98      0.98     13470
weighted avg       0.98      0.98      0.98     13470



In [37]:
print(f"Stemming, Stopwords, TF IDF Train Score: {stem_train_score_LR}")
print(f"Stemming, Stopwords, TF IDF Test Score: {stem_test_score_LR}")

Stemming, Stopwords, TF IDF Train Score: 0.9916316660302915
Stemming, Stopwords, TF IDF Test Score: 0.984112843355605


### Logistic Regression Parameter Optimization

In [65]:
# Definisikan parameter grid
tf_idf_LR_param_grid = {'C': [0.1, 1.0, 10.0], 'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga'], 'max_iter': [1000, 5000]}

# Definisikan GridSearchCV
tf_idf_grid_search = GridSearchCV(estimator=tf_idf_LR, param_grid=tf_idf_LR_param_grid, cv=5, scoring='accuracy')

# Fit model
tf_idf_grid_search.fit(x_train, y_train)



In [67]:
# Definisikan parameter grid
stem_param_grid = {'C': [0.1, 1.0, 10.0], 'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga'], 'max_iter': [1000, 5000]}

# Definisikan GridSearchCV
stem_grid_search = GridSearchCV(estimator=stem_LR, param_grid=stem_param_grid, cv=5, scoring='accuracy')

# Fit model
stem_grid_search.fit(x_train_stem, y_train_stem)



In [68]:
# Hasil terbaik
print(f"Best parameters for TF IDF Logistic Regression: {tf_idf_grid_search.best_params_}")
print(f"Best parameters for Stemming and Stopwords Logistic Regression: {stem_grid_search.best_params_}")

Best parameters for TF IDF Logistic Regression: {'C': 10.0, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}
Best parameters for Stemming and Stopwords Logistic Regression: {'C': 10.0, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}


### Logistic Regression Model after Optimisasi

In [38]:
optimized_tf_idf_LR = LogisticRegression(C=10, penalty='l1', solver='saga', max_iter=1000)
optimized_stem_LR = LogisticRegression(C=10, penalty='l1', solver='saga', max_iter=1000)

In [39]:
optimized_tf_idf_LR.fit(x_train, y_train)
optimized_stem_LR.fit(x_train_stem, y_train_stem)



#### TF IDF

In [40]:
optimized_tf_idf_train_pred_LR = optimized_tf_idf_LR.predict(x_train)
optimized_tf_idf_test_pred_LR = optimized_tf_idf_LR.predict(x_test)

In [51]:
optimized_tf_idf_train_score_LR = optimized_tf_idf_LR.score(x_train, y_train)
optimized_tf_idf_test_score_LR = optimized_tf_idf_LR.score(x_test, y_test)

In [52]:
print(classification_report(y_train, optimized_tf_idf_train_pred_LR))
print(classification_report(y_test, optimized_tf_idf_test_pred_LR))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16464
           1       1.00      1.00      1.00     14964

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7017
           1       1.00      1.00      1.00      6453

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [53]:
print(f"Optimized Logistic Regression TF IDF Train Score: {optimized_tf_idf_train_score_LR}")
print(f"Optimized Logistic Regression TF IDF Test Score: {optimized_tf_idf_test_score_LR}")

Optimized TF IDF Train Score: 0.9995545373552246
Optimized TF IDF Test Score: 0.9964365256124722


#### TF IDF with Stemming and Stopwords

In [54]:
optimized_stem_train_pred_LR = optimized_stem_LR.predict(x_train_stem)
optimized_stem_test_pred_LR = optimized_stem_LR.predict(x_test_stem)

In [55]:
optimized_stem_train_score_LR = optimized_stem_LR.score(x_train_stem, y_train_stem)
optimized_stem_test_score_LR = optimized_stem_LR.score(x_test_stem, y_test_stem)

In [56]:
print(classification_report(y_train_stem, optimized_stem_train_pred_LR))
print(classification_report(y_test_stem, optimized_stem_test_pred_LR))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16464
           1       1.00      1.00      1.00     14964

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      7017
           1       0.99      1.00      0.99      6453

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [57]:
print(f"Logistic Regression Optimized Stemming, Stopwords, TF IDF Train Score: {optimized_stem_train_score_LR}")
print(f"Logistic Regression Optimized Stemming, Stopwords, TF IDF Test Score: {optimized_stem_test_score_LR}")

Logistic Regression Optimized Stemming, Stopwords, TF IDF Train Score: 0.9995863561155658
Logistic Regression Optimized Stemming, Stopwords, TF IDF Test Score: 0.9947290274684484


### Score Differential

#### TF IDF

In [58]:
print(f"Logistic Regression TF IDF Train Score: {tf_idf_train_score_LR}")
print(f"Logistic Regression TF IDF Test Score: {tf_idf_test_score_LR}")
print(f"Optimized Logistic Regression TF IDF Train Score: {optimized_tf_idf_train_score_LR}")
print(f"Optimized Logistic Regression TF IDF Test Score: {optimized_tf_idf_test_score_LR}")

print("")

print(f"Perbedaan skor model setelah optimisasi pada train data: {optimized_tf_idf_train_score_LR - tf_idf_train_score_LR} = {((optimized_tf_idf_train_score_LR - tf_idf_train_score_LR) / tf_idf_train_score_LR) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada test data: {optimized_tf_idf_test_score_LR - tf_idf_test_score_LR}) = {((optimized_tf_idf_test_score_LR - tf_idf_test_score_LR) / tf_idf_test_score_LR) * 100:.2f}%")

Logistic Regression TF IDF Train Score: 0.9930316914852997
Logistic Regression TF IDF Test Score: 0.9869339272457313
Optimized Logistic Regression TF IDF Train Score: 0.9995545373552246
Optimized Logistic Regression TF IDF Test Score: 0.9964365256124722

Perbedaan skor model setelah optimisasi pada train data: 0.006522845869924909 = 0.66%
Perbedaan skor model setelah optimisasi pada test data: 0.009502598366740944) = 0.96%


#### TF IDF with Stemming and Stopwords

In [87]:
print(f"Logistic Regression Stemming, Stopwords, TF IDF Train Score: {stem_train_score_LR}")
print(f"Logistic Regression Stemming, Stopwords, TF IDF Test Score: {stem_test_score_LR}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: {optimized_stem_train_score_LR}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: {optimized_stem_test_score_LR}")

print("")

print(f"Perbedaan skor model setelah optimisasi pada train data: {optimized_stem_train_score_LR - stem_train_score_LR} = {((optimized_stem_train_score_LR - stem_train_score_LR) / stem_train_score_LR) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada test data: {optimized_stem_test_score_LR - stem_test_score_LR}) = {((optimized_stem_test_score_LR - stem_test_score_LR) / stem_test_score_LR) * 100:.2f}%")

Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.9916316660302915
Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.984112843355605
Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.9995863561155658
Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.9947290274684484

Perbedaan skor model setelah optimisasi pada train data: 0.007954690085274274 = 0.80%
Perbedaan skor model setelah optimisasi pada test data: 0.010616184112843374) = 1.08%


In [60]:
print(f"Optimized Logistic Regression TF IDF Train Score: {optimized_tf_idf_train_score_LR}")
print(f"Optimized Logistic Regression TF IDF Test Score: {optimized_tf_idf_test_score_LR}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: {optimized_stem_train_score_LR}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: {optimized_stem_test_score_LR}")

print("")

print(f"Perbedaan skor antara 2 model pada train data: {optimized_stem_train_score_LR - optimized_tf_idf_train_score_LR} = {((optimized_stem_train_score_LR - optimized_tf_idf_train_score_LR) / optimized_tf_idf_train_score_LR) * 100:.2f}%")
print(f"Perbedaan skor antara 2 model pada pada test data: {optimized_stem_test_score_LR - optimized_tf_idf_test_score_LR}) = {((optimized_stem_test_score_LR - optimized_tf_idf_test_score_LR) / optimized_tf_idf_test_score_LR) * 100:.2f}%")

Optimized Logistic Regression TF IDF Train Score: 0.9995545373552246
Optimized Logistic Regression TF IDF Test Score: 0.9964365256124722
Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.9995863561155658
Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.9947290274684484

Perbedaan skor antara 2 model pada train data: 3.1818760341195684e-05 = 0.00%
Perbedaan skor antara 2 model pada pada test data: -0.0017074981440238224) = -0.17%


## Pembuatan Model Decision Tree Classifier

### Model Decision Tree Classifier

In [61]:
tf_idf_DTC = DecisionTreeClassifier()
stem_DTC = DecisionTreeClassifier()

In [62]:
tf_idf_DTC.fit(x_train, y_train)
stem_DTC.fit(x_train_stem, y_train_stem)

### Model Evaluation

#### TF IDF

In [63]:
tf_idf_train_pred_DTC = tf_idf_DTC.predict(x_train)
tf_idf_test_pred_DTC = tf_idf_DTC.predict(x_test)

In [64]:
tf_idf_train_score_DTC = tf_idf_DTC.score(x_train, y_train)
tf_idf_test_score_DTC = tf_idf_DTC.score(x_test, y_test)

In [65]:
print(classification_report(y_train, tf_idf_train_pred_DTC))
print(classification_report(y_test, tf_idf_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16464
           1       1.00      1.00      1.00     14964

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7017
           1       1.00      1.00      1.00      6453

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [66]:
print(f"Decision Tree Classifier TF IDF Train Score: {tf_idf_train_score_DTC}")
print(f"Decision Tree Classifier TF IDF Test Score: {tf_idf_test_score_DTC}")

Decision Tree Classifier TF IDF Train Score: 0.9999681812396589
Decision Tree Classifier TF IDF Test Score: 0.9962138084632517


#### TF IDF with Stemming and Stopwords Score

In [67]:
stem_train_pred_DTC = stem_DTC.predict(x_train_stem)
stem_test_pred_DTC = stem_DTC.predict(x_test_stem)

In [68]:
stem_train_score_DTC = stem_DTC.score(x_train_stem, y_train_stem)
stem_test_score_DTC = stem_DTC.score(x_test_stem, y_test_stem)

In [69]:
print(classification_report(y_train_stem, stem_train_pred_DTC))
print(classification_report(y_test_stem, stem_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16464
           1       1.00      1.00      1.00     14964

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7017
           1       1.00      1.00      1.00      6453

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [70]:
print(f"Decision Tree Classifier Stemming, Stopwords, TF IDF Train Score: {stem_train_score_DTC}")
print(f"Decision Tree Classifier Stemming, Stopwords, TF IDF Test Score: {stem_test_score_DTC}")

Decision Tree Classifier Stemming, Stopwords, TF IDF Train Score: 0.9999681812396589
Decision Tree Classifier Stemming, Stopwords, TF IDF Test Score: 0.9965850037119525


### Decision Tree Classifier Parameter Optimization

In [38]:
# Definisikan parameter grid
tf_idf_param_grid_DTC = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Definisikan GridSearchCV
tf_idf_grid_search_dt = GridSearchCV(estimator=tf_idf_DTC, param_grid=tf_idf_param_grid_DTC, cv=5, scoring='accuracy', n_jobs=-1)

# Fit model
tf_idf_grid_search_dt.fit(x_train, y_train)

In [41]:
stem_param_grid_DTC = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Definisikan GridSearchCV
stem_grid_search_dt = GridSearchCV(estimator=stem_DTC, param_grid=stem_param_grid_DTC, cv=5, scoring='accuracy', n_jobs=-1)

# Fit model
stem_grid_search_dt.fit(x_train_stem, y_train_stem)

In [42]:
print(f"Best parameters for TF IDF Decision Tree Classifier: {tf_idf_grid_search_dt.best_params_}")
print(f"Best parameters for Stem and Stopwords Decision Tree Classifier: {stem_grid_search_dt.best_params_}")

Best parameters for TF IDF Decision Tree Classifier: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best parameters for Stem and Stopwords Decision Tree Classifier: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


### Decision Tree Clasifier Model after Optimisasi

In [71]:
optimized_tf_idf_DTC = DecisionTreeClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2)
optimized_stem_DTC = DecisionTreeClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2)

In [72]:
optimized_tf_idf_DTC.fit(x_train, y_train)
optimized_stem_DTC.fit(x_train_stem, y_train_stem)

#### TF IDF

In [73]:
optimized_tf_idf_train_pred_DTC = optimized_tf_idf_DTC.predict(x_train)
optimized_tf_idf_test_pred_DTC = optimized_tf_idf_DTC.predict(x_test)

In [74]:
optimized_tf_idf_train_score_DTC = optimized_tf_idf_DTC.score(x_train, y_train)

In [75]:
optimized_tf_idf_test_score_DTC = optimized_tf_idf_DTC.score(x_test, y_test)

In [76]:
print(classification_report(y_train, tf_idf_train_pred_lr))
print(classification_report(y_test, tf_idf_test_pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16464
           1       0.99      0.99      0.99     14964

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      7017
           1       0.98      0.99      0.99      6453

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [77]:
print(f"Optimized Decision Tree Classifier TF IDF Train Score: {optimized_tf_idf_train_score_DTC}")
print(f"Optimized Decision Tree Classifier TF IDF Test Score: {optimized_tf_idf_test_score_DTC}")

Optimized Decision Tree Classifier TF IDF Train Score: 0.9999681812396589
Optimized Decision Tree Classifier TF IDF Test Score: 0.9966592427616926


#### TF IDF With Stemming & Stopwords

In [78]:
optimized_stem_train_pred_DTC = optimized_stem_DTC.predict(x_train_stem)
optimized_stem_test_pred_DTC = optimized_stem_DTC.predict(x_test_stem)

In [79]:
optimized_stem_train_score_DTC = optimized_stem_DTC.score(x_train_stem, y_train_stem)

In [80]:
optimized_stem_test_score_DTC = optimized_stem_DTC.score(x_test_stem, y_test_stem)

In [81]:
print(classification_report(y_train_stem, optimized_stem_train_pred_DTC))
print(classification_report(y_test_stem, optimized_stem_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16464
           1       1.00      1.00      1.00     14964

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7017
           1       1.00      1.00      1.00      6453

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [82]:
print(f"Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF Train Score: {optimized_stem_train_score_DTC}")
print(f"Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF Test Score: {optimized_stem_test_score_DTC}")

Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF Train Score: 0.9999681812396589
Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF Test Score: 0.996807720861173


### Score Differential

#### TF IDF

In [84]:
print(f"Decision Tree Classifier TF IDF Train Score: {tf_idf_train_score_DTC}")
print(f"Decision Tree Classifier TF IDF Test Score: {tf_idf_test_score_DTC}")
print(f"Optimized Decision Tree Classifier TF IDF Train Score: {optimized_tf_idf_train_score_DTC}")
print(f"Optimized Decision Tree Classifier TF IDF Test Score: {optimized_tf_idf_test_score_DTC}")

print("")

print(f"Perbedaan skor model setelah optimisasi pada train data: {optimized_tf_idf_train_score_DTC - tf_idf_train_score_DTC} = {((optimized_tf_idf_train_score_DTC - tf_idf_train_score_DTC) / tf_idf_train_score_DTC) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada test data: {optimized_tf_idf_test_score_DTC - tf_idf_test_score_DTC}) = {((optimized_tf_idf_test_score_DTC - tf_idf_test_score_DTC) / tf_idf_test_score_DTC) * 100:.2f}%")

Decision Tree Classifier TF IDF Train Score: 0.9999681812396589
Decision Tree Classifier TF IDF Test Score: 0.9962138084632517
Optimized Decision Tree Classifier TF IDF Train Score: 0.9999681812396589
Optimized Decision Tree Classifier TF IDF Test Score: 0.9966592427616926

Perbedaan skor model setelah optimisasi pada train data: 0.0 = 0.00%
Perbedaan skor model setelah optimisasi pada test data: 0.00044543429844090543) = 0.04%


#### TF IDF With Stemming & Stopwords

In [86]:
print(f"Logistic Regression Stemming, Stopwords, TF IDF Train Score: {stem_train_score_DTC}")
print(f"Logistic Regression Stemming, Stopwords, TF IDF Test Score: {stem_test_score_DTC}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: {optimized_stem_train_score_DTC}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: {optimized_stem_test_score_DTC}")

print("")

print(f"Perbedaan skor model setelah optimisasi pada train data: {optimized_stem_train_score_DTC - stem_train_score_DTC} = {((optimized_stem_train_score_DTC - stem_train_score_DTC) / stem_train_score_DTC) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada test data: {optimized_stem_test_score_DTC - stem_test_score_DTC}) = {((optimized_stem_test_score_DTC - stem_test_score_DTC) / stem_test_score_DTC) * 100:.2f}%")

Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.9999681812396589
Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.9965850037119525
Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.9999681812396589
Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.996807720861173

Perbedaan skor model setelah optimisasi pada train data: 0.0 = 0.00%
Perbedaan skor model setelah optimisasi pada test data: 0.00022271714922050823) = 0.02%


In [88]:
print(f"Optimized Decision Tree Classifier TF IDF Train Score: {optimized_tf_idf_train_score_DTC}")
print(f"Optimized Decision Tree Classifier TF IDF Test Score: {optimized_tf_idf_test_score_DTC}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: {optimized_stem_train_score_DTC}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: {optimized_stem_test_score_DTC}")

print("")

print(f"Perbedaan skor antara 2 model pada train data: {optimized_stem_train_score_DTC - optimized_tf_idf_train_score_DTC} = {((optimized_stem_train_score_DTC - optimized_tf_idf_train_score_DTC) / optimized_tf_idf_train_score_DTC) * 100:.2f}%")
print(f"Perbedaan skor antara 2 model pada pada test data: {optimized_stem_test_score_DTC - optimized_tf_idf_test_score_DTC}) = {((optimized_stem_test_score_DTC - optimized_tf_idf_test_score_DTC) / optimized_tf_idf_test_score_DTC) * 100:.2f}%")

Optimized Decision Tree Classifier TF IDF Train Score: 0.9999681812396589
Optimized Decision Tree Classifier TF IDF Test Score: 0.9966592427616926
Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.9999681812396589
Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.996807720861173

Perbedaan skor antara 2 model pada train data: 0.0 = 0.00%
Perbedaan skor antara 2 model pada pada test data: 0.00014847809948037582) = 0.01%


## Model Implementation

In [None]:
def output_label(n):
  if n==0:
    return "It Is Fake News"
  elif n==1:
    return "It Is Genuine News"


In [None]:
def manual_testing(news):
    # Corrected syntax for defining dictionary
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    new_x_test = new_def_test["text"].apply(wordopt)

    # Assuming 'vectorization' is your vectorizer object
    new_xv_test = vectorization.transform(new_x_test)

    # Model predictions
    pred_lr = LR.predict(new_xv_test)
    pred_dtc = DTC.predict(new_xv_test)
    pred_rfc = RFC.predict(new_xv_test)

    return "\nLR Prediction: {}\nDTC Prediction: {}\nRFC Prediction: {}".format(
        output_label(pred_lr[0]),
        output_label(pred_dtc[0]),
        output_label(pred_rfc[0])
    )


In [None]:
news_article = str(input())

KeyboardInterrupt: Interrupted by user

In [None]:
manual_testing(news_article)