# Fake News Detector

## Import Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

## Import Data

In [2]:
true = pd.read_csv('True.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
fake = pd.read_csv('Fake.csv', engine='python', encoding='utf-8', on_bad_lines='skip')

In [3]:
true['label'] = 1
fake['label'] = 0

# Data Preprocessing

## Data Integration

In [4]:
news = pd.concat([fake, true], axis=0)

In [5]:
news.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [6]:
news.tail()

Unnamed: 0,title,text,subject,date,label
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


## Data Cleaning

### Checking Null Values

In [7]:
news.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

### Dropping Unnecessary Column

In [8]:
news.drop(['title', 'subject', 'date'], axis=1)

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
21413,"LONDON (Reuters) - LexisNexis, a provider of l...",1
21414,MINSK (Reuters) - In the shadow of disused Sov...,1
21415,MOSCOW (Reuters) - Vatican Secretary of State ...,1


In [9]:
news

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


### Scramble Data

In [10]:
news = news.sample(frac=1)
news.reset_index(inplace=True)
news.drop(['index'], axis = 1, inplace=True)

In [11]:
news

Unnamed: 0,title,text,subject,date,label
0,WATCH: FL Governor Refuses To Acknowledge Mas...,During a telephone interview with CNN s Jake T...,News,"June 12, 2016",0
1,Holocaust Survivor Says Donald Trump Reminds ...,Donald Trump is literally scaring Holocaust su...,News,"March 7, 2016",0
2,Democrats sue Trump for alleged voter intimida...,WASHINGTON (Reuters) - Democratic Party offici...,politicsNews,"October 31, 2016",1
3,Trump ally: Border wall's cost was deemed off ...,WASHINGTON (Reuters) - A close ally of Donald ...,politicsNews,"September 4, 2016",1
4,Trump defends attacks on Mexican-American U.S....,WASHINGTON (Reuters) - Presumptive Republican ...,politicsNews,"June 3, 2016",1
...,...,...,...,...,...
44893,Air strikes kill 69 in Syrian east since Sunda...,BEIRUT (Reuters) - The Syrian Observatory for ...,worldnews,"September 12, 2017",1
44894,New RNC Hispanic Outreach Head’s Humiliating ...,"Somebody alert Mike Rowe, the dirtiest job in ...",News,"June 2, 2016",0
44895,Brazil Supreme Court blocks extradition of Ita...,BRASILIA (Reuters) - A Supreme Court injunctio...,worldnews,"October 13, 2017",1
44896,Control of information shifts up a gear in run...,PHNOM PENH (Reuters) - Critics of Cambodian Pr...,worldnews,"September 8, 2017",1


### WordOPT

In [12]:
def wordopt(text):
    # Convert into lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|\www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits
    text = re.sub(r'\d', '', text)

    # Remove newline characters
    text = re.sub(r'\n', ' ', text)

    return text


In [13]:
news['text'] = news['text'].apply(wordopt)

In [14]:
news['text']

0        during a telephone interview with cnn s jake t...
1        donald trump is literally scaring holocaust su...
2        washington reuters  democratic party officials...
3        washington reuters  a close ally of donald tru...
4        washington reuters  presumptive republican pre...
                               ...                        
44893    beirut reuters  the syrian observatory for hum...
44894    somebody alert mike rowe the dirtiest job in a...
44895    brasilia reuters  a supreme court injunction o...
44896    phnom penh reuters  critics of cambodian prime...
44897    washington reuters  president barack obama wel...
Name: text, Length: 44898, dtype: object

## Stopwords and Stemming

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [17]:
def preprocess_text(text):
    # Tokenisasi teks
    words = text.split()

    # Hapus stopwords dan stem kata
    processed_words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]

    # Gabungkan kembali kata yang telah diproses
    return ' '.join(processed_words)

In [18]:
X_stem = [preprocess_text(sentence) for sentence in news['text']]

In [19]:
def custom_tokenizer(text):
    return preprocess_text(text).split()

In [20]:
stem_tf_idf_vectorization = TfidfVectorizer()

In [21]:
x_stem_tf_idf = stem_tf_idf_vectorization.fit_transform(X_stem)

## Feature Extraction (TF-IDF)

In [22]:
x = news['text']
y = news['label']

### TF-IDF Vectorizer

In [23]:
tf_idf_vectorization = TfidfVectorizer()

In [24]:
x_tfidf = tf_idf_vectorization.fit_transform(x)

## Split Data

### Split Data: TF-IDF

In [25]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.3, random_state=10)

### Split Data: TF IDF & Stemming & Stopwords

In [26]:
x_train_stem, x_test_stem, y_train_stem, y_test_stem = train_test_split(x_stem_tf_idf, y, test_size=0.3, random_state=10)

# Model

## Logistic Regression Model

### Model Logistic Regression

In [27]:
tf_idf_LR = LogisticRegression()
stem_LR = LogisticRegression()

In [28]:
tf_idf_LR.fit(x_train, y_train)
stem_LR.fit(x_train_stem, y_train_stem)

### Model Evaluation

#### TF-IDF Score

In [29]:
tf_idf_train_pred_lr = tf_idf_LR.predict(x_train)
tf_idf_test_pred_lr = tf_idf_LR.predict(x_test)

In [73]:
tf_idf_train_score_LR = tf_idf_LR.score(x_train, y_train)
tf_idf_train_score_LR

0.9929998727249586

In [74]:
tf_idf_test_score_LR = tf_idf_LR.score(x_test, y_test)
tf_idf_test_score_LR

0.9881217520415738

In [32]:
print(classification_report(y_train, tf_idf_train_pred_lr))
print(classification_report(y_test, tf_idf_test_pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16425
           1       0.99      0.99      0.99     15003

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7056
           1       0.99      0.99      0.99      6414

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



#### TF-IDF with Stemming and Stopwords Score

In [33]:
stem_train_pred_lr = stem_LR.predict(x_train_stem)
stem_test_pred_lr = stem_LR.predict(x_test_stem)

In [85]:
stem_train_score_LR = stem_LR.score(x_train_stem, y_train_stem)

In [86]:
stem_test_score_LR = stem_LR.score(x_test_stem, y_test_stem)

In [36]:
print(classification_report(y_train_stem, stem_train_pred_lr))
print(classification_report(y_test_stem, stem_test_pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16425
           1       0.99      0.99      0.99     15003

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      7056
           1       0.98      0.99      0.98      6414

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



### Logistic Regression Parameter Optimization

In [None]:
# Definisikan parameter grid
tf_idf_param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

# Definisikan GridSearchCV
tf_idf_grid_search_log_reg = GridSearchCV(estimator=stem_LR, param_grid=tf_idf_param_grid, cv=5, scoring='accuracy')

# Fit model
tf_idf_grid_search_log_reg.fit(x_train, y_train)

In [None]:
# Definisikan parameter grid
stem_param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

# Definisikan GridSearchCV
stem_grid_search_log_reg = GridSearchCV(estimator=stem_LR, param_grid=stem_param_grid, cv=5, scoring='accuracy')

# Fit model
stem_grid_search_log_reg.fit(x_train_stem, y_train_stem)

In [None]:
# Hasil terbaik
print(f"Best parameters for TF IDF Logistic Regression: {tf_idf_grid_search_log_reg.best_params_}")
print(f"Best parameters for Stemming and Stopwords Logistic Regression: {stem_grid_search_log_reg.best_params_}")

Best parameters for TF IDF Logistic Regression: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best parameters for Stemming and Stopwords Logistic Regression: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}


### Logistic Regression Model after Optimisasi

In [41]:
optimized_tf_idf_LR = LogisticRegression(C=100, penalty='l1', solver='liblinear')
optimized_stem_LR = LogisticRegression(C=10, penalty='l1', solver='liblinear')

In [42]:
optimized_tf_idf_LR.fit(x_train, y_train)
optimized_stem_LR.fit(x_train_stem, y_train_stem)

#### TF IDF

In [43]:
optimized_tf_idf_train_pred_lr = optimized_tf_idf_LR.predict(x_train)
optimized_tf_idf_test_pred_lr = optimized_tf_idf_LR.predict(x_test)

In [70]:
optimized_tf_idf_train_score_LR = optimized_tf_idf_LR.score(x_train, y_train)
optimized_tf_idf_train_score_LR

0.9999681812396589

In [71]:
optimized_tf_idf_test_score_LR = optimized_tf_idf_LR.score(x_test, y_test)
optimized_tf_idf_test_score_LR

0.997253155159614

In [46]:
print(classification_report(y_train, optimized_tf_idf_train_pred_lr))
print(classification_report(y_test, optimized_tf_idf_test_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16425
           1       1.00      1.00      1.00     15003

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7056
           1       1.00      1.00      1.00      6414

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [110]:
print(f"TF IDF Train Score: {tf_idf_train_score}")
print(f"TF IDF Test Score: {tf_idf_test_score}")
print(f"Optimized TF IDF Train Score: {optimized_tf_idf_train_score}")
print(f"Optimized TF IDF Test Score: {optimized_tf_idf_test_score}")

print("")

print(f"Perbedaan skor model setelah optimisasi pada train data: {optimized_tf_idf_train_score - tf_idf_train_score} = {((optimized_tf_idf_train_score - tf_idf_train_score) / tf_idf_train_score) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada test data: {optimized_tf_idf_test_score - tf_idf_test_score}) = {((optimized_tf_idf_test_score - tf_idf_test_score) / tf_idf_test_score) * 100:.2f}%")

TF IDF Train Score: 0.9929998727249586
TF IDF Test Score: 0.9881217520415738
Optimized TF IDF Train Score: 0.9999681812396589
Optimized TF IDF Test Score: 0.997253155159614

Perbedaan skor model setelah optimisasi pada train data: 0.0069683085147003165 = 0.70%
Perbedaan skor model setelah optimisasi pada test data: 0.009131403118040171) = 0.92%


#### TF IDF with Stemming and Stopwords

In [64]:
optimized_stem_train_pred_lr = optimized_stem_LR.predict(x_train_stem)
optimized_stem_test_pred_lr = optimized_stem_LR.predict(x_test_stem)

In [82]:
optimized_stem_train_score_LR = optimized_stem_LR.score(x_train_stem, y_train_stem)

In [83]:
optimized_stem_test_score_LR = optimized_stem_LR.score(x_test_stem, y_test_stem)

In [67]:
print(classification_report(y_train_stem, optimized_stem_train_pred_lr))
print(classification_report(y_test_stem, optimized_stem_test_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16425
           1       1.00      1.00      1.00     15003

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7056
           1       1.00      0.99      0.99      6414

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



### Score Differential

#### TF IDF

In [77]:
print(f"Logistic Regression TF IDF Train Score: {tf_idf_train_score_LR}")
print(f"Logistic Regression TF IDF Test Score: {tf_idf_test_score_LR}")
print(f"Optimized Logistic Regression TF IDF Train Score: {optimized_tf_idf_train_score_LR}")
print(f"Optimized Logistic Regression TF IDF Test Score: {optimized_tf_idf_test_score_LR}")

print("")

print(f"Perbedaan skor model setelah optimisasi pada train data: {optimized_tf_idf_train_score_LR - tf_idf_train_score_LR} = {((optimized_tf_idf_train_score_LR - tf_idf_train_score_LR) / tf_idf_train_score_LR) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada test data: {optimized_tf_idf_test_score_LR - tf_idf_test_score_LR}) = {((optimized_tf_idf_test_score_LR - tf_idf_test_score_LR) / tf_idf_test_score_LR) * 100:.2f}%")

Logistic Regression TF IDF Train Score: 0.9929998727249586
Logistic Regression TF IDF Test Score: 0.9881217520415738
Optimized Logistic Regression TF IDF Train Score: 0.9999681812396589
Optimized Logistic Regression TF IDF Test Score: 0.997253155159614

Perbedaan skor model setelah optimisasi pada train data: 0.0069683085147003165 = 0.70%
Perbedaan skor model setelah optimisasi pada test data: 0.009131403118040171) = 0.92%


#### TF IDF with Stemming and Stopwords

In [87]:
print(f"Logistic Regression TF IDF Train Score: {stem_train_score_LR}")
print(f"Logistic Regression TF IDF Test Score: {stem_test_score_LR}")
print(f"Optimized Logistic Regression TF IDF Train Score: {optimized_stem_train_score_LR}")
print(f"Optimized Logistic Regression TF IDF Test Score: {optimized_stem_test_score_LR}")

print("")

print(f"Perbedaan skor model setelah optimisasi pada train data: {optimized_stem_train_score_LR - stem_train_score_LR} = {((optimized_stem_train_score_LR - stem_train_score_LR) / stem_train_score_LR) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada test data: {optimized_stem_test_score_LR - stem_test_score_LR}) = {((optimized_stem_test_score_LR - stem_test_score_LR) / stem_test_score_LR) * 100:.2f}%")

Logistic Regression TF IDF Train Score: 0.9913771159475627
Logistic Regression TF IDF Test Score: 0.9853006681514477
Optimized Logistic Regression TF IDF Train Score: 0.999681812396589
Optimized Logistic Regression TF IDF Test Score: 0.9951002227171493

Perbedaan skor model setelah optimisasi pada train data: 0.008304696449026316 = 0.84%
Perbedaan skor model setelah optimisasi pada test data: 0.009799554565701585) = 0.99%


## Pembuatan Model Decision Tree Classifier

### Model Decision Tree Classifier

In [88]:
tf_idf_DTC = DecisionTreeClassifier()
stem_DTC = DecisionTreeClassifier()

In [89]:
tf_idf_DTC.fit(x_train, y_train)
stem_DTC.fit(x_train_stem, y_train_stem)

### Model Evaluation

#### TF IDF

In [90]:
tf_idf_train_pred_DTC = tf_idf_DTC.predict(x_train)
tf_idf_test_pred_DTC = tf_idf_DTC.predict(x_test)

In [91]:
tf_idf_train_score_DTC = tf_idf_DTC.score(x_train, y_train)


In [92]:
tf_idf_test_score_DTC = tf_idf_DTC.score(x_test, y_test)

In [93]:
print(classification_report(y_train, tf_idf_train_pred_DTC))
print(classification_report(y_test, tf_idf_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16425
           1       1.00      1.00      1.00     15003

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7056
           1       1.00      1.00      1.00      6414

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



#### TF IDF with Stemming and Stopwords Score

In [94]:
stem_train_pred_DTC = stem_DTC.predict(x_train_stem)
stem_test_pred_DTC = stem_DTC.predict(x_test_stem)

In [95]:
stem_train_score_DTC = stem_DTC.score(x_train_stem, y_train_stem)

In [96]:
stem_test_score_DTC = stem_DTC.score(x_test_stem, y_test_stem)

In [97]:
print(classification_report(y_train_stem, stem_train_pred_DTC))
print(classification_report(y_test_stem, stem_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16425
           1       1.00      1.00      1.00     15003

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7056
           1       1.00      1.00      1.00      6414

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



### Decision Tree Classifier Parameter Optimization

In [None]:
# Definisikan parameter grid
tf_idf_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Definisikan GridSearchCV
tf_idf_grid_search_dt = GridSearchCV(estimator=tf_idf_DTC, param_grid=tf_idf_param_grid, cv=5, scoring='accuracy')

# Fit model
tf_idf_grid_search_dt.fit(x_train, y_train)

stem_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Definisikan GridSearchCV
stem_grid_search_dt = GridSearchCV(estimator=stem_DTC, param_grid=stem_param_grid, cv=5, scoring='accuracy')

# Fit model
stem_grid_search_dt.fit(x_train_stem, y_train_stem)



In [None]:
print(f"Best parameters for TF IDF Decision Tree Classifier: {tf_idf_grid_search_dt.best_params_}")

print(f"Best parameters for Stem and Stopwords Decision Tree Classifier: {stem_grid_search_dt.best_params_}")

Best parameters for TF IDF Decision Tree Classifier: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best parameters for Stem and Stopwords Decision Tree Classifier: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}


### Decision Tree Clasifier Model after Optimisasi

In [98]:
optimized_tf_idf_DTC = DecisionTreeClassifier(max_depth=30, min_samples_leaf=1, min_samples_split=10)
optimized_stem_DTC = DecisionTreeClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=10)

In [99]:
optimized_tf_idf_DTC.fit(x_train, y_train)
optimized_stem_DTC.fit(x_train_stem, y_train_stem)

#### TF IDF

In [100]:
optimized_tf_idf_train_pred_DTC = optimized_tf_idf_DTC.predict(x_train)
optimized_tf_idf_test_pred_DTC = optimized_tf_idf_DTC.predict(x_test)

In [101]:
optimized_tf_idf_train_score_DTC = optimized_tf_idf_DTC.score(x_train, y_train)

In [102]:
optimized_tf_idf_test_score_DTC = optimized_tf_idf_DTC.score(x_test, y_test)

In [155]:
print(classification_report(y_train, tf_idf_train_pred_lr))
print(classification_report(y_test, tf_idf_test_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16449
           1       1.00      1.00      1.00     14979

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7032
           1       1.00      1.00      1.00      6438

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



#### TF IDF With Stemming & Stopwords

In [103]:
optimized_stem_train_pred_DTC = optimized_stem_DTC.predict(x_train_stem)
optimized_stem_test_pred_DTC = optimized_stem_DTC.predict(x_test_stem)

In [104]:
optimized_stem_train_score_DTC = optimized_stem_DTC.score(x_train_stem, y_train_stem)

In [105]:
optimized_stem_test_score_DTC = optimized_stem_DTC.score(x_test_stem, y_test_stem)

In [106]:
print(classification_report(y_train_stem, optimized_stem_train_pred_DTC))
print(classification_report(y_test_stem, optimized_stem_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16425
           1       1.00      1.00      1.00     15003

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7056
           1       1.00      1.00      1.00      6414

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



### Score Differential

#### TF IDF

In [107]:
print(f"Decision Tree Classifier TF IDF Train Score: {tf_idf_train_score_LR}")
print(f"Decision Tree Classifier TF IDF Test Score: {tf_idf_test_score_LR}")
print(f"Optimized Decision Tree Classifier TF IDF Train Score: {optimized_tf_idf_train_score_LR}")
print(f"Optimized Decision Tree Classifier TF IDF Test Score: {optimized_tf_idf_test_score_LR}")

print("")

print(f"Perbedaan skor model setelah optimisasi pada train data: {optimized_tf_idf_train_score_DTC - tf_idf_train_score_DTC} = {((optimized_tf_idf_train_score_DTC - tf_idf_train_score_DTC) / tf_idf_train_score_DTC) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada test data: {optimized_tf_idf_test_score_DTC - tf_idf_test_score_DTC}) = {((optimized_tf_idf_test_score_DTC - tf_idf_test_score_DTC) / tf_idf_test_score_DTC) * 100:.2f}%")

Decision Tree Classifier TF IDF Train Score: 0.9929998727249586
Decision Tree Classifier TF IDF Test Score: 0.9881217520415738
Optimized Decision Tree Classifier TF IDF Train Score: 0.9999681812396589
Optimized Decision Tree Classifier TF IDF Test Score: 0.997253155159614

Perbedaan skor model setelah optimisasi pada train data: -0.0012091128929616612 = -0.12%
Perbedaan skor model setelah optimisasi pada test data: -0.0008908685968819219) = -0.09%


#### TF IDF With Stemming & Stopwords

In [None]:
print(f"Logistic Regression TF IDF Train Score: {stem_train_score_LR}")
print(f"Logistic Regression TF IDF Test Score: {stem_test_score_LR}")
print(f"Optimized Logistic Regression TF IDF Train Score: {optimized_stem_train_score_LR}")
print(f"Optimized Logistic Regression TF IDF Test Score: {optimized_stem_test_score_LR}")

print("")

print(f"Perbedaan skor model setelah optimisasi pada train data: {optimized_stem_train_score_LR - stem_train_score_LR} = {((optimized_stem_train_score_LR - stem_train_score_LR) / stem_train_score_LR) * 100:.2f}%")
print(f"Perbedaan skor model setelah optimisasi pada test data: {optimized_stem_test_score_LR - stem_test_score_LR}) = {((optimized_stem_test_score_LR - stem_test_score_LR) / stem_test_score_LR) * 100:.2f}%")

## Model Implementation

In [None]:
def output_label(n):
  if n==0:
    return "It Is Fake News"
  elif n==1:
    return "It Is Genuine News"


In [None]:
def manual_testing(news):
    # Corrected syntax for defining dictionary
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    new_x_test = new_def_test["text"].apply(wordopt)

    # Assuming 'vectorization' is your vectorizer object
    new_xv_test = vectorization.transform(new_x_test)

    # Model predictions
    pred_lr = LR.predict(new_xv_test)
    pred_dtc = DTC.predict(new_xv_test)
    pred_rfc = RFC.predict(new_xv_test)

    return "\nLR Prediction: {}\nDTC Prediction: {}\nRFC Prediction: {}".format(
        output_label(pred_lr[0]),
        output_label(pred_dtc[0]),
        output_label(pred_rfc[0])
    )


In [None]:
news_article = str(input())

KeyboardInterrupt: Interrupted by user

In [None]:
manual_testing(news_article)