# Fake News Detector

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

## Import Data

In [None]:
true = pd.read_csv('True.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
fake = pd.read_csv('Fake.csv', engine='python', encoding='utf-8', on_bad_lines='skip')

In [None]:
true['label'] = 1
fake['label'] = 0

# Data Preprocessing

## Data Integration

In [None]:
news = pd.concat([fake, true], axis=0)

In [None]:
news.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [None]:
news.tail()

Unnamed: 0,title,text,subject,date,label
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


## Data Cleaning

### Checking Null Values

In [None]:
news.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

### Dropping Unnecessary Columns

In [None]:
# Keep only the article body and its label. DataFrame.drop returns a new
# frame, so the result has to be assigned back for the columns to go away.
news = news.drop(['title', 'subject', 'date'], axis=1)

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
21413,"LONDON (Reuters) - LexisNexis, a provider of l...",1
21414,MINSK (Reuters) - In the shadow of disused Sov...,1
21415,MOSCOW (Reuters) - Vatican Secretary of State ...,1


In [None]:
news

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


### Scramble Data

In [None]:
# Shuffle every row so fake and genuine articles are interleaved.
# random_state pins the order so the later seeded train/test split is
# reproducible; drop=True discards the old index instead of keeping it as a
# column that then has to be removed.
news = news.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
news

Unnamed: 0,title,text,subject,date,label
0,Trump Official ADMITS He Is Collecting ‘Dossi...,Reality TV star Omarosa Manigault has found he...,News,"February 14, 2017",0
1,New Trump travel order expected in coming days...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"March 1, 2017",1
2,"Trump Is Going To Make Measles Great Again, W...",Donald Trump says he surrounds himself with t...,News,"January 10, 2017",0
3,BREAKING BAD: John McCain’s Campaign Rocked by...,21st Century Wire asks Will this be the beginn...,US_News,"April 28, 2016",0
4,Former homeland secretary says FBI delayed not...,WASHINGTON (Reuters) - Former Homeland Securit...,politicsNews,"June 21, 2017",1
...,...,...,...,...,...
44893,Russian military chief meets NATO General to s...,"MOSCOW (Reuters) - General Valery Gerasimov, t...",worldnews,"September 7, 2017",1
44894,Vietnam's Facebook dissidents test the limits ...,"HANOI (Reuters) - This isn t like China, say...",worldnews,"August 29, 2017",1
44895,Merkel warns Hungary of financial consequences...,BERLIN (Reuters) - Hungary could face financia...,worldnews,"September 15, 2017",1
44896,"Another debate brawl? Fox News, Kelly set for ...",LOS ANGELES (Reuters) - Thursday’s Republican ...,politicsNews,"March 2, 2016",1


## Feature Extraction (TF-IDF)

In [None]:
import re

In [None]:
def wordopt(text):
    """Normalise a raw article string before vectorisation.

    The steps run in a fixed order: lowercase, strip URLs, strip HTML tags,
    drop punctuation, drop digits, then turn newlines into spaces.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed tokens can
        leave consecutive or trailing spaces behind.
    """
    # Lowercase first so the patterns below only need to match one case.
    text = text.lower()

    # Remove URLs. The "www." branch used to be written as `\www\.`, which the
    # regex engine reads as `\w` followed by "ww." and therefore also deleted
    # ordinary text such as "aww.so".
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags (non-greedy so each tag is matched separately).
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation: anything that is neither a word character nor
    # whitespace.
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits.
    text = re.sub(r'\d', '', text)

    # Replace newline characters with spaces so words on adjacent lines do
    # not get glued together.
    text = re.sub(r'\n', ' ', text)

    return text


In [None]:
news['text'] = news['text'].apply(wordopt)

In [None]:
news['text']

0        reality tv star omarosa manigault has found he...
1        washington reuters  us president donald trump ...
2        donald trump says he surrounds himself with  t...
3        st century wire asks will this be the beginnin...
4        washington reuters  former homeland security s...
                               ...                        
44893    moscow reuters  general valery gerasimov the c...
44894    hanoi reuters   this isn t like china  says vi...
44895    berlin reuters  hungary could face financial c...
44896    los angeles reuters  thursdays republican deba...
44897    the cincinnati enquirer has endorsed republica...
Name: text, Length: 44898, dtype: object

## Stopwords and Stemming

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [None]:
def preprocess_text(text):
    """Drop stop words from `text` and stem what remains.

    The text is split on whitespace; a token is discarded when its lowercase
    form is in the module-level `stop_words` set, otherwise it is passed
    through the module-level `stemmer`. The surviving stems are joined back
    together with single spaces.
    """
    kept_stems = []
    for token in text.split():
        # Skip stop words; the membership test is case-insensitive.
        if token.lower() in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))

    # Re-assemble the processed tokens into one string.
    return ' '.join(kept_stems)

## Data Transformation

In [None]:
x = news['text']
y = news['label']

In [None]:
x.shape

(44898,)

### TF-IDF Vectorizer

In [None]:
vectorization = TfidfVectorizer()

In [None]:
x_tfidf = vectorization.fit_transform(x)

### Stemming and Stopwords

In [None]:
X_train_processed = [preprocess_text(sentence) for sentence in x]

In [None]:
def custom_tokenizer(text):
    """Return the list of tokens produced by running `preprocess_text` on `text`."""
    processed = preprocess_text(text)
    tokens = processed.split()
    return tokens

In [None]:
# The documents fed to this vectorizer (X_train_processed) have already been
# stop-word-filtered and stemmed, so they only need splitting on whitespace;
# tokenising with custom_tokenizer would run the stemmer over every token a
# second time. token_pattern is unused once a tokenizer is supplied, so it is
# set to None explicitly.
vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None)

In [None]:
x_vectorized = vectorizer.fit_transform(X_train_processed)



# Split Data

### Split Data: TF-IDF

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.3, random_state=10)

### Split Data: Stemming & Stopwords

In [None]:
x_train_stem, x_test_stem, y_train_stem, y_test_stem = train_test_split(x_vectorized, y, test_size=0.3, random_state=10)

# Model

## Pembuatan Model Logistic Regression

### Model Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# L1-penalised model for the TF-IDF features, fitted with the liblinear solver.
tf_idf_LR = LogisticRegression(C=1, penalty='l1', solver='liblinear')

# Model for the stemmed term-count features. With the default iteration limit
# the solver stopped before converging ("TOTAL NO. of ITERATIONS REACHED
# LIMIT"), so the limit is raised.
stem_LR = LogisticRegression(max_iter=1000)


In [None]:
tf_idf_LR.fit(x_train, y_train)
stem_LR.fit(x_train_stem, y_train_stem)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
tf_idf_train_pred_lr = tf_idf_LR.predict(x_train)
tf_idf_test_pred_lr = tf_idf_LR.predict(x_test)

stem_train_pred_lr = stem_LR.predict(x_train_stem)
stem_test_pred_lr = stem_LR.predict(x_test_stem)

In [None]:
tf_idf_LR.score(x_train, y_train)

0.9941453480972381

In [None]:
tf_idf_LR.score(x_test, y_test)

0.9943578322197476

In [None]:
stem_LR.score(x_train_stem, y_train_stem)

0.9999681812396589

In [None]:
stem_LR.score(x_test_stem, y_test_stem)

0.995916852264291

In [None]:
print(classification_report(y_train, tf_idf_train_pred_lr))
print(classification_report(y_test, tf_idf_test_pred_lr))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     16475
           1       0.99      1.00      0.99     14953

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      7006
           1       0.99      1.00      0.99      6464

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [None]:
print(classification_report(y_train_stem, stem_train_pred_lr))
print(classification_report(y_test_stem, stem_test_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16475
           1       1.00      1.00      1.00     14953

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7006
           1       1.00      1.00      1.00      6464

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



### Optimasi Parameter Pada Logistic Regression

In [None]:
# Candidate logistic-regression settings: regularisation strength, penalty
# type and solver.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    penalty=['l1', 'l2', 'elasticnet', 'none'],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search only penalty/solver pairs that LogisticRegression can actually fit.
# The previous full cross-product wasted many fits on unsupported pairs
# (for example 'l1' with 'lbfgs') and on 'elasticnet' without an l1_ratio.
# NOTE(review): the unpenalised option is left out because its spelling
# differs between scikit-learn releases ('none' vs None); add it back for the
# installed version if it is wanted.
tf_idf_param_grid = [
    {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
    {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
    },
]

# 5-fold cross-validated search on the TF-IDF features, scored by accuracy.
# The search gets its own estimator (it previously reused stem_LR, the model
# for the stemmed features), with a higher iteration limit because the
# default was being exhausted before convergence.
tf_idf_grid_search_log_reg = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=tf_idf_param_grid,
    cv=5,
    scoring='accuracy',
)

# Run the search on the TF-IDF training split.
tf_idf_grid_search_log_reg.fit(x_train, y_train)


# Same candidates for the stemmed term-count features; copied so the two
# grids stay independent objects.
stem_param_grid = [dict(candidates) for candidates in tf_idf_param_grid]

# 5-fold cross-validated search on the stemmed features, scored by accuracy.
stem_grid_search_log_reg = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=stem_param_grid,
    cv=5,
    scoring='accuracy',
)

# Run the search on the stemmed training split.
stem_grid_search_log_reg.fit(x_train_stem, y_train_stem)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
# Hasil terbaik
print(f"Best parameters for Logistic Regression: {tf_idf_grid_search_log_reg.best_params_}")
print(f"Best parameters for Logistic Regression: {stem_grid_search_log_reg.best_params_}")

### Model Setelah Optimisasi

## Pembuatan Model Decision Tree Classifier

### Model Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
tf_idf_DTC = DecisionTreeClassifier()
stem_DTC = DecisionTreeClassifier()

In [None]:
# DTC.fit(x_train, y_train)

tf_idf_DTC.fit(x_train, y_train)
stem_DTC.fit(x_train_stem, y_train_stem)

In [None]:
# train_pred_dtc = DTC.predict(x_train)
# test_pred_dtc = DTC.predict(x_test)

tf_idf_train_pred_DTC = tf_idf_DTC.predict(x_train)
tf_idf_test_pred_DTC = tf_idf_DTC.predict(x_test)

stem_train_pred_DTC = stem_DTC.predict(x_train_stem)
stem_test_pred_DTC = stem_DTC.predict(x_test_stem)

In [None]:
tf_idf_DTC.score(x_train, y_train)


In [None]:
tf_idf_DTC.score(x_test, y_test)

In [None]:
stem_DTC.score(x_train_stem, y_train_stem)

In [None]:
stem_DTC.score(x_test_stem, y_test_stem)

In [None]:
print(classification_report(y_train, tf_idf_train_pred_DTC))
print(classification_report(y_test, tf_idf_test_pred_DTC))

In [None]:
print(classification_report(y_train_stem, stem_train_pred_DTC))
print(classification_report(y_test_stem, stem_test_pred_DTC))

### Optimasi Parameter Pada Decision Tree classifier

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate tree settings for the model trained on TF-IDF features.
tf_idf_param_grid = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated exhaustive search, scored by accuracy.
tf_idf_grid_search_dt = GridSearchCV(
    estimator=tf_idf_DTC,
    param_grid=tf_idf_param_grid,
    scoring='accuracy',
    cv=5,
)

# Run the search on the TF-IDF training split and report the winner.
tf_idf_grid_search_dt.fit(x_train, y_train)
print(f"Best parameters for Decision Tree Classifier: {tf_idf_grid_search_dt.best_params_}")

# Same candidates for the model trained on the stemmed count features.
stem_param_grid = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated exhaustive search, scored by accuracy.
stem_grid_search_dt = GridSearchCV(
    estimator=stem_DTC,
    param_grid=stem_param_grid,
    scoring='accuracy',
    cv=5,
)

# Run the search on the stemmed training split and report the winner.
stem_grid_search_dt.fit(x_train_stem, y_train_stem)
print(f"Best parameters for Decision Tree Classifier: {stem_grid_search_dt.best_params_}")


### Model DTC Setelah Optimisasi

## Pembuatan Model Random Forest Classifier

### Model Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
tf_idf_RFC = RandomForestClassifier()
stem_RFC = RandomForestClassifier()

In [None]:
tf_idf_RFC.fit(x_train, y_train)
stem_RFC.fit(x_train_stem, y_train_stem)

In [None]:
tf_idf_train_pred_RFC = tf_idf_RFC.predict(x_train)
tf_idf_test_pred_RFC = tf_idf_RFC.predict(x_test)

stem_train_pred_RFC = stem_RFC.predict(x_train_stem)
stem_test_pred_RFC = stem_RFC.predict(x_test_stem)

In [None]:
tf_idf_RFC.score(x_train, y_train)


In [None]:
tf_idf_RFC.score(x_test, y_test)


In [None]:
stem_RFC.score(x_train_stem, y_train_stem)


In [None]:
stem_RFC.score(x_test_stem, y_test_stem)

In [None]:
print(classification_report(y_train, tf_idf_train_pred_RFC))
print(classification_report(y_test, tf_idf_test_pred_RFC))

In [None]:
print(classification_report(y_train_stem, stem_train_pred_RFC))
print(classification_report(y_test_stem, stem_test_pred_RFC))

### Optimasi Parameter Pada Random Forest Classifier

In [None]:
from sklearn.model_selection import GridSearchCV


# Candidate forest settings for the model trained on TF-IDF features.
tf_idf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated exhaustive search, scored by accuracy.
tf_idf_grid_search_rf = GridSearchCV(estimator=tf_idf_RFC, param_grid=tf_idf_param_grid, cv=5, scoring='accuracy')

# Run the search on the TF-IDF training split.
tf_idf_grid_search_rf.fit(x_train, y_train)

# Report the best combination found.
print(f"Best parameters for Random Forest Classifier: {tf_idf_grid_search_rf.best_params_}")

# Same candidates for the model trained on the stemmed count features.
stem_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# This search must use stem_param_grid. The module-level `param_grid` holds
# logistic-regression settings (C, penalty, solver), which are not parameters
# of RandomForestClassifier.
stem_grid_search_rf = GridSearchCV(estimator=stem_RFC, param_grid=stem_param_grid, cv=5, scoring='accuracy')

# Run the search on the stemmed training split.
stem_grid_search_rf.fit(x_train_stem, y_train_stem)

# Report the best combination found.
print(f"Best parameters for Random Forest Classifier: {stem_grid_search_rf.best_params_}")


### Model Setelah Optimisasi

# TESTING

In [None]:
def output_label(n):
    """Translate a predicted class into a display message.

    0 maps to the fake-news message and 1 to the genuine-news message; any
    other value yields None.
    """
    if n == 0:
        return "It Is Fake News"
    return "It Is Genuine News" if n == 1 else None


In [None]:
def manual_testing(news):
    """Classify one raw article with the three TF-IDF-based models.

    Parameters
    ----------
    news : str
        Raw article text. Inside this function the name shadows the
        module-level `news` DataFrame.

    Returns
    -------
    str
        A multi-line message with the Logistic Regression, Decision Tree and
        Random Forest verdicts, each rendered by `output_label`.
    """
    # Clean the article with the same function applied to the training text,
    # then project it into the feature space of the fitted TF-IDF vectorizer.
    cleaned_text = wordopt(news)
    new_xv_test = vectorization.transform([cleaned_text])

    # Use the models trained on TF-IDF features, since `vectorization` is the
    # TF-IDF vectorizer. The names LR, DTC and RFC referenced here before are
    # never defined, so the call failed with a NameError.
    pred_lr = tf_idf_LR.predict(new_xv_test)
    pred_dtc = tf_idf_DTC.predict(new_xv_test)
    pred_rfc = tf_idf_RFC.predict(new_xv_test)

    return "\nLR Prediction: {}\nDTC Prediction: {}\nRFC Prediction: {}".format(
        output_label(pred_lr[0]),
        output_label(pred_dtc[0]),
        output_label(pred_rfc[0])
    )


In [None]:
news_article = str(input())

KeyboardInterrupt: Interrupted by user

In [None]:
manual_testing(news_article)