# Fake News Detector

## Import Library

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline as ImbPipeline
import os

## Import Data

In [2]:
true = pd.read_csv('True.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
fake = pd.read_csv('Fake.csv', engine='python', encoding='utf-8', on_bad_lines='skip')

In [3]:
true

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"


In [4]:
fake

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"


In [5]:
true['label'] = 1
fake['label'] = 0

# Data Preprocessing

## Data Integration

In [6]:
news = pd.concat([fake, true], axis=0)

In [7]:
news.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [8]:
news.tail()

Unnamed: 0,title,text,subject,date,label
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


## Data Cleaning

### Checking Null Values

In [9]:
news.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

### Dropping Unnecessary Column

In [10]:
news = news.drop(['title', 'subject', 'date'], axis=1)

In [11]:
news

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
21413,"LONDON (Reuters) - LexisNexis, a provider of l...",1
21414,MINSK (Reuters) - In the shadow of disused Sov...,1
21415,MOSCOW (Reuters) - Vatican Secretary of State ...,1


### Scramble Data

In [12]:
news = news.sample(frac=1)
news.reset_index(inplace=True)
news.drop(['index'], axis = 1, inplace=True)

In [13]:
news

Unnamed: 0,text,label
0,"LOWELLVILLE, Ohio (Reuters) - On a sweltering ...",1
1,One of the countries Mooch and her taxpayer fu...,0
2,"As we know, the final United Nations General A...",0
3,WASHINGTON (Reuters) - A diplomatic dispute be...,1
4,"The internet is one of the few, if not only, ...",0
...,...,...
44893,WASHINGTON (Reuters) - President Donald Trump ...,1
44894,BRASILIA (Reuters) - One of four ministers rep...,1
44895,"Before Lavar Ball, the ungrateful father of ac...",0
44896,BEIJING (Reuters) - A workaholic keen swimmer ...,1


### WordOPT

In [14]:
def wordopt(text):
    # Convert into lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|\www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits
    text = re.sub(r'\d', '', text)

    # Remove newline characters
    text = re.sub(r'\n', ' ', text)

    return text

In [15]:
news['text'] = news['text'].apply(wordopt)

In [16]:
news['text']

0        lowellville ohio reuters  on a sweltering even...
1        one of the countries mooch and her taxpayer fu...
2        as we know the final united nations general as...
3        washington reuters  a diplomatic dispute betwe...
4         the internet is one of the few if not only av...
                               ...                        
44893    washington reuters  president donald trump on ...
44894    brasilia reuters  one of four ministers repres...
44895    before lavar ball the ungrateful father of acc...
44896    beijing reuters  a workaholic keen swimmer wit...
44897    donald trump released a brilliant video with c...
Name: text, Length: 44898, dtype: object

## Stopwords and Stemming

In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [19]:
def preprocess_text(text):
    words = text.split()

    # Stemmer
    processed_words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]

    # Join Words
    return ' '.join(processed_words)

In [20]:
X_stem = [preprocess_text(sentence) for sentence in news['text']]

## Split Data

In [21]:
x = news['text']
y = news['label']

### Split Data: TF-IDF

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)

### Split Data: TF IDF & Stemming & Stopwords

In [23]:
x_train_stem, x_test_stem, y_train_stem, y_test_stem = train_test_split(X_stem, y, test_size=0.3, random_state=10)

# Model

## Logistic Regression Model

### Initializing Models: Logistic Regression

In [24]:
tf_idf_LR = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('log_reg', LogisticRegression())
])

stem_LR = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('log_reg', LogisticRegression())
])

full_LR = ImbPipeline([
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE()),
    ('scaler', StandardScaler(with_mean=False)),
    ('log_reg', LogisticRegression(class_weight='balanced'))
])

In [25]:
tf_idf_LR.fit(x_train, y_train)

In [26]:
stem_LR.fit(x_train_stem, y_train_stem)

In [None]:
full_LR.fit(x_train_stem, y_train_stem)

#### Model Evaluation: Logistic Regression

In [None]:
tf_idf_train_pred_lr = tf_idf_LR.predict(x_train)
tf_idf_test_pred_lr = tf_idf_LR.predict(x_test)

In [None]:
stem_train_pred_lr = stem_LR.predict(x_train_stem)
stem_test_pred_lr = stem_LR.predict(x_test_stem)

In [None]:
full_train_pred_lr = full_LR.predict(x_train_stem)
full_test_pred_lr = full_LR.predict(x_test_stem)

##### Classification Report

In [None]:
print(classification_report(y_train, tf_idf_train_pred_lr))
print(classification_report(y_test, tf_idf_test_pred_lr))

In [None]:
print(classification_report(y_train_stem, stem_train_pred_lr))
print(classification_report(y_test_stem, stem_test_pred_lr))

In [None]:
print(classification_report(y_train_stem, full_train_pred_lr))
print(classification_report(y_test_stem, full_test_pred_lr))

##### Logistic Regression Score

In [None]:
tf_idf_train_score_LR = tf_idf_LR.score(x_train, y_train)
tf_idf_test_score_LR = tf_idf_LR.score(x_test, y_test)

stem_train_score_LR = stem_LR.score(x_train_stem, y_train_stem)
stem_test_score_LR = stem_LR.score(x_test_stem, y_test_stem)

full_train_score_LR = full_LR.score(x_train_stem, y_train_stem)
full_test_score_LR = full_LR.score(x_test_stem, y_test_stem)

##### Model Comparison (Logistic Regression)

In [None]:
LR_models_train_score = [
    ('Logistic Regression TF IDF (Train Score)', tf_idf_train_score_LR, tf_idf_LR),
    ('Logistic Regression Stemming, Stopwords, TF IDF (Train Score)', stem_train_score_LR, stem_LR),
    ('Logistic Regression Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler (Train Score)', full_train_score_LR, full_LR),
]

LR_models_test_score = [
    ('Logistic Regression TF IDF (Test Score)', tf_idf_test_score_LR, tf_idf_LR),
    ('Logistic Regression Stemming, Stopwords, TF IDF (Test Score)', stem_test_score_LR, stem_LR),
    ('Logistic Regression Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler (Test Score)', full_test_score_LR, full_LR),
]

LR_models_train_score_sorted = sorted(LR_models_train_score, key=lambda x: x[1], reverse=True)
LR_models_test_score_sorted = sorted(LR_models_test_score, key=lambda x: x[1], reverse=True)

# Percentage differentials for training scores
LR_max_train_score = LR_models_train_score_sorted[0][1]
LR_train_percentage_differentials = [(model[1] - LR_max_train_score) / LR_max_train_score * 100 for model in LR_models_train_score_sorted]

# Percentage differentials for test scores
LR_max_test_score = LR_models_test_score_sorted[0][1]
LR_test_percentage_differentials = [(model[1] - LR_max_test_score) / LR_max_test_score * 100 for model in LR_models_test_score_sorted]

print("\nModel Rankings based on Train Accuracy:")
for rank, (desc, score, model) in enumerate(LR_models_train_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({LR_train_percentage_differentials[rank-1]:.2f}%)")

print("\nModel Rankings based on Test Accuracy:")
for rank, (desc, score, model) in enumerate(LR_models_test_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({LR_test_percentage_differentials[rank-1]:.2f}%)")

### Logistic Regression Hyperparameter Optimization

#### GridSearchCV

In [None]:
# GridSearchCV Parameters
LR_param_grid = {
    'tfidf__max_df': [0.75, 1.0],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'log_reg__C': [0.1, 1.0, 10.0],
    'log_reg__penalty': ['l1', 'l2'],
    'log_reg__solver': ['liblinear', 'saga'],
    'log_reg__max_iter': [1000, 5000]
}

In [None]:
# TF IDF GridSearchCV
tf_idf_LR_random_search = GridSearchCV(estimator=tf_idf_LR, param_grid=LR_param_grid, cv=5, scoring='accuracy', n_jobs=4)
tf_idf_LR_random_search.fit(x_train, y_train)

In [None]:
# TF-IDF with Stemming & Stopwords GridSearchCV
stem_LR_random_search = GridSearchCV(estimator=stem_LR, param_grid=LR_param_grid, cv=5, scoring='accuracy', n_jobs=4)
stem_LR_random_search.fit(x_train_stem, y_train_stem)

In [None]:
# TF-IDF with Stemming, Stopwords, Class Balancing, and Standard Scaler GridSearchCV
full_LR_random_search = GridSearchCV(estimator=full_LR, param_grid=LR_param_grid, cv=5, scoring='accuracy', n_jobs=4)
full_LR_random_search.fit(x_train_stem, y_train_stem)



In [None]:
# Best Parameters
print(f"Best parameters for TF IDF Logistic Regression: {tf_idf_LR_random_search.best_params_}")
print(f"Best parameters for Stemming and Stopwords Logistic Regression: {stem_LR_random_search.best_params_}")
print(f"Best parameters for Stemming, Stopwords, Class Balancing, and Standard Scaler Logistic Regression: {full_LR_random_search.best_params_}")

Best parameters for TF IDF Logistic Regression: {'log_reg__C': 10.0, 'log_reg__max_iter': 5000, 'log_reg__penalty': 'l1', 'log_reg__solver': 'liblinear', 'tfidf__max_df': 0.75, 'tfidf__ngram_range': (1, 2)}
Best parameters for Stemming and Stopwords Logistic Regression: {'log_reg__C': 10.0, 'log_reg__max_iter': 1000, 'log_reg__penalty': 'l1', 'log_reg__solver': 'liblinear', 'tfidf__max_df': 1.0, 'tfidf__ngram_range': (1, 2)}
Best parameters for Stemming, Stopwords, Class Balancing, and Standard Scaler Logistic Regression: {'log_reg__C': 10.0, 'log_reg__max_iter': 1000, 'log_reg__penalty': 'l1', 'log_reg__solver': 'liblinear', 'tfidf__max_df': 1.0, 'tfidf__ngram_range': (1, 2)}


#### Initializing Models: Logistic Regression Model After GridSearchCV

In [None]:
tf_idf_LR.set_params(
    tfidf__max_df=0.75,
    tfidf__ngram_range=(1, 2),
    log_reg__C=10.0,
    log_reg__max_iter=5000,
    log_reg__penalty='l1',
    log_reg__solver='liblinear'
)
optimized_tf_idf_LR = tf_idf_LR

stem_LR.set_params(
    tfidf__max_df=1.0,
    tfidf__ngram_range=(1, 2),
    log_reg__C=10.0,
    log_reg__max_iter=1000,
    log_reg__penalty='l1',
    log_reg__solver='liblinear'
)
optimized_stem_LR = stem_LR

full_LR.set_params(
    tfidf__max_df=1.0,
    tfidf__ngram_range=(1, 2),
    log_reg__C=10.0,
    log_reg__max_iter=1000,
    log_reg__penalty='l1',
    log_reg__solver='liblinear'
)
optimized_full_LR = full_LR

##### Model Evaluation: Optimized Logistic Regression

In [None]:
optimized_tf_idf_train_pred_LR = optimized_tf_idf_LR.predict(x_train)
optimized_tf_idf_test_pred_LR = optimized_tf_idf_LR.predict(x_test)

In [None]:
optimized_stem_train_pred_LR = optimized_stem_LR.predict(x_train_stem)
optimized_stem_test_pred_LR = optimized_stem_LR.predict(x_test_stem)

In [None]:
optimized_full_train_pred_LR = optimized_full_LR.predict(x_train_stem)
optimized_full_test_pred_LR = optimized_full_LR.predict(x_test_stem)

###### Classification Report

In [None]:
print(classification_report(y_train, optimized_tf_idf_train_pred_LR))
print(classification_report(y_test, optimized_tf_idf_test_pred_LR))

In [None]:
print(classification_report(y_train_stem, optimized_stem_train_pred_LR))
print(classification_report(y_test_stem, optimized_stem_test_pred_LR))

In [None]:
print(classification_report(y_train_stem, optimized_full_train_pred_LR))
print(classification_report(y_test_stem, optimized_full_test_pred_LR))

###### Optimized Logistic Regression Score

In [None]:
optimized_tf_idf_train_score_LR = optimized_tf_idf_LR.score(x_train, y_train)
optimized_tf_idf_test_score_LR = optimized_tf_idf_LR.score(x_test, y_test)

optimized_stem_train_score_LR = optimized_stem_LR.score(x_train_stem, y_train_stem)
optimized_stem_test_score_LR = optimized_stem_LR.score(x_test_stem, y_test_stem)

optimized_full_train_score_LR = optimized_full_LR.score(x_train_stem, y_train_stem)
optimized_full_test_score_LR = optimized_full_LR.score(x_test_stem, y_test_stem)

###### Model Comparison (Optimized Logistic Regression)

In [None]:
LR_optimized_models_train_score = [
    ('Optimized Logistic Regression TF IDF (Train Score)', optimized_tf_idf_train_score_LR, optimized_tf_idf_LR),
    ('Optimized Logistic Regression Stemming, Stopwords, TF IDF (Train Score)', optimized_stem_train_score_LR, optimized_stem_LR),
    ('Optimized Logistic Regression Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler (Train Score)', optimized_full_train_score_LR, optimized_full_LR)
]

LR_optimized_models_test_score = [

    ('Optimized Logistic Regression TF IDF (Test Score)', optimized_tf_idf_test_score_LR, optimized_tf_idf_LR),
    ('Optimized Logistic Regression Stemming, Stopwords, TF IDF (Test Score)', optimized_stem_test_score_LR, optimized_stem_LR),
    ('Optimized Logistic Regression Stemming, Stopwords, TF IDF (Test Score), Class Balancing, and Standard Scaler', optimized_full_test_score_LR, optimized_full_LR)
]

LR_optimized_models_train_score_sorted = sorted(LR_optimized_models_train_score, key=lambda x: x[1], reverse=True)
LR_optimized_models_test_score_sorted = sorted(LR_optimized_models_test_score, key=lambda x: x[1], reverse=True)

LR_optimized_max_train_score = LR_optimized_models_train_score_sorted[0][1]
LR_optimized_train_percentage_differentials = [(model[1] - LR_optimized_max_train_score) / LR_optimized_max_train_score * 100 for model in LR_optimized_models_train_score_sorted]

LR_max_test_score = LR_optimized_models_test_score_sorted[0][1]
LR_test_percentage_differentials = [(model[1] - LR_max_test_score) / LR_max_test_score * 100 for model in LR_optimized_models_test_score_sorted]

print("\nOptimized Model Rankings based on Train Accuracy:")
for rank, (desc, score, model) in enumerate(LR_optimized_models_train_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({LR_optimized_train_percentage_differentials[rank-1]:.2f}%)")

print("\nOptimized Model Rankings based on Test Accuracy:")
for rank, (desc, score, model) in enumerate(LR_optimized_models_test_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({LR_test_percentage_differentials[rank-1]:.2f}%)")

### Models Rankings: Logistic Regression

In [None]:
LR_combined_models_train_score = LR_models_train_score + LR_optimized_models_train_score
LR_combined_models_test_score = LR_models_test_score + LR_optimized_models_test_score

LR_combined_models_train_score_sorted = sorted(LR_combined_models_train_score, key=lambda x: x[1], reverse=True)
LR_combined_models_test_score_sorted = sorted(LR_combined_models_test_score, key=lambda x: x[1], reverse=True)

LR_combined_max_train_score = LR_combined_models_train_score_sorted[0][1]
LR_combined_train_percentage_differentials = [(model[1] - LR_combined_max_train_score) / LR_combined_max_train_score * 100 for model in LR_combined_models_train_score_sorted]

LR_combined_max_test_score = LR_combined_models_test_score_sorted[0][1]
LR_combined_test_percentage_differentials = [(model[1] - LR_combined_max_test_score) / LR_combined_max_test_score * 100 for model in LR_combined_models_test_score_sorted]

print("\nOptimized Model Rankings based on Train Accuracy:")
for rank, (desc, score, model) in enumerate(LR_combined_models_train_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({LR_combined_train_percentage_differentials[rank-1]:.2f}%)")

print("\nOptimized Model Rankings based on Test Accuracy:")
for rank, (desc, score, model) in enumerate(LR_combined_models_test_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({LR_combined_test_percentage_differentials[rank-1]:.2f}%)")

## Decision Tree Classifier Model

### Initializing Models: Decision Tree Classifier

In [None]:
tf_idf_DTC = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dtc', DecisionTreeClassifier())
])

stem_DTC = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dtc', DecisionTreeClassifier())
])

full_DTC = ImbPipeline([
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE()),
    ('dtc', DecisionTreeClassifier(class_weight='balanced'))
])

In [None]:
tf_idf_DTC.fit(x_train, y_train)

In [None]:
stem_DTC.fit(x_train_stem, y_train_stem)

In [None]:
full_DTC.fit(x_train_stem, y_train_stem)

### Model Evaluation: Decision Tree Classifier

In [None]:
tf_idf_train_pred_DTC = tf_idf_DTC.predict(x_train)
tf_idf_test_pred_DTC = tf_idf_DTC.predict(x_test)

In [None]:
stem_train_pred_DTC = stem_DTC.predict(x_train_stem)
stem_test_pred_DTC = stem_DTC.predict(x_test_stem)

In [None]:
full_train_pred_DTC = full_DTC.predict(x_train)
full_test_pred_DTC = full_DTC.predict(x_test)

#### Classification Report

In [None]:
print(classification_report(y_train, tf_idf_train_pred_DTC))
print(classification_report(y_test, tf_idf_test_pred_DTC))

In [None]:
print(classification_report(y_train_stem, stem_train_pred_DTC))
print(classification_report(y_test_stem, stem_test_pred_DTC))

In [None]:
print(classification_report(y_train_stem, stem_train_pred_DTC))
print(classification_report(y_test_stem, stem_test_pred_DTC))

#### Decision Tree Classifier Score

In [None]:
tf_idf_train_score_DTC = tf_idf_DTC.score(x_train, y_train)
tf_idf_test_score_DTC = tf_idf_DTC.score(x_test, y_test)

stem_train_score_DTC = stem_DTC.score(x_train_stem, y_train_stem)
stem_test_score_DTC = stem_DTC.score(x_test_stem, y_test_stem)

full_train_score_DTC = full_DTC.score(x_train_stem, y_train_stem)
full_test_score_DTC = full_DTC.score(x_test_stem, y_test_stem)

#### Model Comparison

In [None]:
DTC_models_train_score = [
    ('Decision Tree Classifier TF IDF (Train Score)', tf_idf_train_score_DTC, tf_idf_DTC),
    ('Decision Tree Classifier Stemming, Stopwords, TF IDF (Train Score)', stem_train_score_DTC, stem_DTC),
    ('Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler (Train Score)', full_train_score_DTC, full_DTC),
]

DTC_models_test_score = [
    ('Decision Tree Classifier TF IDF (Test Score)', tf_idf_test_score_DTC, tf_idf_DTC),
    ('Decision Tree Classifier Stemming, Stopwords, TF IDF (Test Score)', stem_test_score_DTC, stem_DTC),
    ('Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler (Test Score)', full_test_score_DTC, full_DTC),
]

DTC_models_train_score_sorted = sorted(DTC_models_train_score, key=lambda x: x[1], reverse=True)
DTC_models_test_score_sorted = sorted(DTC_models_test_score, key=lambda x: x[1], reverse=True)

DTC_max_train_score = DTC_models_train_score_sorted[0][1]
DTC_train_percentage_differentials = [(model[1] - DTC_max_train_score) / DTC_max_train_score * 100 for model in DTC_models_train_score_sorted]

DTC_max_test_score = DTC_models_test_score_sorted[0][1]
DTC_test_percentage_differentials = [(model[1] - DTC_max_test_score) / DTC_max_test_score * 100 for model in DTC_models_test_score_sorted]

print("\nModel Rankings based on Train Accuracy:")
for rank, (desc, score, model) in enumerate(DTC_models_train_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({DTC_train_percentage_differentials[rank-1]:.2f}%)")

print("\nModel Rankings based on Test Accuracy:")
for rank, (desc, score, model) in enumerate(DTC_models_test_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({DTC_test_percentage_differentials[rank-1]:.2f}%)")

### Decision Tree Classifier Hyperparameter Optimization

#### GridSearchCV

In [None]:
# GridSearchCV Parameters
param_grid_DTC = {
    'dtc__max_depth': [None, 5, 10, 20, 30, 40, 50],
    'dtc__min_samples_split': [2, 5, 10, 15, 20, 25, 30],
    'dtc__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7]
}

In [None]:
# TF-IDF GridSearchCV
tf_idf_grid_search_DTC = GridSearchCV(estimator=tf_idf_DTC, param_grid=param_grid_DTC, cv=5, scoring='accuracy', n_jobs=4)
tf_idf_grid_search_DTC.fit(x_train, y_train)

In [None]:
# TF-IDF with Stemming & Stopwords GridSearchCV
stem_grid_search_DTC = GridSearchCV(estimator=stem_DTC, param_grid=param_grid_DTC, cv=5, scoring='accuracy', n_jobs=4)
stem_grid_search_DTC.fit(x_train_stem, y_train_stem)

In [None]:
# TF-IDF with Stemming, Stopwords, Class Balancing, and Standard Scaler GridSearchCV
full_grid_search_DTC = GridSearchCV(estimator=full_DTC, param_grid=param_grid_DTC, cv=5, scoring='accuracy', n_jobs=4)
full_grid_search_DTC.fit(x_train_stem, y_train_stem)

In [None]:
print(f"Best parameters for TF IDF Decision Tree Classifier: {tf_idf_grid_search_DTC.best_params_}")
print(f"Best parameters for Stem and Stopwords Decision Tree Classifier: {stem_grid_search_DTC.best_params_}")
print(f"Best parameters for Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler: {full_grid_search_DTC.best_params_}")

Best parameters for TF IDF Decision Tree Classifier: {'dtc__max_depth': None, 'dtc__min_samples_leaf': 1, 'dtc__min_samples_split': 2}
Best parameters for Stem and Stopwords Decision Tree Classifier: {'dtc__max_depth': None, 'dtc__min_samples_leaf': 1, 'dtc__min_samples_split': 2}
Best parameters for Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler: {'dtc__max_depth': None, 'dtc__min_samples_leaf': 1, 'dtc__min_samples_split': 2}


#### Initializing Models: Decision Tree Classifier Model After GridSearchCV

In [None]:
tf_idf_DTC.set_params(
    dtc__max_depth=None,
    dtc__min_samples_leaf=1,
    dtc__min_samples_split=2
)

optimized_tf_idf_DTC = tf_idf_DTC

stem_DTC.set_params(
    dtc__max_depth=None,
    dtc__min_samples_leaf=1,
    dtc__min_samples_split=2
)

optimized_stem_DTC = stem_DTC

full_DTC.set_params(
    dtc__max_depth=None,
    dtc__min_samples_leaf=1,
    dtc__min_samples_split=2
)

optimized_full_DTC = full_DTC

##### Model Evaluation: Optimized Decision Tree Classifier

In [None]:
optimized_tf_idf_train_pred_DTC = optimized_tf_idf_DTC.predict(x_train)
optimized_tf_idf_test_pred_DTC = optimized_tf_idf_DTC.predict(x_test)

In [None]:
optimized_stem_train_pred_DTC = optimized_stem_DTC.predict(x_train_stem)
optimized_stem_test_pred_DTC = optimized_stem_DTC.predict(x_test_stem)

In [None]:
optimized_full_train_pred_DTC = optimized_full_DTC.predict(x_train_stem)
optimized_full_test_pred_DTC = optimized_full_DTC.predict(x_test_stem)

###### Classification Report

In [None]:
print(classification_report(y_train, tf_idf_train_pred_lr))
print(classification_report(y_test, tf_idf_test_pred_lr))

In [None]:
print(classification_report(y_train_stem, optimized_stem_train_pred_DTC))
print(classification_report(y_test_stem, optimized_stem_test_pred_DTC))

In [None]:
print(classification_report(y_train_stem, optimized_full_train_pred_DTC))
print(classification_report(y_test_stem, optimized_full_test_pred_DTC))

###### Optimized Decision Tree Classifier Score

In [None]:
optimized_tf_idf_train_score_DTC = optimized_tf_idf_DTC.score(x_train, y_train)
optimized_tf_idf_test_score_DTC = optimized_tf_idf_DTC.score(x_test, y_test)

optimized_stem_train_score_DTC = optimized_stem_DTC.score(x_train_stem, y_train_stem)
optimized_stem_test_score_DTC = optimized_stem_DTC.score(x_test_stem, y_test_stem)

optimized_full_train_score_DTC = optimized_full_DTC.score(x_train_stem, y_train_stem)
optimized_full_test_score_DTC = optimized_full_DTC.score(x_test_stem, y_test_stem)


###### Model Comparison (Optimized Logistic Regression)

In [None]:
DTC_optimized_models_train_score = [
    ('Optimized Decision Tree Classifier TF IDF (Train Score)', optimized_tf_idf_train_score_DTC, tf_idf_DTC),
    ('Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF (Train Score)', optimized_stem_train_score_DTC, stem_DTC),
    ('Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler (Train Score)', optimized_full_train_score_DTC, full_DTC),
]

DTC_optimized_models_test_score = [
    ('Optimized Decision Tree Classifier TF IDF (Test Score)', optimized_tf_idf_test_score_DTC, optimized_tf_idf_DTC),
    ('Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF (Test Score)', optimized_stem_test_score_DTC, optimized_stem_DTC),
    ('Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler (Test Score)', optimized_full_test_score_DTC, optimized_full_DTC),
]

DTC_optimized_models_train_score_sorted = sorted(DTC_optimized_models_train_score, key=lambda x: x[1], reverse=True)
DTC_optimized_models_test_score_sorted = sorted(DTC_optimized_models_test_score, key=lambda x: x[1], reverse=True)

DTC_optimized_max_train_score = DTC_optimized_models_train_score_sorted[0][1]
DTC_optimized_train_percentage_differentials = [(model[1] - DTC_optimized_max_train_score) / DTC_optimized_max_train_score * 100 for model in DTC_optimized_models_train_score_sorted]

DTC_optimized_max_test_score = DTC_optimized_models_test_score_sorted[0][1]
DTC_optimized_test_percentage_differentials = [(model[1] - DTC_optimized_max_test_score) / DTC_optimized_max_test_score * 100 for model in DTC_optimized_models_test_score_sorted]

print("\nModel Rankings based on Train Accuracy:")
for rank, (desc, score, model) in enumerate(DTC_optimized_models_train_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({DTC_optimized_train_percentage_differentials[rank-1]:.2f}%)")

print("\nModel Rankings based on Test Accuracy:")
for rank, (desc, score, model) in enumerate(DTC_optimized_models_test_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({DTC_optimized_test_percentage_differentials[rank-1]:.2f}%)")

### Models Rankings: Decision Tree Classifier

In [None]:
DTC_combined_models_train_score = DTC_models_train_score + DTC_optimized_models_train_score
DTC_combined_models_test_score = DTC_models_test_score + DTC_optimized_models_test_score

DTC_combined_models_train_score_sorted = sorted(DTC_combined_models_train_score, key=lambda x: x[1], reverse=True)
DTC_combined_models_test_score_sorted = sorted(DTC_combined_models_test_score, key=lambda x: x[1], reverse=True)

DTC_combined_max_train_score = DTC_combined_models_train_score_sorted[0][1]
DTC_combined_train_percentage_differentials = [(model[1] - DTC_combined_max_train_score) / DTC_combined_max_train_score * 100 for model in DTC_combined_models_train_score_sorted]

DTC_combined_max_test_score = DTC_combined_models_test_score_sorted[0][1]
DTC_combined_test_percentage_differentials = [(model[1] - DTC_combined_max_test_score) / DTC_combined_max_test_score * 100 for model in DTC_combined_models_test_score_sorted]

print("\nOptimized Model Rankings based on Train Accuracy:")
for rank, (desc, score, model) in enumerate(DTC_combined_models_train_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({DTC_combined_train_percentage_differentials[rank-1]:.2f}%)")

print("\nOptimized Model Rankings based on Test Accuracy:")
for rank, (desc, score, model) in enumerate(DTC_combined_models_test_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({DTC_combined_test_percentage_differentials[rank-1]:.2f}%)")

## Logistic Regression and Decision Tree Classifier Model Rankings

In [None]:
all_models_train_score = LR_combined_models_train_score + DTC_combined_models_train_score
all_models_test_score = LR_combined_models_test_score + DTC_combined_models_test_score

all_models_train_score_sorted = sorted(all_models_train_score, key=lambda x: x[1], reverse=True)
all_models_test_score_sorted = sorted(all_models_test_score, key=lambda x: x[1], reverse=True)

all_models_train_score = all_models_train_score_sorted[0][1]
all_models_train_percentage_differentials = [(model[1] - all_models_train_score) / all_models_train_score * 100 for model in all_models_train_score_sorted]

all_models_max_test_score = all_models_test_score_sorted[0][1]
all_models_test_percentage_differentials = [(model[1] - all_models_max_test_score) / all_models_max_test_score * 100 for model in all_models_test_score_sorted]

print("\nOptimized Model Rankings based on Train Accuracy:")
for rank, (desc, score, model) in enumerate(all_models_train_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({all_models_train_percentage_differentials[rank-1]:.2f}%)")

print("\nOptimized Model Rankings based on Test Accuracy:")
for rank, (desc, score, model) in enumerate(all_models_test_score_sorted, start=1):
    print(f"{rank}. {desc} -> {score:.5f} ({all_models_test_percentage_differentials[rank-1]:.2f}%)")

# Model Implementation

In [None]:
def output_label(n):
  if n==0:
    return "Fake News"
  elif n==1:
    return "Genuine News"

In [None]:
best_models = all_models_test_score_sorted[0]

In [None]:
print(f"Best Models\n{best_models[0]} -> {best_models[1]}")

Best Models
Logistic Regression Stemming, Stopwords, TF IDF (Test Score) -> 0.9964365256124722


In [None]:
def manual_testing(news):
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    new_x_test = new_def_test["text"].apply(wordopt)

    # Model predictions
    pred = best_models[2].predict(new_x_test)
    print(pred)

    return print(f"Text: {news}\nPrediction: {output_label(pred[0])}")


In [None]:
news_article = str(input())
manual_testing(news_article)

[1]
Text: House Intelligence Committee Chairman Devin Nunes is going to have a bad day. He s been under the assumption, like many of us, that the Christopher Steele-dossier was what prompted the Russia investigation so he s been lashing out at the Department of Justice and the FBI in order to protect Trump. As it happens, the dossier is not what started the investigation, according to documents obtained by the New York Times.Former Trump campaign adviser George Papadopoulos was drunk in a wine bar when he revealed knowledge of Russian opposition research on Hillary Clinton.On top of that, Papadopoulos wasn t just a covfefe boy for Trump, as his administration has alleged. He had a much larger role, but none so damning as being a drunken fool in a wine bar. Coffee boys  don t help to arrange a New York meeting between Trump and President Abdel Fattah el-Sisi of Egypt two months before the election. It was known before that the former aide set up meetings with world leaders for Trump, bu

In [None]:
news_article = str(input())
manual_testing(news_article)

[0]
Text: On Friday, it was revealed that former Milwaukee Sheriff David Clarke, who was being considered for Homeland Security Secretary in Donald Trump s administration, has an email scandal of his own.In January, there was a brief run-in on a plane between Clarke and fellow passenger Dan Black, who he later had detained by the police for no reason whatsoever, except that maybe his feelings were hurt. Clarke messaged the police to stop Black after he deplaned, and now, a search warrant has been executed by the FBI to see the exchanges.Clarke is calling it fake news even though copies of the search warrant are on the Internet. I am UNINTIMIDATED by lib media attempts to smear and discredit me with their FAKE NEWS reports designed to silence me,  the former sheriff tweeted.  I will continue to poke them in the eye with a sharp stick and bitch slap these scum bags til they get it. I have been attacked by better people than them #MAGA I am UNINTIMIDATED by lib media attempts to smear and