# Fake News Detector

## Import Library

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import RandomizedSearchCV
import os

In [2]:
files = os.listdir()

for file in files:
    print(f"{file}")

.git
Fake.csv
Progress Laporan STKI Kelompok 2.docx
Progress Laporan STKI Kelompok 2.pdf
STKI_Fake_News_Detector.ipynb
True.csv


## Import Data

In [3]:
true = pd.read_csv('True.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
fake = pd.read_csv('Fake.csv', engine='python', encoding='utf-8', on_bad_lines='skip')

In [4]:
true

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"


In [5]:
fake

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"


In [6]:
true['label'] = 1
fake['label'] = 0

# Data Preprocessing

## Data Integration

In [7]:
news = pd.concat([fake, true], axis=0)

In [8]:
news.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [9]:
news.tail()

Unnamed: 0,title,text,subject,date,label
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


## Data Cleaning

### Checking Null Values

In [10]:
news.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

### Dropping Unnecessary Column

In [11]:
news = news.drop(['title', 'subject', 'date'], axis=1)

In [12]:
news

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
21413,"LONDON (Reuters) - LexisNexis, a provider of l...",1
21414,MINSK (Reuters) - In the shadow of disused Sov...,1
21415,MOSCOW (Reuters) - Vatican Secretary of State ...,1


### Scramble Data

In [13]:
news = news.sample(frac=1)
news.reset_index(inplace=True)
news.drop(['index'], axis = 1, inplace=True)

In [14]:
news

Unnamed: 0,text,label
0,NEW YORK (Reuters) - One would expect voters f...,1
1,WASHINGTON (Reuters) - Some key U.S. senators ...,1
2,A budget meeting was cut short after the mayor...,0
3,If Donald Trump thinks being president means h...,0
4,WASHINGTON (Reuters) - Federal and state autho...,1
...,...,...
44893,WASHINGTON (Reuters) - Democrat Hillary Clinto...,1
44894,"Fox News, Donald Trump, and the rest of the ha...",0
44895,While it seems like Donald Trump hasn t worked...,0
44896,"If I were U.S. District Judge Andrew Hanen, I ...",0


### WordOPT

In [15]:
def wordopt(text):
    # Convert into lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|\www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits
    text = re.sub(r'\d', '', text)

    # Remove newline characters
    text = re.sub(r'\n', ' ', text)

    return text


In [16]:
news['text'] = news['text'].apply(wordopt)

In [17]:
news['text']

0        new york reuters  one would expect voters from...
1        washington reuters  some key us senators still...
2        a budget meeting was cut short after the mayor...
3        if donald trump thinks being president means h...
4        washington reuters  federal and state authorit...
                               ...                        
44893    washington reuters  democrat hillary clinton m...
44894    fox news donald trump and the rest of the half...
44895    while it seems like donald trump hasn t worked...
44896    if i were us district judge andrew hanen i d b...
44897    kuala lumpur reuters  malaysian police arreste...
Name: text, Length: 44898, dtype: object

## Stopwords and Stemming

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kalea\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [20]:
def preprocess_text(text):
    words = text.split()

    # Hapus stopwords dan stem kata
    processed_words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]

    # Gabungkan kembali kata yang telah diproses
    return ' '.join(processed_words)

In [21]:
X_stem = [preprocess_text(sentence) for sentence in news['text']]

## Split Data

In [22]:
x = news['text']
y = news['label']

### Split Data: TF-IDF

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)

### Split Data: TF IDF & Stemming & Stopwords

In [24]:
x_train_stem, x_test_stem, y_train_stem, y_test_stem = train_test_split(X_stem, y, test_size=0.3, random_state=10)

# Model

## Logistic Regression Model

### Model Logistic Regression

In [25]:
tf_idf_LR = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('log_reg', LogisticRegression())
])

stem_LR = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('log_reg', LogisticRegression())
])

full_LR = ImbPipeline([
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE()),
    ('scaler', StandardScaler(with_mean=False)),
    ('log_reg', LogisticRegression(class_weight='balanced'))
])

In [26]:
tf_idf_LR.fit(x_train, y_train)
stem_LR.fit(x_train_stem, y_train_stem)
full_LR.fit(x_train_stem, y_train_stem)

### Model Evaluation

#### TF-IDF Score

In [27]:
tf_idf_train_pred_lr = tf_idf_LR.predict(x_train)
tf_idf_test_pred_lr = tf_idf_LR.predict(x_test)

In [28]:
tf_idf_train_score_LR = tf_idf_LR.score(x_train, y_train)
tf_idf_test_score_LR = tf_idf_LR.score(x_test, y_test)

In [29]:
print(classification_report(y_train, tf_idf_train_pred_lr))
print(classification_report(y_test, tf_idf_test_pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16495
           1       0.99      0.99      0.99     14933

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6986
           1       0.99      0.99      0.99      6484

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [30]:
print(f"Logistic Regression TF IDF Train Score: {tf_idf_train_score_LR}")
print(f"Logistic Regression TF IDF Test Score: {tf_idf_test_score_LR}")

Logistic Regression TF IDF Train Score: 0.9925225913198422
Logistic Regression TF IDF Test Score: 0.9882702301410542


#### TF-IDF with Stemming and Stopwords Score

In [31]:
stem_train_pred_lr = stem_LR.predict(x_train_stem)
stem_test_pred_lr = stem_LR.predict(x_test_stem)

In [32]:
stem_train_score_LR = stem_LR.score(x_train_stem, y_train_stem)
stem_test_score_LR = stem_LR.score(x_test_stem, y_test_stem)

In [33]:
print(classification_report(y_train_stem, stem_train_pred_lr))
print(classification_report(y_test_stem, stem_test_pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16495
           1       0.99      0.99      0.99     14933

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      6986
           1       0.98      0.99      0.98      6484

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [34]:
print(f"Stemming, Stopwords, TF IDF Train Score: {stem_train_score_LR}")
print(f"Stemming, Stopwords, TF IDF Test Score: {stem_test_score_LR}")

Stemming, Stopwords, TF IDF Train Score: 0.9910589283441517
Stemming, Stopwords, TF IDF Test Score: 0.9852264291017075


#### TF-IDF with Stemming, Stopwords, Class Balancing, and Standard Scaler Score

In [35]:
full_train_pred_lr = full_LR.predict(x_train_stem)
full_test_pred_lr = full_LR.predict(x_test_stem)

In [36]:
full_train_score_LR = full_LR.score(x_train_stem, y_train_stem)
full_test_score_LR = full_LR.score(x_test_stem, y_test_stem)

In [37]:
print(classification_report(y_train_stem, full_train_pred_lr))
print(classification_report(y_test_stem, full_test_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16495
           1       1.00      1.00      1.00     14933

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      6986
           1       0.95      0.99      0.97      6484

    accuracy                           0.97     13470
   macro avg       0.97      0.97      0.97     13470
weighted avg       0.97      0.97      0.97     13470



In [38]:
print(f"Stemming, Stopwords, Class Balancing, and Standard Scaler Train Score: {full_train_score_LR}")
print(f"Stemming, Stopwords, Class Balancing, and Standard Scaler Test Score: {full_test_score_LR}")

Stemming, Stopwords, Class Balancing, and Standard Scaler Train Score: 0.9999681812396589
Stemming, Stopwords, Class Balancing, and Standard Scaler Test Score: 0.9719376391982183


#### Model Comparison

In [39]:
# Model 1: Logistic Regression TF IDF Scores
print("Model 1: Logistic Regression TF IDF Scores Scores:")
print(f"Train Score: {tf_idf_train_score_LR}")
print(f"Test Score: {tf_idf_test_score_LR}")

# Model 2: Logistic Regression Stemming, Stopwords, TF IDF Scores
print("\nModel 2: Logistic Regression Stemming, Stopwords, TF IDF Scores:")
print(f"Train Score: {stem_train_score_LR}")
print(f"Test Score: {stem_test_score_LR}")

# Model 3: Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Scores
print("\nModel 3: Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Scores:")
print(f"Train Score: {full_train_score_LR}")
print(f"Test Score: {full_test_score_LR}")

print("")

# Percentage Differences between Model 1 and Model 2 on train and test data
print("Percentage Differences between Model 1 and Model 2:")
train_score_diff_1_2 = tf_idf_train_score_LR - stem_train_score_LR
test_score_diff_1_2 = tf_idf_test_score_LR - stem_test_score_LR
train_score_percent_diff_1_2 = (train_score_diff_1_2 / tf_idf_train_score_LR) * 100
test_score_percent_diff_1_2 = (test_score_diff_1_2 / tf_idf_test_score_LR) * 100
print(f"Train Data: {train_score_diff_1_2} = {train_score_percent_diff_1_2:.2f}%")
print(f"Test Data: {test_score_diff_1_2} = {test_score_percent_diff_1_2:.2f}%")

# Percentage Differences between Model 1 and Model 3 on train and test data
print("\nPercentage Differences between Model 1 and Model 3:")
train_score_diff_1_3 = tf_idf_train_score_LR - full_train_score_LR
test_score_diff_1_3 = tf_idf_test_score_LR - full_test_score_LR
train_score_percent_diff_1_3 = (train_score_diff_1_3 / tf_idf_train_score_LR) * 100
test_score_percent_diff_1_3 = (test_score_diff_1_3 / tf_idf_test_score_LR) * 100
print(f"Train Data: {train_score_diff_1_3} = {train_score_percent_diff_1_3:.2f}%")
print(f"Test Data: {test_score_diff_1_3} = {test_score_percent_diff_1_3:.2f}%")

# Percentage Differences between Model 2 and Model 3 on train and test data
print("\nPercentage Differences between Model 2 and Model 3:")
train_score_diff_2_3 = stem_train_score_LR - full_train_score_LR
test_score_diff_2_3 = stem_test_score_LR - full_test_score_LR
train_score_percent_diff_2_3 = (train_score_diff_2_3 / stem_train_score_LR) * 100
test_score_percent_diff_2_3 = (test_score_diff_2_3 / stem_test_score_LR) * 100
print(f"Train Data: {train_score_diff_2_3} = {train_score_percent_diff_2_3:.2f}%")
print(f"Test Data: {test_score_diff_2_3} = {test_score_percent_diff_2_3:.2f}%")


Model 1: Logistic Regression TF IDF Scores Scores:
Train Score: 0.9925225913198422
Test Score: 0.9882702301410542

Model 2: Logistic Regression Stemming, Stopwords, TF IDF Scores:
Train Score: 0.9910589283441517
Test Score: 0.9852264291017075

Model 3: Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Scores:
Train Score: 0.9999681812396589
Test Score: 0.9719376391982183

Percentage Differences between Model 1 and Model 2:
Train Data: 0.0014636629756905606 = 0.15%
Test Data: 0.0030438010393466497 = 0.31%

Percentage Differences between Model 1 and Model 3:
Train Data: -0.0074455899198166975 = -0.75%
Test Data: 0.0163325909428359 = 1.65%

Percentage Differences between Model 2 and Model 3:
Train Data: -0.008909252895507258 = -0.90%
Test Data: 0.013288789903489251 = 1.35%


### Logistic Regression Hyperparameter Optimization

#### GridSearchCV

In [40]:
# Definisikan parameter grid
LR_param_grid = {
    'tfidf__max_df': [0.75, 1.0],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'log_reg__C': [0.1, 1.0, 10.0],
    'log_reg__penalty': ['l1', 'l2'],
    'log_reg__solver': ['liblinear', 'saga'],
    'log_reg__max_iter': [1000, 5000]
}


In [41]:
# Definisikan GridSearchCV
tf_idf_LR_random_search = GridSearchCV(estimator=tf_idf_LR, param_grid=LR_param_grid, cv=5, scoring='accuracy', n_jobs=4)
tf_idf_LR_random_search.fit(x_train, y_train)

In [42]:
# Definisikan GridSearchCV
stem_LR_random_search = GridSearchCV(estimator=stem_LR, param_grid=LR_param_grid, cv=5, scoring='accuracy', n_jobs=4)
stem_LR_random_search.fit(x_train_stem, y_train_stem)

In [43]:
# full_LR_param_grid = {
#     'log_reg__C': [0.1],  # Just one value for simplicity
#     'log_reg__penalty': ['l1', 'l2'],
#     'log_reg__solver': ['liblinear', 'saga'],
# }

In [54]:
print(f"Best parameters for TF IDF Logistic Regression: {tf_idf_LR_random_search.best_params_}")
print(f"Best parameters for Stemming and Stopwords Logistic Regression: {stem_LR_random_search.best_params_}")

Best parameters for TF IDF Logistic Regression: {'log_reg__C': 10.0, 'log_reg__max_iter': 5000, 'log_reg__penalty': 'l1', 'log_reg__solver': 'liblinear', 'tfidf__max_df': 0.75, 'tfidf__ngram_range': (1, 2)}
Best parameters for Stemming and Stopwords Logistic Regression: {'log_reg__C': 10.0, 'log_reg__max_iter': 1000, 'log_reg__penalty': 'l1', 'log_reg__solver': 'liblinear', 'tfidf__max_df': 1.0, 'tfidf__ngram_range': (1, 2)}


In [41]:
# Definisikan GridSearchCV
full_LR_random_search = GridSearchCV(estimator=full_LR, param_grid=LR_param_grid, cv=5, scoring='accuracy', n_jobs=4)
full_LR_random_search.fit(x_train_stem, y_train_stem)

KeyboardInterrupt: 

In [None]:
# Hasil terbaik
print(f"Best parameters for TF IDF Logistic Regression: {tf_idf_LR_random_search.best_params_}")
print(f"Best parameters for Stemming and Stopwords Logistic Regression: {stem_LR_random_search.best_params_}")
print(f"Best parameters for Stemming, Stopwords, Class Balancing, and Standard Scaler Logistic Regression: {full_LR_random_search.best_params_}")

Best parameters for TF IDF Logistic Regression: {'log_reg__C': 10.0, 'log_reg__penalty': 'l1', 'log_reg__solver': 'saga'}
Best parameters for Stemming and Stopwords Logistic Regression: {'log_reg__C': 10.0, 'log_reg__penalty': 'l1', 'log_reg__solver': 'liblinear'}


NameError: name 'optimized_stem_DTC' is not defined

In [None]:
print(f"Best parameters for Stemming, Stopwords, Class Balancing, and Standard Scaler Logistic Regression: {full_LR_random_search.best_params_}")

### Logistic Regression Model Setelah Optimisasi (GridSearchCV)

In [None]:
# optimized_tf_idf_LR = LogisticRegression(C=10, penalty='l1', solver='saga', max_iter=1000)
# optimized_stem_LR = LogisticRegression(C=10, penalty='l1', solver='saga', max_iter=1000)

optimized_tf_idf_LR = tf_idf_LR_random_search.best_estimator_
optimized_stem_LR = stem_LR_random_search.best_estimator_
optimized_full_LR = full_LR_random_search.best_estimator_

#### TF IDF

In [None]:
optimized_tf_idf_train_pred_LR = optimized_tf_idf_LR.predict(x_train)
optimized_tf_idf_test_pred_LR = optimized_tf_idf_LR.predict(x_test)

In [None]:
optimized_tf_idf_train_score_LR = optimized_tf_idf_LR.score(x_train, y_train)
optimized_tf_idf_test_score_LR = optimized_tf_idf_LR.score(x_test, y_test)

In [None]:
print(classification_report(y_train, optimized_tf_idf_train_pred_LR))
print(classification_report(y_test, optimized_tf_idf_test_pred_LR))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16436
           1       1.00      1.00      1.00     14992

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7045
           1       1.00      1.00      1.00      6425

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [None]:
print(f"Optimized Logistic Regression TF IDF Train Score: {optimized_tf_idf_train_score_LR}")
print(f"Optimized Logistic Regression TF IDF Test Score: {optimized_tf_idf_test_score_LR}")

Optimized Logistic Regression TF IDF Train Score: 0.999649993636248
Optimized Logistic Regression TF IDF Test Score: 0.9968819599109131


#### TF IDF with Stemming and Stopwords

In [None]:
optimized_stem_train_pred_LR = optimized_stem_LR.predict(xs_train_stem)
optimized_stem_test_pred_LR = optimized_stem_LR.predict(x_test_stem)

In [None]:
optimized_stem_train_score_LR = optimized_stem_LR.score(x_train_stem, y_train_stem)
optimized_stem_test_score_LR = optimized_stem_LR.score(x_test_stem, y_test_stem)

In [None]:
print(classification_report(y_train_stem, optimized_stem_train_pred_LR))
print(classification_report(y_test_stem, optimized_stem_test_pred_LR))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16436
           1       1.00      1.00      1.00     14992

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7045
           1       1.00      1.00      1.00      6425

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [None]:
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: {optimized_stem_train_score_LR}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: {optimized_stem_test_score_LR}")

Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.9998090874379534
Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.9962880475129918


#### TF-IDF with Stemming, Stopwords, Class Balancing, and Standard Scaler

In [None]:
optimized_full_train_pred_LR = optimized_full_LR.predict(x_train_stem)
optimized_full_test_pred_LR = optimized_full_LR.predict(x_test_stem)

In [None]:
optimized_full_train_score_LR = optimized_full_LR.score(x_train_stem, y_train_stem)
optimized_full_test_score_LR = optimized_full_LR.score(x_test_stem, y_test_stem)


In [None]:
print(classification_report(y_train_stem, optimized_stem_train_pred_LR))
print(classification_report(y_test_stem, optimized_stem_test_pred_LR))

In [None]:

print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Train Score: {optimized_full_train_score_LR}")
print(f"Optimized Logistic Regression Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Test Score: {optimized_full_test_score_LR}")

### Score Differential

#### TF IDF

In [None]:
# Train vs Train Comparison for TF-IDF Logistic Regression
print("Train vs Train Comparison for TF-IDF Logistic Regression:")
print(f"TF-IDF Train Score vs Optimized Train Score: {tf_idf_train_score_LR} vs {optimized_tf_idf_train_score_LR}")
print("")

# Test vs Test Comparison for TF-IDF Logistic Regression
print("Test vs Test Comparison for TF-IDF Logistic Regression:")
print(f"TF-IDF Test Score vs Optimized Test Score: {tf_idf_test_score_LR} vs {optimized_tf_idf_test_score_LR}")
print("")

# Percentage Differences
print("Percentage Differences:")
print(f"Percentage Difference (Train Score): TF-IDF vs Optimized: {((optimized_tf_idf_train_score_LR - tf_idf_train_score_LR) / tf_idf_train_score_LR) * 100:.2f}%")
print(f"Percentage Difference (Test Score): TF-IDF vs Optimized: {((optimized_tf_idf_test_score_LR - tf_idf_test_score_LR) / tf_idf_test_score_LR) * 100:.2f}%")


Train vs Train Comparison for TF-IDF Logistic Regression:
TF-IDF Train Score vs Optimized Train Score: 0.9928725976835943 vs 0.999649993636248

Test vs Test Comparison for TF-IDF Logistic Regression:
TF-IDF Test Score vs Optimized Test Score: 0.9882702301410542 vs 0.9968819599109131

Percentage Differences:
Percentage Difference (Train Score): TF-IDF vs Optimized: 0.68%
Percentage Difference (Test Score): TF-IDF vs Optimized: 0.87%


#### TF IDF with Stemming and Stopwords

In [None]:
# Train vs Train Comparison for Stemming, Stopwords, TF-IDF Logistic Regression
print("Train vs Train Comparison for Stemming, Stopwords, TF-IDF Logistic Regression:")
print(f"Stemming, Stopwords, TF-IDF Train Score vs Optimized Train Score: {stem_train_score_LR} vs {optimized_stem_train_score_LR}")
print("")

# Test vs Test Comparison for Stemming, Stopwords, TF-IDF Logistic Regression
print("Test vs Test Comparison for Stemming, Stopwords, TF-IDF Logistic Regression:")
print(f"Stemming, Stopwords, TF-IDF Test Score vs Optimized Test Score: {stem_test_score_LR} vs {optimized_stem_test_score_LR}")
print("")

# Percentage Differences
print("Percentage Differences:")
print(f"Percentage Difference (Train Score): Stemming, Stopwords, TF-IDF vs Optimized: {((optimized_stem_train_score_LR - stem_train_score_LR) / stem_train_score_LR) * 100:.2f}%")
print(f"Percentage Difference (Test Score): Stemming, Stopwords, TF-IDF vs Optimized: {((optimized_stem_test_score_LR - stem_test_score_LR) / stem_test_score_LR) * 100:.2f}%")


Train vs Train Comparison for Stemming, Stopwords, TF-IDF Logistic Regression:
Stemming, Stopwords, TF-IDF Train Score vs Optimized Train Score: 0.9910907471044929 vs 0.9998090874379534

Test vs Test Comparison for Stemming, Stopwords, TF-IDF Logistic Regression:
Stemming, Stopwords, TF-IDF Test Score vs Optimized Test Score: 0.9862657757980697 vs 0.9962880475129918

Percentage Differences:
Percentage Difference (Train Score): Stemming, Stopwords, TF-IDF vs Optimized: 0.88%
Percentage Difference (Test Score): Stemming, Stopwords, TF-IDF vs Optimized: 1.02%


#### Optimized Logistic Regression Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler

In [None]:
# Train vs Train Comparison for Stemming, Stopwords, TF-IDF Logistic Regression
print("Train vs Train Comparison for Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Logistic Regression:")
print(f"Logistic Regression Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Train Score vs Optimized Train Score: {full_train_score_LR} vs {optimized_full_train_score_LR}")
print("")

# Test vs Test Comparison for Stemming, Stopwords, TF-IDF Logistic Regression
print("Test vs Test Comparison for Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Logistic Regression:")
print(f"Logistic Regression Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Test Score: {full_test_score_LR} vs {optimized_full_test_score_LR}")
print("")

# Percentage Differences
print("Percentage Differences:")
print(f"Percentage Difference (Train Score): Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler vs Optimized: {((optimized_full_train_score_LR - full_train_score_LR) / full_train_score_LR) * 100:.2f}%")
print(f"Percentage Difference (Test Score): Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler vs Optimized: {((optimized_full_test_score_LR - full_test_score_LR) / full_test_score_LR) * 100:.2f}%")


#### Optimized Model Comparison

In [None]:
# Model 1: Optimized Logistic Regression TF IDF Scores
print("Model 1 Scores:")
print(f"Train Score: {optimized_tf_idf_train_score_LR}")
print(f"Test Score: {optimized_tf_idf_test_score_LR}")

# Model 2: Optimized Logistic Regression Stemming, Stopwords, TF IDF Scores
print("\nModel 2 Scores:")
print(f"Train Score: {optimized_stem_train_score_LR}")
print(f"Test Score: {optimized_stem_test_score_LR}")

# Model 3: Optimized Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Scores
print("\nModel 3 Scores:")
print(f"Train Score: {optimized_full_train_score_LR}")
print(f"Test Score: {optimized_full_test_score_LR}")

print("")

# Percentage Differences between Model 1 and Model 2 on train and test data
print("Percentage Differences between Model 1 and Model 2:")
train_score_diff_1_2 = optimized_tf_idf_train_score_LR - optimized_stem_train_score_LR
test_score_diff_1_2 = optimized_tf_idf_test_score_LR - optimized_stem_test_score_LR
train_score_percent_diff_1_2 = (train_score_diff_1_2 / optimized_tf_idf_train_score_LR) * 100
test_score_percent_diff_1_2 = (test_score_diff_1_2 / optimized_tf_idf_test_score_LR) * 100
print(f"Train Data: {train_score_diff_1_2} = {train_score_percent_diff_1_2:.2f}%")
print(f"Test Data: {test_score_diff_1_2} = {test_score_percent_diff_1_2:.2f}%")

# Percentage Differences between Model 1 and Model 3 on train and test data
print("\nPercentage Differences between Model 1 and Model 3:")
train_score_diff_1_3 = optimized_tf_idf_train_score_LR - optimized_full_train_score_LR
test_score_diff_1_3 = optimized_tf_idf_test_score_LR - optimized_full_test_score_LR
train_score_percent_diff_1_3 = (train_score_diff_1_3 / optimized_tf_idf_train_score_LR) * 100
test_score_percent_diff_1_3 = (test_score_diff_1_3 / optimized_tf_idf_test_score_LR) * 100
print(f"Train Data: {train_score_diff_1_3} = {train_score_percent_diff_1_3:.2f}%")
print(f"Test Data: {test_score_diff_1_3} = {test_score_percent_diff_1_3:.2f}%")

# Percentage Differences between Model 2 and Model 3 on train and test data
print("\nPercentage Differences between Model 2 and Model 3:")
train_score_diff_2_3 = optimized_stem_train_score_LR - optimized_full_train_score_LR
test_score_diff_2_3 = optimized_stem_test_score_LR - optimized_full_test_score_LR
train_score_percent_diff_2_3 = (train_score_diff_2_3 / optimized_stem_train_score_LR) * 100
test_score_percent_diff_2_3 = (test_score_diff_2_3 / optimized_stem_test_score_LR) * 100
print(f"Train Data: {train_score_diff_2_3} = {train_score_percent_diff_2_3:.2f}%")
print(f"Test Data: {test_score_diff_2_3} = {test_score_percent_diff_2_3:.2f}%")


Optimized Logistic Regression TF IDF Train Score: 0.9995545373552246
Optimized Logistic Regression TF IDF Test Score: 0.9964365256124722
Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.9995863561155658
Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.9947290274684484

Perbedaan skor antara 2 model pada train data: 3.1818760341195684e-05 = 0.00%
Perbedaan skor antara 2 model pada pada test data: -0.0017074981440238224) = -0.17%


## Pembuatan Model Decision Tree Classifier

### Model Decision Tree Classifier

In [None]:
tf_idf_DTC = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dtc', DecisionTreeClassifier())
])

stem_DTC = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dtc', DecisionTreeClassifier())
])

full_DTC = ImbPipeline([
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE()),
    ('dtc', DecisionTreeClassifier(class_weight='balanced'))
])

In [None]:
tf_idf_DTC.fit(x_train, y_train)
stem_DTC.fit(x_train_stem, y_train_stem)
full_DTC.fit(x_train_stem, y_train_stem)

### Model Evaluation

#### TF IDF

In [None]:
tf_idf_train_pred_DTC = tf_idf_DTC.predict(x_train)
tf_idf_test_pred_DTC = tf_idf_DTC.predict(x_test)

In [None]:
tf_idf_train_score_DTC = tf_idf_DTC.score(x_train, y_train)
tf_idf_test_score_DTC = tf_idf_DTC.score(x_test, y_test)

In [None]:
print(classification_report(y_train, tf_idf_train_pred_DTC))
print(classification_report(y_test, tf_idf_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16436
           1       1.00      1.00      1.00     14992

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7045
           1       1.00      1.00      1.00      6425

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [None]:
print(f"Decision Tree Classifier TF IDF Train Score: {tf_idf_train_score_DTC}")
print(f"Decision Tree Classifier TF IDF Test Score: {tf_idf_test_score_DTC}")

Decision Tree Classifier TF IDF Train Score: 0.9999681812396589
Decision Tree Classifier TF IDF Test Score: 0.9974758723088345


#### TF IDF with Stemming and Stopwords Score

In [None]:
stem_train_pred_DTC = stem_DTC.predict(x_train_stem)
stem_test_pred_DTC = stem_DTC.predict(x_test_stem)

In [None]:
stem_train_score_DTC = stem_DTC.score(x_train_stem, y_train_stem)
stem_test_score_DTC = stem_DTC.score(x_test_stem, y_test_stem)

In [None]:
print(classification_report(y_train_stem, stem_train_pred_DTC))
print(classification_report(y_test_stem, stem_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16436
           1       1.00      1.00      1.00     14992

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7045
           1       1.00      1.00      1.00      6425

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [None]:
print(f"Decision Tree Classifier Stemming, Stopwords, TF IDF Train Score: {stem_train_score_DTC}")
print(f"Decision Tree Classifier Stemming, Stopwords, TF IDF Test Score: {stem_test_score_DTC}")

Decision Tree Classifier Stemming, Stopwords, TF IDF Train Score: 0.9999681812396589
Decision Tree Classifier Stemming, Stopwords, TF IDF Test Score: 0.9966592427616926


#### Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Score

In [None]:
full_train_pred_DTC = full_DTC.predict(x_train)
full_test_pred_DTC = full_DTC.predict(x_test)

In [None]:
full_train_score_DTC = full_DTC.score(x_train_stem, y_train_stem)
full_test_score_DTC = full_DTC.score(x_test_stem, y_test_stem)

In [None]:
print(classification_report(y_train_stem, stem_train_pred_DTC))
print(classification_report(y_test_stem, stem_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16436
           1       1.00      1.00      1.00     14992

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7045
           1       1.00      1.00      1.00      6425

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [None]:

print(f"Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Train Score: {full_train_score_DTC}")
print(f"Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Test Score: {full_test_score_DTC}")

Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Train Score: 0.9999681812396589
Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Test Score: 0.997253155159614


#### Model Comparison

In [None]:
# Model 1: Decision Tree Classifier TF IDF Scores
print("Model 1: Decision Tree Classifier TF IDF Scores Scores:")
print(f"Train Score: {tf_idf_train_score_DTC}")
print(f"Test Score: {tf_idf_test_score_DTC}")

# Model 2: Decision Tree Classifier Stemming, Stopwords, TF IDF Scores
print("\nModel 2: Decision Tree Classifier Stemming, Stopwords, TF IDF Scores:")
print(f"Train Score: {stem_train_score_DTC}")
print(f"Test Score: {stem_test_score_DTC}")

# Model 3: Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Scores
print("\nModel 3: Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Scores:")
print(f"Train Score: {full_train_score_DTC}")
print(f"Test Score: {full_test_score_DTC}")

print("")

# Percentage Differences between Model 1 and Model 2 on train and test data
print("Percentage Differences between Model 1 and Model 2:")
train_score_diff_1_2 = tf_idf_train_score_DTC - stem_train_score_DTC
test_score_diff_1_2 = tf_idf_test_score_DTC - stem_test_score_DTC
train_score_percent_diff_1_2 = (train_score_diff_1_2 / tf_idf_train_score_DTC) * 100
test_score_percent_diff_1_2 = (test_score_diff_1_2 / tf_idf_test_score_DTC) * 100
print(f"Train Data: {train_score_diff_1_2} = {train_score_percent_diff_1_2:.2f}%")
print(f"Test Data: {test_score_diff_1_2} = {test_score_percent_diff_1_2:.2f}%")

# Percentage Differences between Model 1 and Model 3 on train and test data
print("\nPercentage Differences between Model 1 and Model 3:")
train_score_diff_1_3 = tf_idf_train_score_DTC - full_train_score_DTC
test_score_diff_1_3 = tf_idf_test_score_DTC - full_test_score_DTC
train_score_percent_diff_1_3 = (train_score_diff_1_3 / tf_idf_train_score_DTC) * 100
test_score_percent_diff_1_3 = (test_score_diff_1_3 / tf_idf_test_score_DTC) * 100
print(f"Train Data: {train_score_diff_1_3} = {train_score_percent_diff_1_3:.2f}%")
print(f"Test Data: {test_score_diff_1_3} = {test_score_percent_diff_1_3:.2f}%")

# Percentage Differences between Model 2 and Model 3 on train and test data
print("\nPercentage Differences between Model 2 and Model 3:")
train_score_diff_2_3 = stem_train_score_DTC - full_train_score_DTC
test_score_diff_2_3 = stem_test_score_DTC - full_test_score_DTC
train_score_percent_diff_2_3 = (train_score_diff_2_3 / stem_train_score_DTC) * 100
test_score_percent_diff_2_3 = (test_score_diff_2_3 / stem_test_score_DTC) * 100
print(f"Train Data: {train_score_diff_2_3} = {train_score_percent_diff_2_3:.2f}%")
print(f"Test Data: {test_score_diff_2_3} = {test_score_percent_diff_2_3:.2f}%")


Model 1: Decision Tree Classifier TF IDF Scores Scores:
Train Score: 0.9999681812396589
Test Score: 0.9974758723088345

Model 2: Decision Tree Classifier Stemming, Stopwords, TF IDF Scores:
Train Score: 0.9999681812396589
Test Score: 0.9966592427616926

Model 3: Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Scores:
Train Score: 0.9999681812396589
Test Score: 0.997253155159614

Percentage Differences between Model 1 and Model 2:
Train Data: 0.0 = 0.00%
Test Data: 0.0008166295471419005 = 0.08%

Percentage Differences between Model 1 and Model 3:
Train Data: 0.0 = 0.00%
Test Data: 0.00022271714922050823 = 0.02%

Percentage Differences between Model 2 and Model 3:
Train Data: 0.0 = 0.00%
Test Data: -0.0005939123979213923 = -0.06%


: 

### Decision Tree Classifier Hyperparameter Optimization

In [None]:
# Definisikan parameter grid
param_grid_DTC = {
    'dtc__max_depth': [None, 5, 10, 20, 30, 40, 50],
    'dtc__min_samples_split': [2, 5, 10, 15, 20, 25, 30],
    'dtc__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7]
}


In [None]:
# Definisikan GridSearchCV
tf_idf_grid_search_DTC = GridSearchCV(estimator=tf_idf_DTC, param_grid=param_grid_DTC, cv=5, scoring='accuracy', n_jobs=4)

# Fit model
tf_idf_grid_search_DTC.fit(x_train, y_train)

In [None]:
# Definisikan GridSearchCV
stem_grid_search_DTC = GridSearchCV(estimator=stem_DTC, param_grid=param_grid_DTC, cv=5, scoring='accuracy', n_jobs=4)

# Fit model
stem_grid_search_DTC.fit(x_train_stem, y_train_stem)

In [None]:
print(f"Best parameters for TF IDF Decision Tree Classifier: {tf_idf_grid_search_DTC.best_params_}")
print(f"Best parameters for Stem and Stopwords Decision Tree Classifier: {stem_grid_search_DTC.best_params_}")

In [None]:
# Definisikan GridSearchCV
full_grid_search_DTC = GridSearchCV(estimator=full_DTC, param_grid=param_grid_DTC, cv=5, scoring='accuracy', n_jobs=4)

# Fit model
full_grid_search_DTC.fit(x_train_stem, y_train_stem)

In [None]:
print(f"Best parameters for TF IDF Decision Tree Classifier: {tf_idf_grid_search_DTC.best_params_}")
print(f"Best parameters for Stem and Stopwords Decision Tree Classifier: {stem_grid_search_DTC.best_params_}")
print(f"Best parameters for Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler: {full_grid_search_DTC.best_params_}")

### Decision Tree Clasifier Model Setelah Optimisasi

In [None]:
# optimized_tf_idf_DTC = DecisionTreeClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2)
# optimized_stem_DTC = DecisionTreeClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2)

optimized_tf_idf_DTC = tf_idf_grid_search_DTC.best_estimator_
optimized_stem_DTC = stem_grid_search_DTC.best_estimator_
optimized_full_DTC = full_grid_search_DTC.best_estimator_

#### TF IDF

In [None]:
optimized_tf_idf_train_pred_DTC = optimized_tf_idf_DTC.predict(x_train)
optimized_tf_idf_test_pred_DTC = optimized_tf_idf_DTC.predict(x_test)

In [None]:
optimized_tf_idf_train_score_DTC = optimized_tf_idf_DTC.score(x_train, y_train)

In [None]:
optimized_tf_idf_test_score_DTC = optimized_tf_idf_DTC.score(x_test, y_test)

In [None]:
print(classification_report(y_train, tf_idf_train_pred_lr))
print(classification_report(y_test, tf_idf_test_pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16464
           1       0.99      0.99      0.99     14964

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      7017
           1       0.98      0.99      0.99      6453

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [None]:
print(f"Optimized Decision Tree Classifier TF IDF Train Score: {optimized_tf_idf_train_score_DTC}")
print(f"Optimized Decision Tree Classifier TF IDF Test Score: {optimized_tf_idf_test_score_DTC}")

Optimized Decision Tree Classifier TF IDF Train Score: 0.9999681812396589
Optimized Decision Tree Classifier TF IDF Test Score: 0.9966592427616926


#### TF IDF With Stemming & Stopwords

In [None]:
optimized_stem_train_pred_DTC = optimized_stem_DTC.predict(x_train_stem)
optimized_stem_test_pred_DTC = optimized_stem_DTC.predict(x_test_stem)

In [None]:
optimized_stem_train_score_DTC = optimized_stem_DTC.score(x_train_stem, y_train_stem)

In [None]:
optimized_stem_test_score_DTC = optimized_stem_DTC.score(x_test_stem, y_test_stem)

In [None]:
print(classification_report(y_train_stem, optimized_stem_train_pred_DTC))
print(classification_report(y_test_stem, optimized_stem_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16464
           1       1.00      1.00      1.00     14964

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7017
           1       1.00      1.00      1.00      6453

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [None]:
print(f"Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF Train Score: {optimized_stem_train_score_DTC}")
print(f"Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF Test Score: {optimized_stem_test_score_DTC}")

Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF Train Score: 0.9999681812396589
Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF Test Score: 0.996807720861173


#### TF IDF with Stemming and Stopwords 

In [None]:
optimized_full_train_pred_DTC = optimized_full_DTC.predict(x_train_stem)
optimized_full_test_pred_DTC = optimized_full_DTC.predict(x_test_stem)

In [None]:
optimized_full_train_score_DTC = optimized_full_DTC.score(x_train_stem, y_train_stem)

In [None]:
optimized_full_test_score_DTC = optimized_full_DTC.score(x_test_stem, y_test_stem)

In [None]:
print(classification_report(y_train_stem, optimized_full_train_pred_DTC))
print(classification_report(y_test_stem, optimized_full_test_pred_DTC))

In [None]:

print(f"Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Train Score: {optimized_full_train_score_DTC}")
print(f"Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Test Score: {optimized_full_test_score_DTC}")

### Score Differential

#### TF IDF

In [None]:
# Train vs Train Comparison for TF-IDF Logistic Regression
print("Train vs Train Comparison for TF-IDF Logistic Regression:")
print(f"TF-IDF Train Score vs Optimized Train Score: {tf_idf_train_score_DTC} vs {optimized_tf_idf_train_score_DTC}")
print("")

# Test vs Test Comparison for TF-IDF Logistic Regression
print("Test vs Test Comparison for TF-IDF Logistic Regression:")
print(f"TF-IDF Test Score vs Optimized Test Score: {tf_idf_test_score_DTC} vs {optimized_tf_idf_test_score_DTC}")
print("")

# Percentage Differences
print("Percentage Differences:")
print(f"Percentage Difference (Train Score): TF-IDF vs Optimized: {((optimized_tf_idf_train_score_DTC - tf_idf_train_score_DTC) / tf_idf_train_score_DTC) * 100:.2f}%")
print(f"Percentage Difference (Test Score): TF-IDF vs Optimized: {((optimized_tf_idf_test_score_DTC - tf_idf_test_score_DTC) / tf_idf_test_score_DTC) * 100:.2f}%")


Decision Tree Classifier TF IDF Train Score: 0.9999681812396589
Decision Tree Classifier TF IDF Test Score: 0.9962138084632517
Optimized Decision Tree Classifier TF IDF Train Score: 0.9999681812396589
Optimized Decision Tree Classifier TF IDF Test Score: 0.9966592427616926

Perbedaan skor model setelah optimisasi pada train data: 0.0 = 0.00%
Perbedaan skor model setelah optimisasi pada test data: 0.00044543429844090543) = 0.04%


#### TF IDF With Stemming & Stopwords

In [None]:
# Train vs Train Comparison for Stemming, Stopwords, TF-IDF Logistic Regression
print("Train vs Train Comparison for Stemming, Stopwords, TF-IDF Logistic Regression:")
print(f"Stemming, Stopwords, TF-IDF Train Score vs Optimized Train Score: {stem_train_score_DTC} vs {optimized_stem_train_score_DTC}")
print("")

# Test vs Test Comparison for Stemming, Stopwords, TF-IDF Logistic Regression
print("Test vs Test Comparison for Stemming, Stopwords, TF-IDF Logistic Regression:")
print(f"Stemming, Stopwords, TF-IDF Test Score vs Optimized Test Score: {stem_test_score_DTC} vs {optimized_stem_test_score_DTC}")
print("")

# Percentage Differences
print("Percentage Differences:")
print(f"Percentage Difference (Train Score): Stemming, Stopwords, TF-IDF vs Optimized: {((optimized_stem_train_score_DTC - stem_train_score_DTC) / stem_train_score_DTC) * 100:.2f}%")
print(f"Percentage Difference (Test Score): Stemming, Stopwords, TF-IDF vs Optimized: {((optimized_stem_test_score_DTC - stem_test_score_DTC) / stem_test_score_DTC) * 100:.2f}%")


Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.9999681812396589
Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.9965850037119525
Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.9999681812396589
Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.996807720861173

Perbedaan skor model setelah optimisasi pada train data: 0.0 = 0.00%
Perbedaan skor model setelah optimisasi pada test data: 0.00022271714922050823) = 0.02%


#### Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler

In [None]:
# Train vs Train Comparison for Stemming, Stopwords, TF-IDF Logistic Regression
print("Train vs Train Comparison for Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Logistic Regression:")
print(f"Logistic Regression Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Train Score vs Optimized Train Score: {full_train_score_DTC} vs {optimized_full_train_score_DTC}")
print("")

# Test vs Test Comparison for Stemming, Stopwords, TF-IDF Logistic Regression
print("Test vs Test Comparison for Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Logistic Regression:")
print(f"Logistic Regression Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Test Score: {full_test_score_DTC} vs {optimized_full_test_score_DTC}")
print("")

# Percentage Differences
print("Percentage Differences:")
print(f"Percentage Difference (Train Score): Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler vs Optimized: {((optimized_full_train_score_DTC - full_train_score_DTC) / full_train_score_DTC) * 100:.2f}%")
print(f"Percentage Difference (Test Score): Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler vs Optimized: {((optimized_full_test_score_DTC - full_test_score_DTC) / full_test_score_DTC) * 100:.2f}%")


#### Optimized Model Comparison

In [None]:
# Model 1: Optimized Decision Tree Classifier TF IDF Scores
print("Model 1 Scores:")
print(f"Train Score: {optimized_tf_idf_train_score_DTC}")
print(f"Test Score: {optimized_tf_idf_test_score_DTC}")

# Model 2: Optimized Decision Tree Classifier Stemming, Stopwords, TF IDF Scores
print("\nModel 2 Scores:")
print(f"Train Score: {optimized_stem_train_score_DTC}")
print(f"Test Score: {optimized_stem_test_score_DTC}")

# Model 3: Optimized Stemming, Stopwords, TF IDF, Class Balancing, and Standard Scaler Scores
print("\nModel 3 Scores:")
print(f"Train Score: {optimized_full_train_score_DTC}")
print(f"Test Score: {optimized_full_test_score_DTC}")

print("")

# Percentage Differences between Model 1 and Model 2 on train and test data
print("Percentage Differences between Model 1 and Model 2:")
train_score_diff_1_2 = optimized_tf_idf_train_score_DTC - optimized_stem_train_score_DTC
test_score_diff_1_2 = optimized_tf_idf_test_score_DTC - optimized_stem_test_score_DTC
train_score_percent_diff_1_2 = (train_score_diff_1_2 / optimized_tf_idf_train_score_DTC) * 100
test_score_percent_diff_1_2 = (test_score_diff_1_2 / optimized_tf_idf_test_score_DTC) * 100
print(f"Train Data: {train_score_diff_1_2} = {train_score_percent_diff_1_2:.2f}%")
print(f"Test Data: {test_score_diff_1_2} = {test_score_percent_diff_1_2:.2f}%")

# Percentage Differences between Model 1 and Model 3 on train and test data
print("\nPercentage Differences between Model 1 and Model 3:")
train_score_diff_1_3 = optimized_tf_idf_train_score_DTC - optimized_full_train_score_DTC
test_score_diff_1_3 = optimized_tf_idf_test_score_DTC - optimized_full_test_score_DTC
train_score_percent_diff_1_3 = (train_score_diff_1_3 / optimized_tf_idf_train_score_DTC) * 100
test_score_percent_diff_1_3 = (test_score_diff_1_3 / optimized_tf_idf_test_score_DTC) * 100
print(f"Train Data: {train_score_diff_1_3} = {train_score_percent_diff_1_3:.2f}%")
print(f"Test Data: {test_score_diff_1_3} = {test_score_percent_diff_1_3:.2f}%")

# Percentage Differences between Model 2 and Model 3 on train and test data
print("\nPercentage Differences between Model 2 and Model 3:")
train_score_diff_2_3 = optimized_stem_train_score_DTC - optimized_full_train_score_DTC
test_score_diff_2_3 = optimized_stem_test_score_DTC - optimized_full_test_score_DTC
train_score_percent_diff_2_3 = (train_score_diff_2_3 / optimized_stem_train_score_DTC) * 100
test_score_percent_diff_2_3 = (test_score_diff_2_3 / optimized_stem_test_score_DTC) * 100
print(f"Train Data: {train_score_diff_2_3} = {train_score_percent_diff_2_3:.2f}%")
print(f"Test Data: {test_score_diff_2_3} = {test_score_percent_diff_2_3:.2f}%")


Optimized Decision Tree Classifier TF IDF Train Score: 0.9999681812396589
Optimized Decision Tree Classifier TF IDF Test Score: 0.9966592427616926
Optimized Logistic Regression Stemming, Stopwords, TF IDF Train Score: 0.9999681812396589
Optimized Logistic Regression Stemming, Stopwords, TF IDF Test Score: 0.996807720861173

Perbedaan skor antara 2 model pada train data: 0.0 = 0.00%
Perbedaan skor antara 2 model pada pada test data: 0.00014847809948037582) = 0.01%


# Model Implementation

In [None]:
ndef output_label(n):
  if n==0:
    return "It Is Fake News"
  elif n==1:
    return "It Is Genuine News"


In [None]:
def manual_testing(news):
    # Corrected syntax for defining dictionary
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    new_x_test = new_def_test["text"].apply(wordopt)

    # Assuming 'vectorization' is your vectorizer object
    new_xv_test = vectorization.transform(new_x_test)

    # Model predictions
    pred_lr = LR.predict(new_xv_test)
    pred_dtc = DTC.predict(new_xv_test)
    pred_rfc = RFC.predict(new_xv_test)

    return "\nLR Prediction: {}\nDTC Prediction: {}\nRFC Prediction: {}".format(
        output_label(pred_lr[0]),
        output_label(pred_dtc[0]),
        output_label(pred_rfc[0])
    )


In [None]:
news_article = str(input())

KeyboardInterrupt: Interrupted by user

In [None]:
manual_testing(news_article)