# Fake News Detector

## Import Library

In [149]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

## Import Data

In [55]:
true = pd.read_csv('True.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
fake = pd.read_csv('Fake.csv', engine='python', encoding='utf-8', on_bad_lines='skip')

In [56]:
true['label'] = 1
fake['label'] = 0

# Data Preprocessing

## Data Integration

In [57]:
news = pd.concat([fake, true], axis=0)

In [58]:
news.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [59]:
news.tail()

Unnamed: 0,title,text,subject,date,label
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


## Data Cleaning

### Checking Null Values

In [60]:
news.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

### Dropping Unnecessary Column

In [61]:
news.drop(['title', 'subject', 'date'], axis=1)

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
21413,"LONDON (Reuters) - LexisNexis, a provider of l...",1
21414,MINSK (Reuters) - In the shadow of disused Sov...,1
21415,MOSCOW (Reuters) - Vatican Secretary of State ...,1


In [62]:
news

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


### Scramble Data

In [63]:
news = news.sample(frac=1)
news.reset_index(inplace=True)
news.drop(['index'], axis = 1, inplace=True)

In [64]:
news

Unnamed: 0,title,text,subject,date,label
0,WATCH: Here’s What It Sounds Like When People...,The nation is once again having a conversation...,News,"June 11, 2016",0
1,Donald Trump Said Something So Stupid That Mi...,"Sometimes, Donald Trump says something so stup...",News,"September 13, 2016",0
2,Indonesian forces prepare to storm Papua villa...,JAKARTA (Reuters) - Indonesian security forces...,worldnews,"November 11, 2017",1
3,WATCH: Joe Scarborough Gets MOCKED For Compar...,Joe Scarborough is not having the best morning...,News,"April 26, 2017",0
4,PRESIDENT TRUMP WARNS Comey…Tweets Veiled Thre...,Donald Trump lobbed a veiled threat at the for...,left-news,"May 12, 2017",0
...,...,...,...,...,...
44893,OBAMA’S SOLDIERS Cause 5-Hour SHUT DOWN On St ...,Whoever thought asking Obama s terrorists sold...,politics,"Jul 10, 2016",0
44894,House ethics panel launches wide-ranging probe...,WASHINGTON (Reuters) - The U.S. House of Repre...,politicsNews,"December 1, 2017",1
44895,"'The Rock' talks a future in politics, includi...",LOS ANGELES (Reuters) - After parleying a prof...,politicsNews,"November 14, 2016",1
44896,Hollywood moves from words to action in Trump era,LOS ANGELES (Reuters) - Hollywood is gearing u...,politicsNews,"February 8, 2017",1


### WordOPT

In [65]:
def wordopt(text):
    # Convert into lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|\www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits
    text = re.sub(r'\d', '', text)

    # Remove newline characters
    text = re.sub(r'\n', ' ', text)

    return text


In [66]:
news['text'] = news['text'].apply(wordopt)

In [67]:
news['text']

0        the nation is once again having a conversation...
1        sometimes donald trump says something so stupi...
2        jakarta reuters  indonesian security forces in...
3        joe scarborough is not having the best morning...
4        donald trump lobbed a veiled threat at the for...
                               ...                        
44893    whoever thought asking obama s terrorists sold...
44894    washington reuters  the us house of representa...
44895    los angeles reuters  after parleying a profess...
44896    los angeles reuters  hollywood is gearing up f...
44897    in a complete  degree turn a church that was t...
Name: text, Length: 44898, dtype: object

## Stopwords and Stemming

In [68]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [69]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [70]:
def preprocess_text(text):
    # Tokenisasi teks
    words = text.split()

    # Hapus stopwords dan stem kata
    processed_words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]

    # Gabungkan kembali kata yang telah diproses
    return ' '.join(processed_words)

In [71]:
X_stem = [preprocess_text(sentence) for sentence in news['text']]

In [72]:
def custom_tokenizer(text):
    return preprocess_text(text).split()

In [74]:
stem_tf_idf_vectorization = TfidfVectorizer()

In [75]:
x_stem_tf_idf = stem_tf_idf_vectorization.fit_transform(X_stem)

## Feature Extraction (TF-IDF)

In [76]:
x = news['text']
y = news['label']

### TF-IDF Vectorizer

In [78]:
tf_idf_vectorization = TfidfVectorizer()

In [79]:
x_tfidf = tf_idf_vectorization.fit_transform(x)

## Split Data

### Split Data: TF-IDF

In [81]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.3, random_state=10)

### Split Data: TF IDF & Stemming & Stopwords

In [80]:
x_train_stem, x_test_stem, y_train_stem, y_test_stem = train_test_split(x_stem_tf_idf, y, test_size=0.3, random_state=10)

# Model

## Pembuatan Model Logistic Regression

### Model Logistic Regression

In [86]:
tf_idf_LR = LogisticRegression()
stem_LR = LogisticRegression()

In [87]:
tf_idf_LR.fit(x_train, y_train)
stem_LR.fit(x_train_stem, y_train_stem)

### Model Evaluation

#### TF-IDF Score

In [100]:
tf_idf_train_pred_lr = tf_idf_LR.predict(x_train)
tf_idf_test_pred_lr = tf_idf_LR.predict(x_test)

In [101]:
tf_idf_LR.score(x_train, y_train)

0.9926816851215476

In [102]:
tf_idf_LR.score(x_test, y_test)

0.9883444691907943

In [103]:
print(classification_report(y_train, tf_idf_train_pred_lr))
print(classification_report(y_test, tf_idf_test_pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16449
           1       0.99      0.99      0.99     14979

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7032
           1       0.99      0.99      0.99      6438

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



#### TF-IDF with Stemming and Stopwords Score

In [104]:
stem_train_pred_lr = stem_LR.predict(x_train_stem)
stem_test_pred_lr = stem_LR.predict(x_test_stem)

In [105]:
stem_LR.score(x_train_stem, y_train_stem)

0.9915998472699503

In [106]:
stem_LR.score(x_test_stem, y_test_stem)

0.9842613214550854

In [107]:
print(classification_report(y_train_stem, stem_train_pred_lr))
print(classification_report(y_test_stem, stem_test_pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16449
           1       0.99      0.99      0.99     14979

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      7032
           1       0.98      0.98      0.98      6438

    accuracy                           0.98     13470
   macro avg       0.98      0.98      0.98     13470
weighted avg       0.98      0.98      0.98     13470



### Optimasi Parameter Pada Logistic Regression

In [None]:
# Definisikan parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

In [None]:
# Definisikan parameter grid
tf_idf_param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

# Definisikan GridSearchCV
tf_idf_grid_search_log_reg = GridSearchCV(estimator=stem_LR, param_grid=tf_idf_param_grid, cv=5, scoring='accuracy')

# Fit model
tf_idf_grid_search_log_reg.fit(x_train, y_train)


# Definisikan parameter grid
stem_param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

# Definisikan GridSearchCV
stem_grid_search_log_reg = GridSearchCV(estimator=stem_LR, param_grid=stem_param_grid, cv=5, scoring='accuracy')

# Fit model
stem_grid_search_log_reg.fit(x_train_stem, y_train_stem)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
# Hasil terbaik
print(f"Best parameters for TF IDF Logistic Regression: {tf_idf_grid_search_log_reg.best_params_}")
print(f"Best parameters for Stemming and Stopwords Logistic Regression: {stem_grid_search_log_reg.best_params_}")

Best parameters for TF IDF Logistic Regression: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best parameters for Stemming and Stopwords Logistic Regression: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}


### Model Setelah Optimisasi

In [99]:
optimized_tf_idf_LR = LogisticRegression(C=100, penalty='l1', solver='liblinear')
optimized_stem_LR = LogisticRegression(C=10, penalty='l1', solver='liblinear')

In [111]:
optimized_tf_idf_LR.fit(x_train, y_train)
optimized_stem_LR.fit(x_train_stem, y_train_stem)

#### TF IDF

In [118]:
optimized_tf_idf_train_pred_lr = optimized_tf_idf_LR.predict(x_train)
optimized_tf_idf_test_pred_lr = optimized_tf_idf_LR.predict(x_test)

In [119]:
optimized_tf_idf_LR.score(x_train, y_train)

0.9999681812396589

In [120]:
optimized_tf_idf_LR.score(x_test, y_test)

0.9968819599109131

In [121]:
print(classification_report(y_train, optimized_tf_idf_train_pred_lr))
print(classification_report(y_test, optimized_tf_idf_test_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16449
           1       1.00      1.00      1.00     14979

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7032
           1       1.00      1.00      1.00      6438

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



#### TF IDF with Stemming and Stopwords

In [130]:
optimized_stem_train_pred_lr = optimized_stem_LR.predict(x_train_stem)
optimized_stem_test_pred_lr = optimized_stem_LR.predict(x_test_stem)

In [131]:
optimized_stem_LR.score(x_train_stem, y_train_stem)

0.999681812396589

In [132]:
optimized_stem_LR.score(x_test_stem, y_test_stem)

0.9953971789161099

In [133]:
print(classification_report(y_train_stem, optimized_stem_train_pred_lr))
print(classification_report(y_test_stem, optimized_stem_test_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16449
           1       1.00      1.00      1.00     14979

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7032
           1       1.00      0.99      1.00      6438

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



## Pembuatan Model Decision Tree Classifier

### Model Decision Tree Classifier

In [134]:
tf_idf_DTC = DecisionTreeClassifier()
stem_DTC = DecisionTreeClassifier()

In [135]:
tf_idf_DTC.fit(x_train, y_train)
stem_DTC.fit(x_train_stem, y_train_stem)

### Model Evaluation

#### TF IDF

In [143]:
optimized_tf_idf_train_pred_lr = optimized_tf_idf_LR.predict(x_train)
optimized_tf_idf_test_pred_lr = optimized_tf_idf_LR.predict(x_test)

In [137]:
tf_idf_DTC.score(x_train, y_train)


0.9999681812396589

In [138]:
tf_idf_DTC.score(x_test, y_test)

0.9969561989606534

In [144]:
print(classification_report(y_train, tf_idf_train_pred_DTC))
print(classification_report(y_test, tf_idf_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16449
           1       1.00      1.00      1.00     14979

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7032
           1       1.00      1.00      1.00      6438

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



#### TF IDF with Stemming and Stopwords Score

In [145]:
stem_train_pred_DTC = stem_DTC.predict(x_train_stem)
stem_test_pred_DTC = stem_DTC.predict(x_test_stem)

In [146]:
stem_DTC.score(x_train_stem, y_train_stem)

0.9999681812396589

In [147]:
stem_DTC.score(x_test_stem, y_test_stem)

0.9961395694135116

In [148]:
print(classification_report(y_train_stem, stem_train_pred_DTC))
print(classification_report(y_test_stem, stem_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16449
           1       1.00      1.00      1.00     14979

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7032
           1       1.00      0.99      1.00      6438

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



### Optimasi Parameter Pada Decision Tree classifier

In [None]:
# Definisikan parameter grid
tf_idf_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Definisikan GridSearchCV
tf_idf_grid_search_dt = GridSearchCV(estimator=tf_idf_DTC, param_grid=tf_idf_param_grid, cv=5, scoring='accuracy')

# Fit model
tf_idf_grid_search_dt.fit(x_train, y_train)

stem_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Definisikan GridSearchCV
stem_grid_search_dt = GridSearchCV(estimator=stem_DTC, param_grid=stem_param_grid, cv=5, scoring='accuracy')

# Fit model
stem_grid_search_dt.fit(x_train_stem, y_train_stem)



In [None]:
print(f"Best parameters for TF IDF Decision Tree Classifier: {tf_idf_grid_search_dt.best_params_}")

print(f"Best parameters for Stem and Stopwords Decision Tree Classifier: {stem_grid_search_dt.best_params_}")

Best parameters for TF IDF Decision Tree Classifier: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best parameters for Stem and Stopwords Decision Tree Classifier: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}


### Model Decision Tree Clasifier Setelah Optimisasi

In [156]:
optimized_tf_idf_DTC = DecisionTreeClassifier(max_depth=30, min_samples_leaf=1, min_samples_split=10)
optimized_stem_DTC = DecisionTreeClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=10)

In [158]:
optimized_tf_idf_DTC.fit(x_train, y_train)
optimized_stem_DTC.fit(x_train_stem, y_train_stem)

#### TF IDF

In [159]:
optimized_tf_idf_train_pred_DTC = optimized_tf_idf_DTC.predict(x_train)
optimized_tf_idf_test_pred_DTC = optimized_tf_idf_DTC.predict(x_test)

In [153]:
tf_idf_LR.score(x_train, y_train)

0.9926816851215476

In [154]:
tf_idf_LR.score(x_test, y_test)

0.9883444691907943

In [155]:
print(classification_report(y_train, tf_idf_train_pred_lr))
print(classification_report(y_test, tf_idf_test_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16449
           1       1.00      1.00      1.00     14979

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7032
           1       1.00      1.00      1.00      6438

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



#### TF IDF With Stemming & Stopwords

In [160]:
optimized_stem_train_pred_DTC = optimized_stem_DTC.predict(x_train_stem)
optimized_stem_test_pred_DTC = optimized_stem_DTC.predict(x_test_stem)

In [161]:
optimized_stem_DTC.score(x_train_stem, y_train_stem)

0.9997454499172712

In [162]:
optimized_stem_DTC.score(x_test_stem, y_test_stem)

0.996807720861173

In [164]:
print(classification_report(y_train_stem, optimized_stem_train_pred_DTC))
print(classification_report(y_test_stem, optimized_stem_test_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16449
           1       1.00      1.00      1.00     14979

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7032
           1       1.00      1.00      1.00      6438

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



## Pembuatan Model Random Forest Classifier

### Model Random Forest Classifier

In [None]:
tf_idf_RFC = RandomForestClassifier()
stem_RFC = RandomForestClassifier()

In [None]:
tf_idf_RFC.fit(x_train, y_train)
stem_RFC.fit(x_train_stem, y_train_stem)

In [None]:
tf_idf_train_pred_RFC = tf_idf_RFC.predict(x_train)
tf_idf_test_pred_RFC = tf_idf_RFC.predict(x_test)

stem_train_pred_RFC = stem_RFC.predict(x_train_stem)
stem_test_pred_RFC = stem_RFC.predict(x_test_stem)

In [None]:
tf_idf_RFC.score(x_train, y_train)


1.0

In [None]:
tf_idf_RFC.score(x_test, y_test)


0.9878247958426132

In [None]:
stem_RFC.score(x_train_stem, y_train_stem)


1.0

In [None]:
stem_RFC.score(x_test_stem, y_test_stem)

0.984558277654046

In [None]:
print(classification_report(y_train, tf_idf_train_pred_RFC))
print(classification_report(y_test, tf_idf_test_pred_RFC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16473
           1       1.00      1.00      1.00     14955

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7008
           1       0.99      0.99      0.99      6462

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [None]:
print(classification_report(y_train_stem, stem_train_pred_RFC))
print(classification_report(y_test_stem, stem_test_pred_RFC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16473
           1       1.00      1.00      1.00     14955

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      7008
           1       0.98      0.99      0.98      6462

    accuracy                           0.98     13470
   macro avg       0.98      0.98      0.98     13470
weighted avg       0.98      0.98      0.98     13470



### Optimasi Parameter Pada Random Forest Classifier

In [None]:
from sklearn.model_selection import GridSearchCV


# Definisikan parameter grid
tf_idf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Definisikan GridSearchCV
tf_idf_grid_search_rf = GridSearchCV(estimator=tf_idf_RFC, param_grid=tf_idf_param_grid, cv=5, scoring='accuracy')

# Fit model
tf_idf_grid_search_rf.fit(x_train, y_train)

# Hasil terbaik
print(f"Best parameters for Random Forest Classifier: {tf_idf_grid_search_rf.best_params_}")

# Definisikan parameter grid
stem_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Definisikan GridSearchCV
stem_grid_search_rf = GridSearchCV(estimator=stem_RFC, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit model
stem_grid_search_rf.fit(x_train_stem, y_train_stem)

# Hasil terbaik
print(f"Best parameters for Random Forest Classifier: {stem_grid_search_rf.best_params_}")


Best parameters for Random Forest Classifier: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


ValueError: Invalid parameter 'C' for estimator RandomForestClassifier(). Valid parameters are: ['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].

Best parameters for TF IDF Random Forest Classifier: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

In [None]:

print(f"Best parameters for TF IDF Random Forest Classifier: {tf_idf_grid_search_rf.best_params_}")
print(f"Best parameters for Stem and Stopwords Random Forest Classifier: {stem_grid_search_rf.best_params_}")

Best parameters for TF IDF Random Forest Classifier: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

### Model Setelah Optimisasi

# TESTING

In [None]:
def output_label(n):
  if n==0:
    return "It Is Fake News"
  elif n==1:
    return "It Is Genuine News"


In [None]:
def manual_testing(news):
    # Corrected syntax for defining dictionary
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    new_x_test = new_def_test["text"].apply(wordopt)

    # Assuming 'vectorization' is your vectorizer object
    new_xv_test = vectorization.transform(new_x_test)

    # Model predictions
    pred_lr = LR.predict(new_xv_test)
    pred_dtc = DTC.predict(new_xv_test)
    pred_rfc = RFC.predict(new_xv_test)

    return "\nLR Prediction: {}\nDTC Prediction: {}\nRFC Prediction: {}".format(
        output_label(pred_lr[0]),
        output_label(pred_dtc[0]),
        output_label(pred_rfc[0])
    )


In [None]:
news_article = str(input())

KeyboardInterrupt: Interrupted by user

In [None]:
manual_testing(news_article)