In [None]:
# Install libraries
!pip install -q nltk scikit-learn pandas

In [None]:
import os
import pickle
import string
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [None]:
# Download the NLTK resources this notebook relies on.
# force=True is intentionally not passed: without it nltk.download() skips
# packages that are already installed and up to date, so re-running the cell
# does not fetch everything again.
# 'punkt_tab' is included because the preprocessing cell needs it for
# word_tokenize on recent NLTK releases.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
):
    nltk.download(nltk_resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Colab-specific helper for attaching Google Drive to the runtime
from google.colab import drive

# Directory under which the Drive contents are mounted
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Replace these with the matching file paths on your Google Drive
fake_data = pd.read_csv('/content/drive/MyDrive/Dataset/Fake.csv')
true_data = pd.read_csv('/content/drive/MyDrive/Dataset/True.csv')

# Show the first few rows of each dataset
for frame in (fake_data, true_data):
    print(frame.head())

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept t

In [None]:
# Add labels to the datasets
fake_data['label'] = 0  # Fake news label
true_data['label'] = 1  # True news label

# Combine both datasets, then shuffle the rows.
# A fixed random_state makes the shuffled order identical on every run, so
# anything derived from the row order stays reproducible after a kernel restart.
data = (
    pd.concat([fake_data, true_data], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview data
data.head()


Unnamed: 0,title,text,subject,date,label
0,Couple Gives Up Buying $1 Million Boat To Sen...,The parents at a Southern California elementar...,News,"February 8, 2016",0
1,"Trump praises release of U.S.-Canadian family,...",WASHINGTON (Reuters) - U.S. President Donald T...,worldnews,"October 12, 2017",1
2,"Brace for a UK election next year, opposition ...",LONDON (Reuters) - Opposition Labour Party lea...,worldnews,"December 19, 2017",1
3,Britain preparing to transfer 400 million poun...,LONDON (Reuters) - Britain is preparing to tra...,worldnews,"November 16, 2017",1
4,BREAKING NEWS: Senator John McCain Diagnosed W...,The McCain family has requested everyone s pra...,politics,"Jul 19, 2017",0


In [None]:
# Make sure every NLTK resource needed for preprocessing is available.
# force=True is intentionally not passed: without it nltk.download() skips
# packages that are already installed and up to date instead of re-fetching
# them each time this cell runs.
for nltk_resource in (
    'punkt',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'punkt_tab',  # tokenizer tables required by word_tokenize
):
    nltk.download(nltk_resource)

# Initialize NLTK tools (shared by preprocess_text)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function for text preprocessing
def preprocess_text(text):
    """Normalize a raw document into a space-separated string of lemmas.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic or that appear in ``stop_words`` are dropped,
    and the remaining tokens are lemmatized with the module-level
    ``lemmatizer`` (no part-of-speech argument is passed).

    Args:
        text: The raw document. Non-string values (e.g. the ``NaN``/``None``
            that pandas uses for missing cells) and blank strings are treated
            as empty documents.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``""`` when there
        is nothing to process.
    """
    # Missing cells arrive as float NaN / None, which have no .lower();
    # return an empty document instead of raising AttributeError mid-apply.
    if not isinstance(text, str) or not text.strip():
        return ""

    # Lowercase first so the stop-word lookup is case-insensitive
    tokens = word_tokenize(text.lower())

    # Drop punctuation/numeric tokens and stop words, then lemmatize the rest
    cleaned_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

    return " ".join(cleaned_tokens)

# Run every article body through preprocess_text and keep the result in a new column
data['cleaned_text'] = data['text'].map(preprocess_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Define the target variable
y = data['label']

# Split the *text* into training and testing sets before any vectorization.
# Fitting TF-IDF on the full corpus would let the test documents influence the
# vocabulary and IDF weights (data leakage) and inflate the reported metrics.
text_train, text_test, y_train, y_test = train_test_split(
    data['cleaned_text'], y, test_size=0.2, stratify=y, random_state=42
)

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Learn vocabulary/IDF from the training split only, then apply it to the test split
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus feature matrix, kept so later cells that reference X keep working.
# It is produced with the training-fitted vectorizer, so it adds no leakage.
X = vectorizer.transform(data['cleaned_text'])


In [None]:
# Build the logistic-regression classifier (capped at 200 solver iterations)
# and fit it on the training split in one step; fit() returns the estimator.
model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Display the fitted estimator as the cell output
model


In [None]:
# Run the trained classifier on the held-out split
y_pred = model.predict(X_test)

# Score the predictions with each metric, in the order they are reported
acc, prec, rec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Summary table
banner = "="*50
print(banner, "MODEL PERFORMANCE METRICS", banner, sep="\n")
print(f"Accuracy:  {acc:.4f} ({acc*100:.2f}%)")
print(f"Precision: {prec:.4f} ({prec*100:.2f}%)")
print(f"Recall:    {rec:.4f} ({rec*100:.2f}%)")
print(f"F1-Score:  {f1:.4f} ({f1*100:.2f}%)")
print(banner)

# Per-class breakdown
print("\nDETAILED CLASSIFICATION REPORT:")
class_names = ['Fake News', 'True News']
print(classification_report(y_test, y_pred, target_names=class_names))


MODEL PERFORMANCE METRICS
Accuracy:  0.9865 (98.65%)
Precision: 0.9831 (98.31%)
Recall:    0.9888 (98.88%)
F1-Score:  0.9859 (98.59%)

DETAILED CLASSIFICATION REPORT:
              precision    recall  f1-score   support

   Fake News       0.99      0.98      0.99      4696
   True News       0.98      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [None]:
# Persist the classifier and the vectorizer side by side as pickle files
for filename, artifact in (
    ("logreg_model.pkl", model),
    ("tfidf_vectorizer.pkl", vectorizer),
):
    with open(filename, "wb") as handle:
        pickle.dump(artifact, handle)

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.


In [None]:
# Load the model and vectorizer
def load_model(model_path="logreg_model.pkl", vectorizer_path="tfidf_vectorizer.pkl"):
    """Load a pickled classifier and its matching vectorizer from disk.

    Args:
        model_path: Path of the pickled model file. Defaults to the file name
            used when the model is saved in this notebook.
        vectorizer_path: Path of the pickled vectorizer file. Defaults to the
            file name used when the vectorizer is saved in this notebook.

    Returns:
        tuple: ``(model, vectorizer)`` as unpickled from the two files.

    Raises:
        FileNotFoundError: If either file does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load artifacts that were produced by a trusted source.
    with open(model_path, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    with open(vectorizer_path, "rb") as vectorizer_file:
        loaded_vectorizer = pickle.load(vectorizer_file)
    return loaded_model, loaded_vectorizer

# Rebind the notebook-level `model` and `vectorizer` to the objects read back from disk
model, vectorizer = load_model()


In [None]:
# Example input to classify
sample_headline = "Breaking news: World leaders meet for emergency climate summit"

# Apply the same cleaning as the training data, then vectorize.
# transform() expects an iterable of documents, hence the one-element list.
sample_headline_clean = preprocess_text(sample_headline)
sample_vectorized = vectorizer.transform([sample_headline_clean])

# predict() returns one label per input document
prediction = model.predict(sample_vectorized)

# Report the verdict for the single document that was passed in (label 1 = true news)
print("This headline is TRUE." if prediction[0] == 1 else "This headline is FAKE.")


This headline is FAKE.
