**DATA PREPROCESSING**

In [1]:
#step1
from google.colab import files
uploaded = files.upload()


Saving Fake.csv to Fake.csv
Saving True.csv to True.csv


In [2]:
# Step 2: Import libraries
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Step 3: Load datasets
fake_df = pd.read_csv("Fake.csv")
true_df = pd.read_csv("True.csv")


In [4]:
print(fake_df)

                                                   title  \
0       Donald Trump Sends Out Embarrassing New Year’...   
1       Drunk Bragging Trump Staffer Started Russian ...   
2       Sheriff David Clarke Becomes An Internet Joke...   
3       Trump Is So Obsessed He Even Has Obama’s Name...   
4       Pope Francis Just Called Out Donald Trump Dur...   
...                                                  ...   
23476  McPain: John McCain Furious That Iran Treated ...   
23477  JUSTICE? Yahoo Settles E-mail Privacy Class-ac...   
23478  Sunnistan: US and Allied ‘Safe Zone’ Plan to T...   
23479  How to Blow $700 Million: Al Jazeera America F...   
23480  10 U.S. Navy Sailors Held by Iranian Military ...   

                                                    text      subject  \
0      Donald Trump just couldn t wish all Americans ...         News   
1      House Intelligence Committee Chairman Devin Nu...         News   
2      On Friday, it was revealed that former Milwauk...    

In [5]:
print(true_df)

                                                   title  \
0      As U.S. budget fight looms, Republicans flip t...   
1      U.S. military to accept transgender recruits o...   
2      Senior U.S. Republican senator: 'Let Mr. Muell...   
3      FBI Russia probe helped by Australian diplomat...   
4      Trump wants Postal Service to charge 'much mor...   
...                                                  ...   
21412  'Fully committed' NATO backs new U.S. approach...   
21413  LexisNexis withdrew two products from Chinese ...   
21414  Minsk cultural hub becomes haven from authorities   
21415  Vatican upbeat on possibility of Pope Francis ...   
21416  Indonesia to buy $1.14 billion worth of Russia...   

                                                    text       subject  \
0      WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1      WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2      WASHINGTON (Reuters) - The special counsel inv... 

In [6]:
# Add labels
fake_df['label'] = 1
true_df['label'] = 0


In [7]:
# Combine datasets
df = pd.concat([fake_df, true_df], ignore_index=True)


In [9]:
# Drop unnecessary columns
df.drop(columns=['date', 'subject'], inplace=True, errors='ignore')

In [10]:
# Drop rows with missing values
df.dropna(inplace=True)


In [12]:
# Step 4: Clean text
def clean_text(text):
    text = text.lower()  # lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    words = text.split()
    words = [word for word in words if word not in stop_words]  # remove stopwords
    return ' '.join(words)

In [13]:
# Apply cleaning
df['clean_text'] = df['text'].apply(clean_text)

# Preview
df[['title', 'clean_text', 'label']].head()


Unnamed: 0,title,clean_text,label
0,Donald Trump Sends Out Embarrassing New Year’...,donald trump wish americans happy new year lea...,1
1,Drunk Bragging Trump Staffer Started Russian ...,house intelligence committee chairman devin nu...,1
2,Sheriff David Clarke Becomes An Internet Joke...,friday revealed former milwaukee sheriff david...,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,christmas day donald trump announced would bac...,1
4,Pope Francis Just Called Out Donald Trump Dur...,pope francis used annual christmas day message...,1



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [14]:
#step5 : Feature Engineering
# Feature engineering: text length
df['text_length'] = df['clean_text'].apply(lambda x: len(x.split()))


In [16]:
# Install scikit-learn
!pip install scikit-learn

#step6:Normaliztaion

# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(df['clean_text'])




In [18]:
# Normalize vectors
from sklearn.preprocessing import Normalizer # Import Normalizer

normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X_tfidf)

In [20]:
# Combine with text length
from scipy.sparse import hstack # Import hstack for sparse matrices
import numpy as np # Import numpy

X_combined = hstack([X_normalized, np.array(df['text_length']).reshape(-1, 1)])
y = df['label']

In [23]:
#step7 : splitting
# Import train_test_split
from sklearn.model_selection import train_test_split

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

In [25]:
# Output shapes
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Training set shape: (35918, 5001)
Test set shape: (8980, 5001)


**MODEL DEVELOPMENT**

In [26]:
#Train and Evaluate Models

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import xgboost as xgb


In [28]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Remove the duplicated y_proba calculation and return statement
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "ROC AUC": roc_auc_score(y_test, y_proba)
    }

In [30]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [31]:
# Evaluate each model
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    results[name] = evaluate_model(model, X_train, X_test, y_train, y_test)

# Display results
import pandas as pd
pd.DataFrame(results).T

Training Logistic Regression...
Training Random Forest...
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Logistic Regression,0.989866,0.992154,0.988591,0.990369,0.998899
Random Forest,0.998107,0.999153,0.997253,0.998202,0.999841
XGBoost,0.997884,0.999153,0.996831,0.99799,0.999868


In [32]:
!pip install mlflow


Collecting mlflow
  Downloading mlflow-2.22.1-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.1 (from mlflow)
  Downloading mlflow_skinny-2.22.1-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.1->mlflow)
  Downloading databricks_sdk-0.56.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==2.22.1->mlflow)
  Downloading opentelemetry_api-1.34.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==2.22.1->mlflow)
  Downloading op

In [33]:
!pip install mlflow --quiet

MODEL SAVING

In [35]:
# Import joblib for saving the model
import joblib

best_model = None
best_f1 = 0
best_model_name = ""

for name, model in models.items():
    # Evaluate the model using the defined evaluate_model function
    metrics = evaluate_model(model, X_train, X_test, y_train, y_test)
    f1 = metrics["F1 Score"] # Get the F1 score from the results

    # Keep track of the best model based on F1 score
    if f1 > best_f1:
        best_f1 = f1
        # Train the model again to get the trained model object
        # This is necessary because evaluate_model fits the model internally
        # and doesn't return the fitted model object directly.
        # Alternatively, modify evaluate_model to return the fitted model.
        model.fit(X_train, y_train) # Fit the model again to get the trained object
        best_model = model
        best_model_name = name

# Save the best model locally
if best_model: # Only save if a best model was found
    joblib.dump(best_model, f"{best_model_name}_best_model.pkl")
    print(f"Saved best model: {best_model_name}_best_model.pkl with F1 score: {best_f1:.4f}")
else:
    print("No models were evaluated.")

Parameters: { "use_label_encoder" } are not used.



Saved best model: Random Forest_best_model.pkl with F1 score: 0.9983


In [36]:
from google.colab import files
files.download(f"{best_model_name}_best_model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>