In [5]:
# Step 1: Upload files directly
from google.colab import files
uploaded = files.upload()  # Select Fake.csv and True.csv from your PC


Saving Fake.csv to Fake.csv


In [6]:
# Step 1: Upload files directly
from google.colab import files
uploaded = files.upload()  # Select Fake.csv and True.csv from your PC


Saving True.csv to True.csv


**Load them into Pandas**

In [7]:
# Step 2: Load them into pandas
import pandas as pd

fake_df = pd.read_csv("Fake.csv")
real_df = pd.read_csv("True.csv")

# Step 3: Add labels and combine
fake_df['label'] = 'FAKE'
real_df['label'] = 'REAL'

df = pd.concat([fake_df, real_df])
df = df[['title', 'text', 'label']]

# Step 4: Preview
df.head()


Unnamed: 0,title,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,FAKE
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,FAKE
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",FAKE
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",FAKE
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,FAKE


**Full Preprocessing Code**

In [8]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Download stopwords
nltk.download('stopwords')

# Initialize tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Preprocessing function
def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Tokenize and remove stopwords
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply to text column
df['text'] = df['text'].apply(clean_text)

# Preview cleaned text
df['text'].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,text
0,donald trump wish american happi new year leav...
1,hous intellig committe chairman devin nune go ...
2,friday reveal former milwauke sheriff david cl...
3,christma day donald trump announc would back w...
4,pope franci use annual christma day messag reb...


**Code for TF-IDF & Train-Test Split:**

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_df=0.7)

# Fit and transform the text column
X = tfidf.fit_transform(df['text'])

# Target labels
y = df['label']

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


**Train a Machine Learning Model**

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9878619153674832

Classification Report:
               precision    recall  f1-score   support

        FAKE       0.99      0.99      0.99      4733
        REAL       0.99      0.99      0.99      4247

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Confusion Matrix:
 [[4671   62]
 [  47 4200]]


**Train and Compare More Models**

In [11]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_pred))


Naive Bayes Accuracy: 0.9501113585746103


In [12]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))


Random Forest Accuracy: 0.9903118040089087


In [13]:
from sklearn.svm import LinearSVC

svm_model = LinearSVC()
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, svm_pred))


SVM Accuracy: 0.9964365256124722


In [14]:
print("\nModel Comparison:")
print("Logistic Regression:", accuracy_score(y_test, y_pred))
print("Naive Bayes:", accuracy_score(y_test, nb_pred))
print("Random Forest:", accuracy_score(y_test, rf_pred))
print("SVM:", accuracy_score(y_test, svm_pred))



Model Comparison:
Logistic Regression: 0.9878619153674832
Naive Bayes: 0.9501113585746103
Random Forest: 0.9903118040089087
SVM: 0.9964365256124722


In [15]:
import pickle

# Save trained model
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save your TF-IDF vectorizer (or CountVectorizer)
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
