In [13]:
import pandas as pd
true_df = pd.read_csv("C:\\Users\\AKHIL\\Documents\\Fake News Project\\True.csv")
fake_df = pd.read_csv("C:\\Users\\AKHIL\\Documents\\Fake News Project\\Fake.csv")
true_df["label"] = 1
fake_df["label"] = 0
df = pd.concat([true_df, fake_df], ignore_index=True)
print(df["label"].value_counts())
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

label
0    23481
1    21417
Name: count, dtype: int64


Unnamed: 0,title,text,subject,date,label
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",0
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",0
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",0
3,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",1
4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",0


In [3]:
df.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [4]:
df.shape

(44898, 5)

In [5]:
df.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [6]:
df = df[['title', 'text', 'label']]
df['content'] = df['title'] + " " + df['text']
df = df[['content', 'label']]
df.head()


Unnamed: 0,content,label
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,0
1,Failed GOP Candidates Remembered In Hilarious...,0
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,0
3,California AG pledges to defend birth control ...,1
4,AZ RANCHERS Living On US-Mexico Border Destroy...,0


In [8]:
import nltk
import re
from nltk.corpus import stopwords

nltk.download('stopwords')

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df['cleaned_content'] = df['content'].apply(preprocess_text)
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AKHIL/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,content,label,cleaned_content
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,0,breaking gop chairman grassley enough demands ...
1,Failed GOP Candidates Remembered In Hilarious...,0,failed gop candidates remembered hilarious moc...
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,0,mike pence new dc neighbors hilariously trolli...
3,California AG pledges to defend birth control ...,1,california ag pledges defend birth control ins...
4,AZ RANCHERS Living On US-Mexico Border Destroy...,0,az ranchers living us mexico border destroy na...


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000) 
X = tfidf.fit_transform(df['cleaned_content']).toarray() 
y = df['label']
print(X.shape, y.shape)


(44898, 5000) (44898,)


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(35918, 5000) (8980, 5000) (35918,) (8980,)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))


Accuracy: 0.9887527839643653
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4669
           1       0.99      0.99      0.99      4311

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [12]:
import pickle
with open("fake_news_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [14]:
print(model.predict(tfidf.transform(["NASA successfully lands rover on Mars"])))  # Expected: 1 (Real)
print(model.predict(tfidf.transform(["Aliens found on the dark side of the moon!"])))  # Expected: 0 (Fake)

[0]
[0]


In [15]:
from sklearn.utils import resample

df_fake_downsampled = df[df["label"] == 0].sample(n=21417, random_state=42)
df_balanced = pd.concat([df_fake_downsampled, df[df["label"] == 1]])

print(df_balanced["label"].value_counts()) 


label
0    21417
1    21417
Name: count, dtype: int64


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df_balanced["text"]) 
y = df_balanced["label"]

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [21]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [22]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print(f"New Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

New Accuracy: 0.9875102136103654
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4288
           1       0.99      0.99      0.99      4279

    accuracy                           0.99      8567
   macro avg       0.99      0.99      0.99      8567
weighted avg       0.99      0.99      0.99      8567



In [23]:
test_texts = [
    "NASA successfully lands rover on Mars",  # Real
    "Aliens found on the dark side of the moon!",  # Fake
    "Government announces new tax policies for 2025",  # Real
    "Elon Musk reveals time travel technology"  # Fake
]

predictions = model.predict(tfidf.transform(test_texts))

for text, pred in zip(test_texts, predictions):
    print(f"'{text}' → {'Real' if pred == 1 else 'Fake'}")

'NASA successfully lands rover on Mars' → Fake
'Aliens found on the dark side of the moon!' → Fake
'Government announces new tax policies for 2025' → Real
'Elon Musk reveals time travel technology' → Fake


In [24]:
print(df_balanced['label'].value_counts())

label
0    21417
1    21417
Name: count, dtype: int64


In [25]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
print(f"New Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))


New Accuracy: 0.9978989144391269
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4288
           1       1.00      1.00      1.00      4279

    accuracy                           1.00      8567
   macro avg       1.00      1.00      1.00      8567
weighted avg       1.00      1.00      1.00      8567



In [26]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,3))
X_tfidf = tfidf.fit_transform(df_balanced["text"])

In [27]:
import pickle

with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [28]:
print(y_train.value_counts())
print(y_test.value_counts())


label
1    17138
0    17129
Name: count, dtype: int64
label
0    4288
1    4279
Name: count, dtype: int64


In [29]:
test_texts = [
    "NASA announces new space mission for 2025.",
    "Aliens discovered on the dark side of the moon!",
    "Government increases healthcare funding.",
    "Elon Musk reveals working time machine."
]

test_vectors = tfidf.transform(test_texts)  
predictions = model.predict(test_vectors)  
print(predictions)  

[0 0 0 0]


In [30]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4288
           1       0.99      0.99      0.99      4279

    accuracy                           0.99      8567
   macro avg       0.99      0.99      0.99      8567
weighted avg       0.99      0.99      0.99      8567



In [31]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)


In [35]:
column_name = "text"  # Change this to the column you're trying to access
if column_name in df.columns:
    print(df[column_name].head())
else:
    print(f"Column '{column_name}' not found!")


0    Donald Trump s White House is in chaos, and th...
1    Now that Donald Trump is the presumptive GOP n...
2    Mike Pence is a huge homophobe. He supports ex...
3    SAN FRANCISCO (Reuters) - California Attorney ...
4    Twisted reasoning is all that comes from Pelos...
Name: text, dtype: object


In [33]:
print(df.columns)


Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')


In [34]:
df.columns = df.columns.str.strip()


In [40]:
print(df['text'].head(5))  
print(df['cleaned_text'].head(5))


0    Donald Trump s White House is in chaos, and th...
1    Now that Donald Trump is the presumptive GOP n...
2    Mike Pence is a huge homophobe. He supports ex...
3    SAN FRANCISCO (Reuters) - California Attorney ...
4    Twisted reasoning is all that comes from Pelos...
Name: text, dtype: object
0    donald trump s white house is in chaos and the...
1    now that donald trump is the presumptive gop n...
2    mike pence is a huge homophobe he supports ex ...
3    san francisco reuters california attorney gene...
4    twisted reasoning is all that comes from pelos...
Name: cleaned_text, dtype: object


In [39]:
print(df.columns)  # List all columns in DataFrame


Index(['title', 'text', 'subject', 'date', 'label', 'cleaned_text'], dtype='object')


In [38]:
import re

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    return text.strip()

df['cleaned_text'] = df['text'].apply(preprocess_text)  # Create the column

print(df.head())  # Check if the column is added


                                               title  \
0   BREAKING: GOP Chairman Grassley Has Had Enoug...   
1   Failed GOP Candidates Remembered In Hilarious...   
2   Mike Pence’s New DC Neighbors Are HILARIOUSLY...   
3  California AG pledges to defend birth control ...   
4  AZ RANCHERS Living On US-Mexico Border Destroy...   

                                                text       subject  \
0  Donald Trump s White House is in chaos, and th...          News   
1  Now that Donald Trump is the presumptive GOP n...          News   
2  Mike Pence is a huge homophobe. He supports ex...          News   
3  SAN FRANCISCO (Reuters) - California Attorney ...  politicsNews   
4  Twisted reasoning is all that comes from Pelos...      politics   

               date  label                                       cleaned_text  
0     July 21, 2017      0  donald trump s white house is in chaos and the...  
1       May 7, 2016      0  now that donald trump is the presumptive gop n...  
2 

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text']).toarray()  
y = df['label'].values 
print(X.shape, y.shape)


(44898, 5000) (44898,)


In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)  # Check shapes


(35918, 5000) (8980, 5000) (35918,) (8980,)


In [43]:
from sklearn.linear_model import LogisticRegression

# Initialize the model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Save the trained model
import pickle
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

print("Model training complete & saved!")

Model training complete & saved!


In [44]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9876391982182628
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4696
           1       0.98      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

