In [None]:
!pip install pandas numpy scikit-learn


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle


In [4]:
# Location of the dataset CSV -- edit this to wherever news.csv lives on your machine
data_path = r"D:\Chinmay\ML PROJECTS\Fake News Detection\news.csv"

# Read the CSV into a DataFrame
df = pd.read_csv(data_path)

# Preview the first five rows as a sanity check on the loaded columns
df.head(n=5)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [6]:
# Text label -> numeric class used by the classifier
label_map = {'FAKE': 0, 'REAL': 1}


def _encode_label(value):
    """Return the numeric class for a label, or NaN when it is unrecognised.

    Values that are already 0 or 1 are passed through unchanged, so re-running
    this cell on an already-encoded frame does not turn every label into NaN
    (which would make the dropna below discard the whole dataset).
    """
    if value in label_map:
        return label_map[value]
    return value if value in (0, 1) else np.nan


# Remove any rows with missing values (rebind rather than mutate in place)
df = df.dropna()

# Map the text labels to numerical values: 'FAKE' -> 0, 'REAL' -> 1.
# assign() builds a new frame instead of writing a column into the existing one.
df = df.assign(label=df['label'].map(_encode_label))

# Check for any remaining missing labels (i.e. labels that could not be encoded)
print("Missing labels:", df['label'].isna().sum())

# Drop rows where label is missing (if any)
df = df.dropna(subset=['label'])

# Verify the changes
df.head()


Missing labels: 0


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1


In [8]:
# Features are the article bodies; targets are the encoded labels as an array
X, y = df['text'], df['label'].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition
print("Training samples:", len(X_train))
print("Test samples:", len(X_test))


Training samples: 5068
Test samples: 1267


In [10]:
# Build a TF-IDF vectorizer limited to 5000 features, ignoring English stop words
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# Fit on the training texts only, then apply the fitted vectorizer
# to the held-out texts
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Show the resulting matrix dimensions as (documents, features)
train_shape, test_shape = X_train_tfidf.shape, X_test_tfidf.shape
print("TF-IDF shape (training):", train_shape)
print("TF-IDF shape (test):", test_shape)


TF-IDF shape (training): (5068, 5000)
TF-IDF shape (test): (1267, 5000)


In [12]:
# Create a logistic-regression classifier with scikit-learn's default settings
model = LogisticRegression()

# Fit it on the TF-IDF training matrix and the matching labels
model.fit(X=X_train_tfidf, y=y_train)


In [14]:
# Label every held-out article with the trained model
y_pred = model.predict(X_test_tfidf)

# Overall fraction of correct predictions, shown to two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1, printed under its heading
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Counts of true labels (rows) against predicted labels (columns)
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Accuracy: 0.91

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.92      0.91       628
           1       0.92      0.91      0.91       639

    accuracy                           0.91      1267
   macro avg       0.91      0.91      0.91      1267
weighted avg       0.91      0.91      0.91      1267


Confusion Matrix:
[[579  49]
 [ 60 579]]


In [16]:
# Persist the trained model first, then the fitted vectorizer, each to its own pickle file
for artifact, filename in ((model, 'fake_news_model.pkl'), (vectorizer, 'vectorizer.pkl')):
    with open(filename, 'wb') as out_file:
        pickle.dump(artifact, out_file)


In [18]:
def predict_news(news_text, classifier=None, tfidf_vectorizer=None):
    """Classify a single news text as "REAL" or "FAKE".

    Parameters
    ----------
    news_text : str
        The article text to classify.
    classifier : object, optional
        Object with a ``predict`` method. Defaults to the notebook-level
        ``model``. Pass one explicitly to use, for example, a model reloaded
        from ``fake_news_model.pkl``.
    tfidf_vectorizer : object, optional
        Object with a ``transform`` method. Defaults to the notebook-level
        ``vectorizer``. It must be the vectorizer the classifier was trained
        with.

    Returns
    -------
    str
        "REAL" when the predicted label equals 1, otherwise "FAKE".
    """
    # Fall back to the in-memory notebook objects only when no override is given
    clf = model if classifier is None else classifier
    vec = vectorizer if tfidf_vectorizer is None else tfidf_vectorizer

    # The vectorizer expects an iterable of documents, so wrap the single text
    text_tfidf = vec.transform([news_text])
    # predict returns one label per document; take the only one
    prediction = clf.predict(text_tfidf)[0]
    # Return the result as human-readable text
    return "REAL" if prediction == 1 else "FAKE"

# Smoke-test the helper on a deliberately implausible headline
example_text = "Breaking news: The world is flat!"
print("Prediction for example text:", predict_news(example_text), sep="\n")


Prediction for example text:
FAKE
