In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Step 1: Load Dataset
# The commented-out URL below appears to be a remote copy of the same CSV
# (kept for reference); the local copy is what actually gets read.
#url = "https://raw.githubusercontent.com/Asifkhan180/Task-2-Detecting-fake-news/main/fake_or_real_news.csv"
data = pd.read_csv("C:\\Users\\DELL\\Downloads\\fake_or_real_news.csv")

# Inspect the dataset
print("First few rows of the dataset:")
print(data.head())
print("\nDataset info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
data.info()
print("\nLabel distribution:")
print(data['label'].value_counts())

# Step 2: Preprocess the Data
# Features and labels
X = data['text']
y = data['label']

# Split the raw text BEFORE vectorizing. Fitting TF-IDF on the full corpus
# lets the held-out rows influence the vocabulary and IDF weights, which
# leaks test-set information into training and can bias the evaluation.
# Same test_size/random_state as before, so the row assignment is unchanged.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Vectorize the text data: learn vocabulary/IDF from the training split only,
# then apply the already-fitted vectorizer to the test split.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# Full-corpus matrix under its original name, kept so any later code that
# still references X_vectorized keeps working. It is transformed with the
# train-fitted vectorizer and is not used for training or evaluation here.
X_vectorized = tfidf.transform(X)

# Step 3: Train Logistic Regression Model
# Build the classifier with default settings and fit it in one chained call
# (fit returns the fitted estimator itself).
model = LogisticRegression().fit(X_train, y_train)

# Evaluate the model on the test split
y_pred = model.predict(X_test)
print("\nModel Evaluation:")
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("\nClassification Report:")
test_report = classification_report(y_test, y_pred)
print(test_report)

# Step 4: Save the Model and Vectorizer
# Pickle each fitted object to its own file (paths are relative to the
# current working directory), model first, then vectorizer.
for artifact_path, artifact in (
    ("logistic_model.pkl", model),
    ("tfidf_vectorizer.pkl", tfidf),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("\nModel and vectorizer saved successfully!")

# Step 5: Predict User Input
# Load the model and vectorizer
def _load_pickle(path):
    """Open *path* in binary mode and return the object unpickled from it."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# NOTE: unpickling executes code embedded in the file; only load pickle
# files that were produced by a trusted process (here, the save step above).
loaded_model = _load_pickle("logistic_model.pkl")
loaded_tfidf = _load_pickle("tfidf_vectorizer.pkl")

# Function for user input prediction
def predict_news(news_input):
    """Classify one piece of news text and return a readable verdict.

    The text is wrapped in a one-element list, transformed with the
    module-level ``loaded_tfidf`` and scored with the module-level
    ``loaded_model``.

    Args:
        news_input: The news text to classify.

    Returns:
        ``"The news is likely FAKE."`` when the first predicted label equals
        ``"FAKE"``; ``"The news is likely REAL."`` for any other label.
    """
    features = loaded_tfidf.transform([news_input])
    predicted_label = loaded_model.predict(features)[0]
    return (
        "The news is likely FAKE."
        if predicted_label == "FAKE"
        else "The news is likely REAL."
    )

# Take user input: prompt on stdin, classify the entered text, show the verdict
news_input = input("\nEnter the news text to classify: ")
verdict = predict_news(news_input)
print(verdict)

First few rows of the dataset:
   Unnamed: 0                                              title  \
0        8476                       You Can Smell Hillary’s Fear   
1       10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2        3608        Kerry to go to Paris in gesture of sympathy   
3       10142  Bernie supporters on Twitter erupt in anger ag...   
4         875   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  