# In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

# Load the two source CSVs; every row of fake.csv is treated as a fake
# article and every row of true.csv as a real one.
fake = pd.read_csv('/kaggle/input/fake-news-detection/fake.csv')
real = pd.read_csv('/kaggle/input/fake-news-detection/true.csv')

# Record the class on each row before merging, so the origin of a row
# is still known once both frames are stacked together.
fake['label'] = 'FAKE'
real['label'] = 'REAL'

# Combine both datasets into one dataframe; ignore_index=True renumbers the
# rows 0..n-1 instead of keeping two overlapping per-file indexes.
data = pd.concat([fake, real], ignore_index=True)

# (Optional) You may combine title and text if desired:
# data['text'] = data['title'] + " " + data['text']

# Use only the 'text' column as features and 'label' as target.
# read_csv parses empty cells as NaN (a float) and TfidfVectorizer raises on
# a NaN document, so missing article bodies are replaced with an empty string.
# fillna returns a new Series, so `data` itself is left untouched.
X = data['text'].fillna('')
y = data['label']

# Hold out 20% of the samples for evaluation. The fixed random_state makes
# the shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features built with scikit-learn's own tokenizer and English
# stop-word list (no NLTK required). The vocabulary and IDF weights are
# learned from the training split only; the test split is merely projected
# onto them so no information from it leaks into the features.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a logistic-regression classifier on the TF-IDF training matrix.
# max_iter=1000 gives the solver a larger iteration budget to converge;
# fit() returns the estimator itself, so the call can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Score the classifier on the held-out split: accuracy is the fraction of
# test samples whose predicted label matches the true label.
predictions = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Model Accuracy:", accuracy)

# Persist the fitted classifier and the fitted vectorizer as two separate
# pickle files. Both are required at inference time: new text has to be
# transformed by this same vectorizer before it is handed to the model.
for out_path, artifact in (
    ('model_updated.pkl', model),
    ('vectorizer_updated.pkl', vectorizer),
):
    with open(out_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)


# Output: Model Accuracy: 0.9865256124721603
