In [1]:
import numpy as np
import pandas as pd
import re
import pickle
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the dataset
news_df = pd.read_csv('train.csv')

# Fill missing values
news_df = news_df.fillna(' ')

# Combine 'author' and 'title' into a new 'content' column
news_df['content'] = news_df['author'] + " " + news_df['title']

# Stemming
ps = PorterStemmer()
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [ps.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

news_df['content'] = news_df['content'].apply(stemming)

# Prepare the data
X = news_df['content'].values
y = news_df['label'].values

# Vectorization
vector = TfidfVectorizer()
X = vector.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)

# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model
train_y_pred = model.predict(X_train)
print("Train accuracy:", accuracy_score(train_y_pred, y_train))

test_y_pred = model.predict(X_test)
print("Test accuracy:", accuracy_score(test_y_pred, y_test))

# Save the model and vectorizer to .pkl files
with open('logistic_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vector, vectorizer_file)

# Load the model and vectorizer from .pkl files
with open('logistic_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    loaded_vector = pickle.load(vectorizer_file)

# Prediction system
input_data = X_test[20]
prediction = loaded_model.predict(input_data)
if prediction[0] == 1:
    print('Fake news')
else:
    print('Real news')

print(news_df['content'][20])


Train accuracy: 0.9868389423076923
Test accuracy: 0.9766826923076923
Fake news
news hope gop nude paul ryan emerg ayahuasca tent vision new republican parti
