In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import re
import nltk
from nltk.corpus import stopwords 
import pickle

In [13]:
df = pd.read_csv('../Datasets/sortedDataFrame.csv')

In [14]:
# Find indices where 'text' column is null
null_text_indices = df[df['text'].isnull()].index

# Delete rows with null 'text'
df.drop(null_text_indices, inplace=True)

In [15]:
# Initialize TfidfVectorizer for text features
tfidf_vectorizer_text = TfidfVectorizer()
tfidf_vectorizer_combined_sentiment = TfidfVectorizer()

# Fit and transform the 'text' feature
X_text_features_text = tfidf_vectorizer_text.fit_transform(df['text'])

# Fit and transform the 'combined_sentiment' feature
X_text_features_combined_sentiment = tfidf_vectorizer_combined_sentiment.fit_transform(df['combined_sentiment'])

# Combine text features into one feature matrix
X = pd.concat([pd.DataFrame(X_text_features_text.toarray()), 
               pd.DataFrame(X_text_features_combined_sentiment.toarray())], axis=1)

# Target variable
y = df['label']

# Split the df into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Selection and Training
rf_model = RandomForestClassifier()  # Example, you can try different models
rf_model.fit(X_train, y_train)

# Model Evaluation
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)

Random Forest Accuracy: 0.8960317460317461


In [16]:
#Ukranian news about Russia
path = "../Datasets/test.txt"
with open(path, 'r', encoding = 'utf-8') as file:
    sample_text = file.read()
def prediction(text):
    review = re.sub('[^a-zA-Z]', ' ', text)
    review = review.lower().split()
    temp = [word for word in review if word not in stopwords.words('english')]
    text_string = ' '.join(temp)
    return text_string

text = prediction(sample_text)

tfidf_sampletext = tfidf_vectorizer_text.transform([text])
tfidf_sentiment = tfidf_vectorizer_combined_sentiment.transform([text])

test_features = pd.concat([pd.DataFrame(tfidf_sampletext.toarray()), 
                           pd.DataFrame(tfidf_sentiment.toarray())], axis=1)
samplePred = rf_model.predict(test_features)

print("Predicted Label: ", samplePred)

Predicted Label:  ['FAKE']


In [17]:
#CNN news about America
path = "../Datasets/test1.txt"
with open(path, 'r', encoding = 'utf-8') as file:
    sample_text = file.read()
def prediction(text):
    review = re.sub('[^a-zA-Z]', ' ', text)
    review = review.lower().split()
    stemmed_review = [word for word in review if word not in stopwords.words('english')]
    text_string = ' '.join(stemmed_review)
    return text_string

text = prediction(sample_text)

tfidf_sampletext = tfidf_vectorizer_text.transform([text])
tfidf_sentiment = tfidf_vectorizer_combined_sentiment.transform([text])

test_features = pd.concat([pd.DataFrame(tfidf_sampletext.toarray()), 
                           pd.DataFrame(tfidf_sentiment.toarray())], axis=1)
samplePred = rf_model.predict(test_features)

print("Predicted Label: ", samplePred)

Predicted Label:  ['REAL']


In [19]:
#Dumping model as pkl
with open('FakeNewsModel_V3.pkl', 'wb') as f:
    pickle.dump((rf_model,tfidf_vectorizer_text,tfidf_vectorizer_combined_sentiment),f)

