In [1]:
# Import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk



In [2]:
# Download the NLTK stop words and tokenizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the dataset
df = pd.read_csv('fake_news_corpus.csv', nrows=1000000)

df.describe()

df.head(10)


  df = pd.read_csv('fake_news_corpus.csv', nrows=1000000)


Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,0,2,express.co.uk,rumor,https://www.express.co.uk/news/science/738402/...,"Life is an illusion, at least on a quantum lev...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Is life an ILLUSION? Researchers prove 'realit...,Sean Martin,,[''],THE UNIVERSE ceases to exist when we are not l...,,,
1,1,6,barenakedislam.com,hate,http://barenakedislam.com/category/donald-trum...,"Unfortunately, he hasn’t yet attacked her for ...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",,[''],,,,
2,2,7,barenakedislam.com,hate,http://barenakedislam.com/category/donald-trum...,The Los Angeles Police Department has been den...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",,[''],,,,
3,3,8,barenakedislam.com,hate,http://barenakedislam.com/2017/12/24/more-winn...,The White House has decided to quietly withdra...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"MORE WINNING! Israeli intelligence source, DEB...","Cleavis Nowell, Cleavisnowell, Clarence J. Fei...",,[''],,,,
4,4,9,barenakedislam.com,hate,http://barenakedislam.com/2017/12/25/oh-trump-...,“The time has come to cut off the tongues of t...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"“Oh, Trump, you coward, you just wait, we will...","F.N. Lehner, Don Spilman, Clarence J. Feinour,...",,[''],,,,
5,5,10,barenakedislam.com,hate,http://barenakedislam.com/2017/12/27/following...,The Central American nation and six other stat...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Following Guatemala’s decision to move its emb...,"Pamela Jeffersons, Cleavis Nowell, Cleavisnowe...",,[''],,,,
6,6,11,blackagendareport.com,unreliable,https://blackagendareport.com/un-backed-police...,UN-Backed Police Massacred Haitians With Impun...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,UN-Backed Police Massacred Haitians With Impunity,"Akinyele Umoja, Glen Ford, Bar Executive Edito...",,[''],"“When the police finally left the campus, arou...",,,
7,7,13,blackagendareport.com,unreliable,https://blackagendareport.com/articlelist/inte...,It should have come as no surprise that the sa...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Black Agenda Report,"Nellie Bailey, Glen Ford, Margaret Kimberley, ...",,[''],,,,
8,8,14,blackagendareport.com,unreliable,https://blackagendareport.com/articlelist/inte...,"“When the police finally left the campus, arou...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Black Agenda Report,"Ann Garrison, Bar Contributor, Ken Morgan, Jem...",,[''],,,,
9,9,15,blackagendareport.com,unreliable,https://blackagendareport.com/zambia-must-clar...,Zambia Must Clarify Whether It Will Host Israe...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Zambia Must Clarify Whether It Will Host Israe...,"Glen Ford, Bar Executive Editor, Margaret Kimb...",,[''],“Israel since 1948 has collaborated with the s...,,,


In [7]:
# Filter relevant columns (content and type)
df = df[['content', 'type']]

# Remove records with missing data or other types of news that are not 'fake' or 'reliable'
df = df.dropna()
df = df[(df['type'] == 'fake') | (df['type'] == 'reliable')]

df['type'].unique()


array(['fake', 'reliable'], dtype=object)

In [8]:
# Count the occurrences of each class
class_counts = df['type'].value_counts()

# Determine the minimum count of the classes
min_count = class_counts.min()

# Sample an equal number of instances from each class
balanced_df = df.groupby('type').apply(lambda x: x.sample(n=min_count, random_state=42)).reset_index(drop=True)

# Data Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'\W', ' ', str(text))
    text = text.lower()
    text = re.sub(r'^br$', ' ', text)
    text = re.sub(r'\s+br\s+', ' ', text)
    text = re.sub(r'\s+[a-z]\s+', ' ', text)
    text = re.sub(r'^b\s+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

balanced_df['clean_content'] = balanced_df['content'].apply(lambda x: clean_text(x))
balanced_df['clean_content'] = balanced_df['clean_content'].apply(lambda x: lemmatize_text(x))

# Split the balanced data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(balanced_df['clean_content'], balanced_df['type'], test_size=0.2, random_state=42)

# Initialize a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit and transform the training set, transform the test set
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# Initialize a LogisticRegression classifier
classifier = LogisticRegression()

# Fit the classifier with the training data
classifier.fit(tfidf_train, y_train)

# Predict on the test set
predictions = classifier.predict(tfidf_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, predictions)

print('Model Accuracy:', accuracy)

# Show the confusion matrix
confusion_mat = confusion_matrix(y_test, predictions)
print('Confusion Matrix:\n', confusion_mat)

Model Accuracy: 0.9555765595463138
Confusion Matrix:
 [[508  32]
 [ 15 503]]
