In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# run "mkdir -p fnd-venv/nltk_data" in your venv terminal to create the directory for these 3 nltk data files
nltk.data.path.append("fnd-venv/nltk_data")
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):

    # 1. Convert text to lowercase
    text = text.lower()

    # 2. Remove any special characters 
    text = re.sub(r'[^a-z\s]', '', text)

    # 3. Tokenize the text
    tokens = nltk.word_tokenize(text)

    # 4. Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    
    # 5. Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to c:\VSCode Codes\Fake-
[nltk_data]     News-Detection-Project\fnd-venv\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to c:\VSCode Codes\Fake-News-
[nltk_data]     Detection-Project\fnd-venv\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to c:\VSCode Codes\Fake-News-
[nltk_data]     Detection-Project\fnd-venv\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to c:\VSCode Codes\Fake-
[nltk_data]     News-Detection-Project\fnd-venv\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Preprocessing the text

Steps in order:

1. Basic Cleaning

    This comes first because:

    You want to normalize the text before breaking it into tokens.

    Things like punctuation, and casing might interfere with accurate tokenization.

    Cleaning also avoids breaking words like "can't" into ["can", "t"] incorrectly.

2. Remove any special characters

3. Tokenize the text so we can further preprocess each word

    Once the text is clean, now you:

    Split it into sentences (if needed for tasks like summarization or document-level analysis).

    Then split into words (tokens) to perform word-level operations like stopword removal and lemmatization.

4. Remove any stopwords

    Now that you have tokens, remove stopwords (like "the", "is", etc.).

5. Lemmatize each tokenized word

    Lemmatization often depends on correct token spelling and POS tags.

In [2]:
fake_text = pd.read_csv('Fake.csv')
real_text = pd.read_csv('True.csv')

Importing the csv files

In [3]:
real_text.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [4]:
real_text.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
fake_text['label'] = 1
real_text['label'] = 0

Since we are detecting 'Fake News', so the label should be '1' for the fake text and '0' for the real text

In [6]:
df = pd.concat([fake_text, real_text]).reset_index(drop=True)

combining both real and fake text into one dataframe

In [7]:
df.tail()

Unnamed: 0,title,text,subject,date,label
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0
44897,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",0


In [8]:
df = df.drop(columns=['title', 'subject', 'date'])

dropping the irrelevant columns, and keeping only the important features, ie 'text', and 'label'

In [9]:
df.head()

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,1
1,House Intelligence Committee Chairman Devin Nu...,1
2,"On Friday, it was revealed that former Milwauk...",1
3,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis used his annual Christmas Day mes...,1


In [10]:
df.tail()

Unnamed: 0,text,label
44893,BRUSSELS (Reuters) - NATO allies on Tuesday we...,0
44894,"LONDON (Reuters) - LexisNexis, a provider of l...",0
44895,MINSK (Reuters) - In the shadow of disused Sov...,0
44896,MOSCOW (Reuters) - Vatican Secretary of State ...,0
44897,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,0


In [11]:
df['cleaned_text'] = df['text'].apply(preprocess_text)

applying the preprocess_text function that we have created earlier to the dataframe 

In [12]:
df.head()

Unnamed: 0,text,label,cleaned_text
0,Donald Trump just couldn t wish all Americans ...,1,donald trump wish american happy new year leav...
1,House Intelligence Committee Chairman Devin Nu...,1,house intelligence committee chairman devin nu...
2,"On Friday, it was revealed that former Milwauk...",1,friday revealed former milwaukee sheriff david...
3,"On Christmas day, Donald Trump announced that ...",1,christmas day donald trump announced would bac...
4,Pope Francis used his annual Christmas Day mes...,1,pope francis used annual christmas day message...


In [13]:
df.tail()

Unnamed: 0,text,label,cleaned_text
44893,BRUSSELS (Reuters) - NATO allies on Tuesday we...,0,brussels reuters nato ally tuesday welcomed pr...
44894,"LONDON (Reuters) - LexisNexis, a provider of l...",0,london reuters lexisnexis provider legal regul...
44895,MINSK (Reuters) - In the shadow of disused Sov...,0,minsk reuters shadow disused sovietera factory...
44896,MOSCOW (Reuters) - Vatican Secretary of State ...,0,moscow reuters vatican secretary state cardina...
44897,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,0,jakarta reuters indonesia buy sukhoi fighter j...


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['label']

applying TfidfVectorize to transform its representation before training it to become a model later 

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

LOGISTIC REGRESSION

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred_lr))
print("\n")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("\n")
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Accuracy:  0.9881959910913141


Confusion Matrix:
 [[4199   48]
 [  58 4675]]


Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4247
           1       0.99      0.99      0.99      4733

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



SVM

In [None]:
# svm

from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred_svm))
print("\n")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\n")
print("Classification Report:\n", classification_report(y_test, y_pred_svm))

Accuracy:  0.9947661469933184


Confusion Matrix:
 [[4227   20]
 [  27 4706]]


Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      4247
           1       1.00      0.99      1.00      4733

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [18]:
from sklearn.metrics import roc_auc_score

roc_auc_lr = roc_auc_score(y_test, y_pred_lr)
roc_auc_svm = roc_auc_score(y_test, y_pred_svm)

print("ROC AUC Score for Logistic Regression: ", roc_auc_lr)
print("ROC AUC Score for SVM: ", roc_auc_svm)

ROC AUC Score for Logistic Regression:  0.9882217601457756
ROC AUC Score for SVM:  0.9947930832074403


NAIVE BAYES

In [19]:
# import naive bayes
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred_nb))
print("\n")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("\n")
print("Classification Report:\n", classification_report(y_test, y_pred_nb))
print("\n")
print("ROC AUC Score for MNB: ", roc_auc_score(y_test, y_pred_nb))

Accuracy:  0.9289532293986636


Confusion Matrix:
 [[3894  353]
 [ 285 4448]]


Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.92      0.92      4247
           1       0.93      0.94      0.93      4733

    accuracy                           0.93      8980
   macro avg       0.93      0.93      0.93      8980
weighted avg       0.93      0.93      0.93      8980



ROC AUC Score for MNB:  0.9283334985817407


Testing on unseen data using the most accurate model created

In [20]:
unseen_text = """

	Germany’s chancellor Friedrich Merz has backed Ukrainian military strikes deep into Russian territory, following his earlier commitment to supply Kyiv with German long-range missiles.

Germany, Britain, France and the US have removed all range restrictions for weapons delivered to Ukraine, Merz said on Monday.

Paris, London and Washington have supplied long-range missiles to Kyiv and have already allowed strikes in Russian territory.

But Merz’s stance contrasts with that of his Social Democratic predecessor Olaf Scholz, whom he replaced this month.

The former chancellor repeatedly rejected pleas from Kyiv and its allies to supply the Ukrainian armed forces with German Taurus missiles, which have an intelligent warhead system that can inflict huge damage to structures such as bridges and bunkers. 

While Merz has decided to stop disclosing weapon deliveries since taking office — a stance in line with the “strategic ambiguity” approach of French President Emmanuel Macron — he had previously said he favoured deliveries of Taurus missiles to Kyiv if they were co-ordinated with European allies.

“There is no longer any range restriction on weapons delivered to Ukraine, neither by the British, nor by the French, nor by us, nor by the Americans,” Merz said at a conference in Berlin on Monday.

He added: “This means that Ukraine can now defend itself, for example by attacking military positions in Russia. It couldn’t do that until some time ago, it did do that with very few exceptions. [Ukraine] can do that now. In jargon we call this long range fire.”

The centre-right chancellor made his comments after three days of Russian air attacks on Ukraine that involved more than 1,000 drone and missile strikes.

Dmitry Peskov, President Vladimir Putin’s spokesman, said that decisions to give Ukraine longer-range missiles were “potentially dangerous” and could frustrate “attempts to reach a settlement” in Ukraine, according to state newswire Tass.

The 500km-plus range of the Taurus cruise missile is further than the Storm Shadows and Scalps supplied by the British and French and the Army Tactical Missile System (Atacms) provided by the US.

Those British, French and American missiles were first used against military targets inside Russia’s Bryansk and Kursk regions in November, when the respective governments quietly lifted geographical restrictions on their use.

Separately, Kyiv has also developed its own Neptune long-range missiles, as well as drones that target Russian territory.

Merz’s comments come as European leaders are racing to come up with a plan to increase pressure on Moscow, after US President Donald Trump signalled he is inclined to leave them to sort the conflict among themselves.

UK Prime Minister Keir Starmer will meet Merz in Aachen in western Germany on Thursday, where EU commission president Ursula von der Leyen is receiving the Charlemagne prize.

British officials said Starmer would talk to Merz about increasing economic pressure on Russia and ensuring Ukraine has the financial and military support it needs to maintain the fight.

Ukraine President Volodymyr Zelenskyy is due to travel to Berlin on Wednesday, according to Der Spiegel.

With Washington’s commitment to Ukraine’s security seemingly waning, the French, British and German leaders are increasingly divided over how much military support can be provided to Ukraine after the war.

France and the UK, the two instigators of a so-called “coalition of the willing”, have insisted that an initial plan of deploying troops on the ground in Ukraine is still feasible.

Other nations, including Germany, are more sceptical, since the US remains opposed to the idea and has not promised the security “backstop” previously sought by European powers for such a force.

Starmer and Macron continue to back the troops proposal, people involved in the negotiations said, so as to keep Europe involved in the ceasefire talks and maintain Ukrainian morale, as well as demonstrating their commitment to Trump.

One European official said the troops plan was “dead”, since it was “preposterous without the help of Trump, and he’s not willing to provide it”.

But a French diplomat countered that the reports of the proposal’s death were “not only greatly exaggerated” but also “totally untrue”, adding that the countries were still working on the plans “at normal pace”.
"""

# testing pipeline
def predict_fake_news(unseen_text):
    unseen_text_cleaned = preprocess_text(unseen_text)
    unseen_vectorized = vectorizer.transform([unseen_text_cleaned])
    prediction = lr.predict(unseen_vectorized)
    return prediction

result = predict_fake_news(unseen_text)

if result == 1:
    print("Prediction: Fake News")
else:
    print("Prediction: Real News")

Prediction: Real News
