Fake or Real News?

Requirements for news to work

ID: unique id for news article,
TITLE of news article and author name,
ARTICLE TEXT (doesn't have to be complete),
LABEL: marks fake or real news

importing dependencies

In [1]:
import numpy as py
import pandas as pd
import re
import warnings
import nltk
import os
import sys
import shutil
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.data import find
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#deepseek suggestions...
from tqdm.auto import tqdm  # Import tqdm for progress bar

In [2]:
from nltk import data

def get_nltk_data_dirs():
    """Return all possible NLTK data directories"""
    dirs = data.path.copy()  # Default NLTK paths
    
    # Add standard possible locations
    possible_dirs = [
        os.path.expanduser('~/nltk_data'),
        os.path.join(sys.prefix, 'nltk_data'),
        os.path.join(sys.prefix, 'share', 'nltk_data'),
        os.path.join(sys.prefix, 'lib', 'nltk_data'),
    ]
    
    # Add Windows-specific locations if on Windows
    if sys.platform == 'win32':
        possible_dirs.extend([
            os.path.join(os.environ.get('APPDATA', ''), 'nltk_data'),
            os.path.join(os.environ.get('LOCALAPPDATA', ''), 'nltk_data'),
            os.path.join(os.environ.get('PROGRAMDATA', ''), 'nltk_data'),
        ])
    
    # Add only existing directories to avoid creating unnecessary paths
    return dirs + [d for d in possible_dirs if os.path.exists(d)]

# Get all NLTK data locations
nltk_dirs = get_nltk_data_dirs()

print("Found NLTK data locations:")
for dir_path in nltk_dirs:
    print(f"- {dir_path}")

# Safely remove them
for dir_path in set(nltk_dirs):
    if os.path.exists(dir_path):
        try:
            shutil.rmtree(dir_path)
            print(f"Successfully removed: {dir_path}")
        except Exception as e:
            print(f"Failed to remove {dir_path}: {str(e)}")

# Download required datasets with progress
required_datasets = [
    'punkt',       # Tokenizer models
    'stopwords',   # Stopwords corpus
    'wordnet',     # WordNet lexical database
    'omw-1.4',     # Open Multilingual WordNet
    'punkt_tab'    # Additional tokenizer data
]

for dataset in required_datasets:
    print(f"\nDownloading {dataset}...")
    try:
        nltk.download(dataset, quiet=False)
    except Exception as e:
        print(f"Error downloading {dataset}: {str(e)}")

# Verify installation
try:
    stopwords_file = os.path.join(nltk.data.find('corpora/stopwords'), 'english')
    print(f"\nVerification: Stopwords exist at {stopwords_file}: {os.path.exists(stopwords_file)}")
    
    from nltk.corpus import stopwords
    print("Sample English stopwords:", stopwords.words('english')[:10])
except Exception as e:
    print(f"\nVerification failed: {str(e)}")

Found NLTK data locations:
- C:\Users\danie/nltk_data
- C:\Users\danie\AppData\Local\Programs\Python\Python313\nltk_data
- C:\Users\danie\AppData\Local\Programs\Python\Python313\share\nltk_data
- C:\Users\danie\AppData\Local\Programs\Python\Python313\lib\nltk_data
- C:\Users\danie\AppData\Roaming\nltk_data
- C:\nltk_data
- D:\nltk_data
- E:\nltk_data
- C:\Users\danie\AppData\Roaming\nltk_data
Successfully removed: C:\Users\danie\AppData\Roaming\nltk_data

Downloading punkt...


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.



Downloading stopwords...

Downloading wordnet...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...



Downloading omw-1.4...


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...



Downloading punkt_tab...


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.



Verification: Stopwords exist at C:\Users\danie\AppData\Roaming\nltk_data\corpora\stopwords\english: True
Sample English stopwords: ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']


In [3]:
print(stopwords.words('english')) #list of english stopwords

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data pre-processing

In [4]:
# loading data set to pandas dataframe
news_dataset = pd.read_csv('newss.csv') #load from inserting into jupyter notebook address

  news_dataset = pd.read_csv('newss.csv') #load from inserting into jupyter notebook address


In [5]:
news_dataset.shape
# changing the amount
columns_to_keep = ['title', 'text', 'label']  # Example columns
news_dataset = news_dataset[columns_to_keep]  # Overwrites DataFrame

In [6]:
# print first 5 rows of dataframe
news_dataset.head() #figure out how to remove columns

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [7]:
# counting number of missing values in dataset
news_dataset.isnull().sum()

title     610
text      866
label    1040
dtype: int64

In [8]:
# replacing null values with empty string
news_dataset = news_dataset.fillna('')

In [9]:
# merging author and news title names
news_dataset['content'] = news_dataset['title'] + ' ' + news_dataset['text']

In [10]:
print(news_dataset['content'])

0       You Can Smell Hillary’s Fear Daniel Greenfield...
1       Watch The Exact Moment Paul Ryan Committed Pol...
2       Kerry to go to Paris in gesture of sympathy U....
3       Bernie supporters on Twitter erupt in anger ag...
4       The Battle of New York: Why This Primary Matte...
                              ...                        
7790    State Department says it can't find emails fro...
7791    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
7792    Anti-Trump Protesters Are Tools of the Oligarc...
7793    In Ethiopia, Obama seeks progress on peace, se...
7794    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: content, Length: 7795, dtype: object


In [11]:
# seperating data & label
X = news_dataset.drop(columns = 'label', axis = 1)
Y = news_dataset['label']

In [12]:
print(X)
print(Y)

                                                  title  \
0                          You Can Smell Hillary’s Fear   
1     Watch The Exact Moment Paul Ryan Committed Pol...   
2           Kerry to go to Paris in gesture of sympathy   
3     Bernie supporters on Twitter erupt in anger ag...   
4      The Battle of New York: Why This Primary Matters   
...                                                 ...   
7790  State Department says it can't find emails fro...   
7791  The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...   
7792  Anti-Trump Protesters Are Tools of the Oligarc...   
7793  In Ethiopia, Obama seeks progress on peace, se...   
7794  Jeb Bush Is Suddenly Attacking Trump. Here's W...   

                                                   text  \
0     Daniel Greenfield, a Shillman Journalism Fello...   
1     Google Pinterest Digg Linkedin Reddit Stumbleu...   
2     U.S. Secretary of State John F. Kerry said Mon...   
3     — Kaydee King (@KaydeeKing) November 9, 2016 T...

Stemming
- process of reducing a word to root word
- examples include
- actor, actress, acting --> act

In [13]:
stemmer = SnowballStemmer('english')
regex = re.compile('[^a-zA-Z]')
stop_words = set(stopwords.words('english'))  # Define stop_words globally

In [14]:

def clean_text(content):
    """Improved text cleaning"""
    if not isinstance(content, str):
        return ""
    content = re.sub(r"[^\w\s'-]", " ", content)
    return re.sub(r"\s+", " ", content).lower().strip()

def stemming(content):
    """Safe stemming function"""
    content = clean_text(content)
    if not content:
        return "[EMPTY]"
    try:
        words = word_tokenize(content)
        stems = [stemmer.stem(word) for word in words if word not in stop_words]
        return ' '.join(stems) if stems else "[EMPTY]"
    except Exception as e:
        print(f"Error in stemming: {e}")
        return "[EMPTY]"

# Test with your original problematic text
test_text = "Running quickly, the hackers avoided detection (2024)."
processed = stemming(test_text)
print(f"\nInput: {test_text}")
print(f"Output: {processed}")


Input: Running quickly, the hackers avoided detection (2024).
Output: run quick hacker avoid detect 2024


In [15]:
# news_dataset['content'] = news_dataset['content'].apply(stemming) not needed anymore due to loop

In [16]:
#removed

In [17]:
# seperating data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [18]:
print(X)
print(Y)

 'Watch The Exact Moment Paul Ryan Committed Political Suicide At A Trump Rally (VIDEO) Google Pinterest Digg Linkedin Reddit Stumbleupon Print Delicious Pocket Tumblr \nThere are two fundamental truths in this world: Paul Ryan desperately wants to be president. And Paul Ryan will never be president. Today proved it. \nIn a particularly staggering example of political cowardice, Paul Ryan re-re-re-reversed course and announced that he was back on the Trump Train after all. This was an aboutface from where he was a few weeks ago. He had previously declared he would not be supporting or defending Trump after a tape was made public in which Trump bragged about assaulting women. Suddenly, Ryan was appearing at a pro-Trump rally and boldly declaring that he already sent in his vote to make him President of the United States. It was a surreal moment. The figurehead of the Republican Party dosed himself in gasoline, got up on a stage on a chilly afternoon in Wisconsin, and lit a match. . @Spe

In [19]:
Y.shape


(7795,)

In [20]:
# converting textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)

In [21]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2178493 stored elements and shape (7795, 67463)>
  Coords	Values
  (0, 1620)	0.015761014164625112
  (0, 2301)	0.020069889648424964
  (0, 2394)	0.01293345848392442
  (0, 2482)	0.022983544857798327
  (0, 2629)	0.01555183513744674
  (0, 2633)	0.04165327232950833
  (0, 2742)	0.05037800840747752
  (0, 2972)	0.021424762365947505
  (0, 3029)	0.019558365575233133
  (0, 3210)	0.057728052408511345
  (0, 3230)	0.0073421951594629
  (0, 3252)	0.03412665329099384
  (0, 3260)	0.015751264453940077
  (0, 3269)	0.014958948179829549
  (0, 3273)	0.03504789791397069
  (0, 3323)	0.011623153056156485
  (0, 3369)	0.014151966456912213
  (0, 3738)	0.01945854776263577
  (0, 3750)	0.033656607386818006
  (0, 3762)	0.018597330305994864
  (0, 3800)	0.029901942290249985
  (0, 3822)	0.014833189608545296
  (0, 3887)	0.02163388179699249
  (0, 3892)	0.014230613790950366
  (0, 4050)	0.017478664866133725
  :	:
  (7794, 64649)	0.020308332801205364
  (7794, 64658)

splitting dataset to test and train data

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state = 3)

Training the model: Logistic Regressor

In [23]:
model = LogisticRegression()

In [24]:
model.fit(X_train, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


Evaluation for accuracy score

In [25]:
# acc score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [26]:
print('Accuracy score of training data: ', training_data_accuracy)

Accuracy score of training data:  0.9065105837075048


In [27]:
# acc score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [28]:
print('Accuracy score of test data: ', test_data_accuracy)

Accuracy score of test data:  0.8576010262989096


Making prediction system

In [29]:
X_new = X_test[0]

prediction = model.predict(X_new)
print(prediction)

if(prediction[0] == 0):
    print('news is real')
else:
    print('news is fake') 

['FAKE']
news is fake


In [30]:
print(Y_test[2]) #modify for confirmation


