In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [2]:
fake_text = pd.read_csv('Fake.csv')
real_text = pd.read_csv('True.csv')

In [3]:
fake_text['label'] = 1
real_text['label'] = 0

In [4]:
df = pd.concat([fake_text, real_text]).reset_index(drop=True)

In [5]:
df

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0


In [5]:
df = df.drop(columns=['title', 'subject', 'date'])

In [7]:
df

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,1
1,House Intelligence Committee Chairman Devin Nu...,1
2,"On Friday, it was revealed that former Milwauk...",1
3,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis used his annual Christmas Day mes...,1
...,...,...
44893,BRUSSELS (Reuters) - NATO allies on Tuesday we...,0
44894,"LONDON (Reuters) - LexisNexis, a provider of l...",0
44895,MINSK (Reuters) - In the shadow of disused Sov...,0
44896,MOSCOW (Reuters) - Vatican Secretary of State ...,0


In [8]:
# run "mkdir -p fnd-venv/nltk_data" in your venv terminal to create the directory for these 3 nltk data files
nltk.data.path.append("fnd-venv/nltk_data")
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):

    # 1. Convert text to lowercase
    text = text.lower()

    # 2. Remove any special characters 
    text = re.sub(r'[^a-z\s]', '', text)

    # 3. Tokenize the text
    tokens = nltk.word_tokenize(text)

    # 4. Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    
    # 5. Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

[nltk_data] Downloading package stopwords to c:\VSCode Codes\Fake-
[nltk_data]     News-Detection-Project\fnd-venv\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to c:\VSCode Codes\Fake-News-
[nltk_data]     Detection-Project\fnd-venv\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to c:\VSCode Codes\Fake-News-
[nltk_data]     Detection-Project\fnd-venv\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to c:\VSCode Codes\Fake-
[nltk_data]     News-Detection-Project\fnd-venv\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [9]:
df['tokens'] = df['text'].apply(preprocess_text)

In [10]:
df

Unnamed: 0,text,label,tokens
0,Donald Trump just couldn t wish all Americans ...,1,"[donald, trump, wish, american, happy, new, ye..."
1,House Intelligence Committee Chairman Devin Nu...,1,"[house, intelligence, committee, chairman, dev..."
2,"On Friday, it was revealed that former Milwauk...",1,"[friday, revealed, former, milwaukee, sheriff,..."
3,"On Christmas day, Donald Trump announced that ...",1,"[christmas, day, donald, trump, announced, wou..."
4,Pope Francis used his annual Christmas Day mes...,1,"[pope, francis, used, annual, christmas, day, ..."
...,...,...,...
44893,BRUSSELS (Reuters) - NATO allies on Tuesday we...,0,"[brussels, reuters, nato, ally, tuesday, welco..."
44894,"LONDON (Reuters) - LexisNexis, a provider of l...",0,"[london, reuters, lexisnexis, provider, legal,..."
44895,MINSK (Reuters) - In the shadow of disused Sov...,0,"[minsk, reuters, shadow, disused, sovietera, f..."
44896,MOSCOW (Reuters) - Vatican Secretary of State ...,0,"[moscow, reuters, vatican, secretary, state, c..."


In [11]:
from collections import Counter

all_tokens = [token for tokens in df['tokens'] for token in tokens]

# Keep only tokens that appear >= 2 times
vocab = {word: i+2 for i, (word, count) in enumerate(Counter(all_tokens).items()) if count >= 2}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

print("Vocab size:", len(vocab))


Vocab size: 111204


In [12]:
def encode(tokens):
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]

df['encoded'] = df['tokens'].apply(encode)


In [13]:
max_index = max([max(seq) for seq in df['encoded'] if len(seq) > 0])
print("Max index used after re-encoding:", max_index)
print("Vocab size:", len(vocab))


Max index used after re-encoding: 203389
Vocab size: 111204


In [14]:
bad_indices = [(i, max(seq)) for i, seq in enumerate(df['encoded']) if len(seq) > 0 and max(seq) > len(vocab)]
print("Number of sequences with out-of-range indices:", len(bad_indices))


Number of sequences with out-of-range indices: 28659


In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Setup nltk
nltk.data.path.append("fnd-venv/nltk_data")
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Preprocessing setup
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocess function that returns tokens
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

# STEP 3: Drop unused columns and preprocess
df = df.drop(columns=['title', 'subject', 'date'])
df['tokens'] = df['text'].apply(preprocess_text)

# STEP 4: Build vocab ONLY from preprocessed tokens
all_tokens = [token for tokens in df['tokens'] for token in tokens]
vocab = {word: i+2 for i, (word, count) in enumerate(Counter(all_tokens).items()) if count >= 2}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

print("Vocab size:", len(vocab))  # should be in sync with actual token indices

# STEP 5: Encode using vocab
def encode(tokens):
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]

df['encoded'] = df['tokens'].apply(encode)

# STEP 6: Check for any out-of-range indices
max_idx = max([max(seq) if seq else 0 for seq in df['encoded']])
print("Max index used after re-encoding:", max_idx)


[nltk_data] Downloading package stopwords to c:\VSCode Codes\Fake-
[nltk_data]     News-Detection-Project\fnd-venv\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to c:\VSCode Codes\Fake-News-
[nltk_data]     Detection-Project\fnd-venv\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to c:\VSCode Codes\Fake-News-
[nltk_data]     Detection-Project\fnd-venv\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Vocab size: 111204
Max index used after re-encoding: 203389


In [7]:
for i in range(3):
    print(f"Encoded example {i}:", df['encoded'].iloc[i])


Encoded example 0: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 4, 31, 32, 13, 14, 33, 15, 16, 17, 18, 6, 34, 7, 8, 35, 36, 37, 38, 39, 8, 40, 25, 26, 27, 28, 29, 30, 4, 31, 32, 13, 14, 33, 15, 16, 17, 18, 6, 34, 7, 8, 39, 8, 40, 2, 41, 3, 42, 43, 3, 44, 45, 1, 1, 48, 35, 49, 7, 8, 50, 51, 52, 53, 54, 55, 3, 56, 57, 33, 58, 59, 60, 61, 62, 4, 5, 63, 6, 7, 8, 64, 65, 66, 67, 43, 23, 51, 68, 69, 43, 70, 71, 72, 39, 8, 40, 73, 74, 75, 76, 77, 78, 79, 80, 43, 81, 82, 83, 84, 85, 86, 87, 86, 88, 89, 1, 43, 91, 92, 14, 7, 8, 4, 93, 93, 43, 94, 6, 7, 8, 1, 1, 1, 43, 3, 7, 8, 98, 44, 6, 7, 8, 99, 84, 13, 100, 101, 102, 103, 104, 2, 41, 3, 42, 43, 105, 7, 3, 106, 107, 108, 13, 14, 7, 8, 109, 110, 111, 1, 113, 114, 115, 43, 3, 116, 44, 117, 1, 61, 119, 120, 121, 35, 122, 1, 1, 43, 125, 51, 126, 127, 8, 128, 129, 130, 1, 1, 43, 133, 134, 91, 135, 14, 136, 1, 43, 138, 8, 139, 103, 140, 43, 85, 141, 142, 143, 71, 144, 145, 146, 147, 14

In [8]:
for token in df['tokens'].iloc[0]:
    if token not in vocab:
        print("Missing token:", token)


Missing token: welll
Missing token: expectwhat
Missing token: alansandoval
Missing token: koren
Missing token: pollitt
Missing token: korencarpenter
Missing token: pictwittercomfpaekypa
Missing token: presidentialhow
Missing token: goodine
Missing token: sgoodine
Missing token: schulze
Missing token: thbthttt
Missing token: wendywhistles
Missing token: olderphoto


In [9]:
words_to_check = [
    "welll", "expectwhat", "alansandoval", "koren", "pollitt", 
    "korencarpenter", "pictwittercomfpaekypa", "presidentialhow",
    "goodine", "sgoodine", "schulze", "thbthttt", "wendywhistles", "olderphoto"
]

for word in words_to_check:
    tokens = tokenizer.tokenize(word)
    print(f"Word: {word} -> Tokens: {tokens}")

NameError: name 'tokenizer' is not defined