In [1]:
import numpy as np
import pandas as pd

true_data = pd.read_csv('data/True.csv')
fake_data = pd.read_csv('data/Fake.csv')

In [2]:
print("True articles")
true_data.head()


True articles


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [16]:
print("Fake articles")
fake_data.head()

Fake articles


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [39]:
true_data.describe()

Unnamed: 0,title,text,subject,date
count,21417,21417,21417,21417
unique,20826,21192,2,716
top,Factbox: Trump fills top jobs for his administ...,(Reuters) - Highlights for U.S. President Dona...,politicsNews,"December 20, 2017"
freq,14,8,11272,182


In [2]:
# Creating a single dataset with labels for true = 1 and false = 0

true_data['label'] = 1
fake_data['label'] = 0

# Cleaning true_data before concatinating
## Removing journal identifier
true_data['text'] = true_data['text'].str.partition('- ')[2]

# Removing duplicates rows of titles where text is empty
# Create a placeholder to check if text is empty
is_text_functionally_empty = (
    true_data['text'].isna() | 
    true_data['text'].astype(str).str.strip().eq('')
)
rows_to_drop = true_data[is_text_functionally_empty].duplicated(subset=['title'], keep='first')
drop_indices = true_data[is_text_functionally_empty][rows_to_drop].index
cleaned_true = true_data.drop(index=drop_indices)
# Doing the same for fake articles
fake_is_text_functionally_empty = (
    fake_data['text'].isna() | 
    fake_data['text'].astype(str).str.strip().eq('')
)
rows_to_drop = fake_data[fake_is_text_functionally_empty].duplicated(subset=['title'], keep='first')
drop_indices = fake_data[fake_is_text_functionally_empty][rows_to_drop].index
cleaned_fake = fake_data.drop(index=drop_indices)

print(f"Removed {len(true_data)-len(cleaned_true)} duplicate title rows from true articles")
print(f"Removed {len(fake_data)-len(cleaned_fake)} duplicate title rows from fake articles")
print("")

# Removing duplicates of text but keeping unique rows of title
has_content = ~(
    cleaned_fake['text'].isna() |
    cleaned_fake['text'].astype(str).str.strip().eq('')
)
rows_to_remove = cleaned_fake[has_content].duplicated(subset=['text'], keep='first')

drop_indices = cleaned_fake[has_content][rows_to_remove].index

new_cleaned_fake = cleaned_fake.drop(index=drop_indices)

# Doing the same for true articles

has_content = ~(
    cleaned_true['text'].isna() |
    cleaned_true['text'].astype(str).str.strip().eq('')
)
rows_to_remove = cleaned_true[has_content].duplicated(subset=['text'], keep='first')

drop_indices = cleaned_true[has_content][rows_to_remove].index

new_cleaned_true = cleaned_true.drop(index=drop_indices)

print(f"Removed {len(cleaned_true)-len(new_cleaned_true)} duplicate text rows from true articles")
print(f"Removed {len(cleaned_fake)-len(new_cleaned_fake)} duplicate text rows from fake articles")
print("______________________________________________________________________")
print(f"Removed {len(true_data)-len(new_cleaned_true)} duplicate rows from true articles in total")
print(f"Removed {len(fake_data)-len(new_cleaned_fake)} duplicate rows from fake articles in total")

# Concatinating dataframes
data = pd.concat([new_cleaned_true, new_cleaned_fake])

Removed 6 duplicate title rows from true articles
Removed 184 duplicate title rows from fake articles

Removed 229 duplicate text rows from true articles
Removed 5398 duplicate text rows from fake articles
______________________________________________________________________
Removed 235 duplicate rows from true articles in total
Removed 5582 duplicate rows from fake articles in total


In [3]:
# Text standardizing
import string


# Creating new columns to preserve original text
data['title_standard'] = data['title']
data['text_standard'] = data['text']

# Removing punctuations including special letter which didn't get picked up by string.punctuation
punctuation_and_special = string.punctuation + '“”‘’' 
punctuation = str.maketrans('', '', punctuation_and_special)

data['title_standard'] = data['title'].astype(str).str.translate(punctuation)
data['text_standard'] = data['text'].astype(str).str.translate(punctuation)

# Lowercasing 
data['title_standard'] = data['title_standard'].astype(str).str.strip().str.lower()
data['text_standard'] = data['text_standard'].astype(str).str.strip().str.lower()


# Naive bayes

In [4]:
# Baseline naive bayes model for article texts
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


# Define Features (X) and Target (y)
X = data['text_standard']  
y = data['label']

# Splitting the data and making sure we get the same number of labels in each set with stratify=y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y
)

# Setting max_df to 0.7 we exclude words that appears in more than 70% of all articles in the training set so we should get more unique words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the training data
X_train_counts = count_vectorizer.fit_transform(X_train)

# Transform the test data
X_test_counts = count_vectorizer.transform(X_test)

# Train Multinomial Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_counts, y_train)

# 5. Predict and Evaluate
y_pred_counts = nb_classifier.predict(X_test_counts)

# Get the feature names to see which words are helping us predict
feature_names = count_vectorizer.get_feature_names_out()

# Getting log coefficients 
log_probs_fake = nb_classifier.feature_log_prob_[0]
log_probs_true = nb_classifier.feature_log_prob_[1]

# Create a DataFrame so we can sort them 
feature_df = pd.DataFrame({
    'feature': feature_names,
    'log_prob_fake': log_probs_fake,
    'log_prob_true': log_probs_true
})

# Calculate the difference between coefficients 
# A larger positive difference means the word is highly associated with fake news
feature_df['fake_score'] = feature_df['log_prob_fake'] - feature_df['log_prob_true']

# 5. Get top 20 for Fake (highest positive scores)
top_fake_features = feature_df.sort_values(by='fake_score', ascending=False).head(20)

# 6. Get top 20 for True (lowest negative scores)
top_true_features = feature_df.sort_values(by='fake_score', ascending=True).head(20)


# Print results
print("--- Naive Bayes Classification Results (Using Raw Word Counts) ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_counts):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_counts))

# Print important classifying words
print("--- Top 20 Words Predicting FAKE News (Label 0) ---")
print(top_fake_features[['feature', 'fake_score']].to_markdown(index=False))

print("\n--- Top 20 Words Predicting TRUE News (Label 1) ---")
print(top_true_features[['feature', 'fake_score']].to_markdown(index=False))

--- Naive Bayes Classification Results (Using Raw Word Counts) ---
Accuracy: 0.9424

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.91      0.94      3580
           1       0.93      0.97      0.95      4237

    accuracy                           0.94      7817
   macro avg       0.94      0.94      0.94      7817
weighted avg       0.94      0.94      0.94      7817

--- Top 20 Words Predicting FAKE News (Label 0) ---
| feature             |   fake_score |
|:--------------------|-------------:|
| 2017realdonaldtrump |      7.0647  |
| 21wire              |      6.35848 |
| belowfeatured       |      6.28281 |
| getty               |      6.2585  |
| 2017the             |      6.08819 |
| flickr              |      6.06944 |
| 21wiretv            |      5.91799 |
| 2016realdonaldtrump |      5.82926 |
| somodevillagetty    |      5.74693 |
| screenshot          |      5.74319 |
| acr                 |      5.70112 |
| cda

In [5]:
# We do the same for the article titles

X = data['title_standard']
y = data['label']


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
count_vectorizer = CountVectorizer(stop_words='english')


X_train_counts = count_vectorizer.fit_transform(X_train)


X_test_counts = count_vectorizer.transform(X_test)


nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_counts, y_train)


y_pred_counts = nb_classifier.predict(X_test_counts)

feature_names = count_vectorizer.get_feature_names_out()

log_probs_fake = nb_classifier.feature_log_prob_[0]
log_probs_true = nb_classifier.feature_log_prob_[1]


feature_df = pd.DataFrame({
    'feature': feature_names,
    'log_prob_fake': log_probs_fake,
    'log_prob_true': log_probs_true
})

feature_df['fake_score'] = feature_df['log_prob_fake'] - feature_df['log_prob_true']
top_fake_features = feature_df.sort_values(by='fake_score', ascending=False).head(20)
top_true_features = feature_df.sort_values(by='fake_score', ascending=True).head(20)

print("--- Naive Bayes Classification Results (Using Raw Word Counts) ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_counts):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_counts))
print("--- Top 20 Words Predicting FAKE News (Label 0) ---")
print(top_fake_features[['feature', 'fake_score']].to_markdown(index=False))
print("\n--- Top 20 Words Predicting TRUE News (Label 1) ---")
print(top_true_features[['feature', 'fake_score']].to_markdown(index=False))

--- Naive Bayes Classification Results (Using Raw Word Counts) ---
Accuracy: 0.9364

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93      3580
           1       0.95      0.93      0.94      4237

    accuracy                           0.94      7817
   macro avg       0.94      0.94      0.94      7817
weighted avg       0.94      0.94      0.94      7817

--- Top 20 Words Predicting FAKE News (Label 0) ---
| feature     |   fake_score |
|:------------|-------------:|
| hillarys    |      5.25011 |
| wow         |      5.09675 |
| video       |      5.09181 |
| heres       |      5.09121 |
| hilarious   |      4.89547 |
| busted      |      4.62564 |
| bombshell   |      4.51338 |
| hilariously |      4.49337 |
| epic        |      4.47297 |
| sarah       |      4.45213 |
| gop         |      4.44685 |
| supporter   |      4.43085 |
| fck         |      4.32907 |
| brilliant   |      4.32907 |
| awesome     | 

In [4]:
# Added cleaning step to handle noise in fake articles
data['text_cleaned'] = data['text_standard']
# Remove common URL patterns and link shorteners
data['text_cleaned'] = data['text_cleaned'].str.replace(r'http[s]?://\S+|www\.\S+|\S+\.(com|org|net|co|ly)|pictwittercom|httpstco|bitly', '', regex=True)
# Remove photo/site credit words (getty, flickr, wikimedia, etc.)
credit_patterns = r'getty|flickr|wikimedia|belowfeatured|somodevillagetty|mcnameegetty|angerergetty|wiretv|acr|cdata|filessupport'
data['text_cleaned'] = data['text_cleaned'].str.replace(credit_patterns, '', regex=True)
# Remove common code snippets and internal tags
code_patterns = r'var|js|dgetelementsbytagnames|dcreateelements|dgetelementbyidid|jssrc|jsid|wfb|featured|screenshot|raedle|gage|donnell|whinedr|src|xfbml|parentnodeinsertbefore|versionv|screengrab|subscribing|nyp'
data['text_cleaned'] = data['text_cleaned'].str.replace(code_patterns, ' ', regex=True)
# Find all words that consist only of letters (a-z) and more than 2 characters long to get rid of fx 21Wire
data['text_cleaned'] = data['text_cleaned'].str.findall(r'[a-z]{2,}')
# Join the tokenized words into single string again
data['text_cleaned'] = data['text_cleaned'].str.join(' ')



# Naive bayes v2

In [11]:

# Define Features (X) and Target (y)
X = data['text_cleaned']  
y = data['label']

# Splitting the data and making sure we get the same number of labels in each set with stratify=y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y
)

# Setting max_df to 0.7 we exclude words that appears in more than 70% of all articles in the training set so we should get more unique words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the training data
X_train_counts = count_vectorizer.fit_transform(X_train)

# Transform the test data
X_test_counts = count_vectorizer.transform(X_test)

# Train Multinomial Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_counts, y_train)

# 5. Predict and Evaluate
y_pred_counts = nb_classifier.predict(X_test_counts)

# Get the feature names to see which words are helping us predict
feature_names = count_vectorizer.get_feature_names_out()

# Getting log coefficients 
log_probs_fake = nb_classifier.feature_log_prob_[0]
log_probs_true = nb_classifier.feature_log_prob_[1]

# Create a DataFrame so we can sort them 
feature_df = pd.DataFrame({
    'feature': feature_names,
    'log_prob_fake': log_probs_fake,
    'log_prob_true': log_probs_true
})

# Calculate the difference between coefficients 
# A larger positive difference means the word is highly associated with fake news
feature_df['fake_score'] = feature_df['log_prob_fake'] - feature_df['log_prob_true']

# 5. Get top 20 for Fake (highest positive scores)
top_fake_features = feature_df.sort_values(by='fake_score', ascending=False).head(20)

# 6. Get top 20 for True (lowest negative scores)
top_true_features = feature_df.sort_values(by='fake_score', ascending=True).head(20)


# Print results
print("--- Naive Bayes Classification Results (Using Raw Word Counts) ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_counts):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_counts))

# Print important classifying words
print("--- Top 20 Words Predicting FAKE News (Label 0) ---")
print(top_fake_features[['feature', 'fake_score']].to_markdown(index=False))

print("\n--- Top 20 Words Predicting TRUE News (Label 1) ---")
print(top_true_features[['feature', 'fake_score']].to_markdown(index=False))

--- Naive Bayes Classification Results (Using Raw Word Counts) ---
Accuracy: 0.9418

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.91      0.94      3580
           1       0.93      0.96      0.95      4237

    accuracy                           0.94      7817
   macro avg       0.94      0.94      0.94      7817
weighted avg       0.94      0.94      0.94      7817

--- Top 20 Words Predicting FAKE News (Label 0) ---
| feature        |   fake_score |
|:---------------|-------------:|
| reilly         |      5.38192 |
| finicum        |      5.36565 |
| fcking         |      5.23757 |
| henningsen     |      5.18661 |
| whined         |      5.09064 |
| bundy          |      5.05214 |
| hammonds       |      4.81013 |
| behar          |      4.81013 |
| fck            |      4.7909  |
| shit           |      4.7844  |
| somodevilla    |      4.77129 |
| watters        |      4.75129 |
| elizabethforma |      4.74114 |
| 

In [5]:
# CREATING EMBEDDINGS
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from sentence_transformers import SentenceTransformer

# Initialize the encoder
## all-miniLM-L6-v2 truncates sentences longer than 256 words, meaning that only the first 256 words of the sentences are embedded
sent_encoder = SentenceTransformer('all-MiniLM-L6-v2')

# Extract columns and convert to lists (using existing column names)
title_list = data['title'].tolist()
# Comment out text since the text in the articles can be very long, and maybe the titles are all we need.
text_list = data['text'].tolist()

# Generate embeddings

title_embeddings = sent_encoder.encode(title_list, show_progress_bar=True)
text_embeddings = sent_encoder.encode(text_list, show_progress_bar=True)

# Correctly store the 2D arrays back into the DataFrame
data['title_embedding'] = pd.Series(list(title_embeddings), index=data.index)
data['text_embedding'] = pd.Series(list(text_embeddings), index=data.index)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 1222/1222 [00:24<00:00, 50.82it/s]
Batches: 100%|██████████| 1222/1222 [03:01<00:00,  6.75it/s]


In [6]:
data['title_embedding'].iloc[0].shape

(384,)

In [7]:
# Dimensionality reduction for better to handle large matrices
import umap

# Use the full title_embeddings matrix (UMAP can handle it better than t-SNE)
reducer = umap.UMAP(n_components=3, random_state=20)
X_umap = reducer.fit_transform(title_embeddings)
Y_umap = reducer.fit_transform(text_embeddings)

  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [8]:
# Adding dimensionality reduced embeddings to list
data['title_dimreduced_embedding'] = pd.Series(list(X_umap), index=data.index)
data['text_dimreduced_embedding'] = pd.Series(list(Y_umap), index=data.index)

# Classifying with cosine

In [8]:
from sklearn.model_selection import train_test_split

X = np.stack(data['title_dimreduced_embedding'].values)
y = data['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [9]:
df_train = pd.DataFrame({'title_dimreduced_embedding': list(X_train), 'label': y_train})

title_mean = df_train.groupby('label')['title_dimreduced_embedding'].apply(
    lambda x: np.mean(np.stack(x.values), axis=0)
)

title_mean_true = title_mean[1]
title_mean_fake = title_mean[0]

In [10]:
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics import classification_report, accuracy_score


mean_matrix = np.stack([title_mean_fake, title_mean_true])


distance_matrix = cosine_distances(X_test, mean_matrix)


y_pred = np.argmin(distance_matrix, axis=1)


accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Fake (0)', 'True (1)'])

print("--- CLASSIFICATION RESULTS (Nearest Centroid) ---")
print(f"Accuracy on UNSEEN Test Set: {accuracy:.4f}")
print("\nClassification Report:")
print(report)

--- CLASSIFICATION RESULTS (Nearest Centroid) ---
Accuracy on UNSEEN Test Set: 0.7595

Classification Report:
              precision    recall  f1-score   support

    Fake (0)       0.69      0.85      0.76      3580
    True (1)       0.84      0.68      0.76      4237

    accuracy                           0.76      7817
   macro avg       0.77      0.77      0.76      7817
weighted avg       0.77      0.76      0.76      7817



In [10]:
from transformers import pipeline
from tqdm.auto import tqdm
tqdm.pandas()

topic_classifier = pipeline("text-classification", model="classla/multilingual-IPTC-news-topic-classifier", device=0, max_length=512, truncation=True)


print(f"Applying topic classifier to {len(data['text_cleaned'])} articles. This might take a while...")
data['topic_predictions'] = data['text_cleaned'].progress_apply(lambda x: topic_classifier(x)[0] if pd.notna(x) and x != '' else None)

Device set to use mps:0


Applying topic classifier to 39081 articles. This might take a while...


100%|██████████| 39081/39081 [1:14:19<00:00,  8.76it/s]


In [29]:
testdata = data[0:10]

# Apply the split, which returns a list of strings for each row
testdata['placeholder'] = testdata['text'].str.split(".")


# Clean the resulting list of strings:
def clean_sentences(sentence_list):
    """Strips whitespace and filters out empty strings from the list."""
    
    # 1. Strip leading/trailing whitespace from every element
    cleaned_list = [s.strip() for s in sentence_list]
    
    # 2. Filter out elements that are empty ('' or just whitespace) 
    # and ensure they have a minimum length (e.g., > 1 character)
    final_sentences = [s for s in cleaned_list if len(s) > 1]
    
    return final_sentences

# Apply the cleaning function to the new column
testdata['split_text'] = testdata['placeholder'].apply(clean_sentences)



# sent_encoder = SentenceTransformer('distilbert-base-uncased')
# _embeddings = sent_encoder.encode(paragraphs, show_progress_bar=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdata['placeholder'] = testdata['text'].str.split(".")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdata['split_text'] = testdata['placeholder'].apply(clean_sentences)


In [11]:
# Text standardizing for tokens
import string

# Removing punctuations including special letter which didn't get picked up by string.punctuation
punctuation_and_special = string.punctuation + '“”‘’' 
punctuation = str.maketrans('', '', punctuation_and_special)

testdata['split_text'] = testdata['text'].astype(str).str.translate(punctuation)

# Lowercasing 
testdata['split_text'] = testdata['split_text'].astype(str).str.strip().str.lower()


# Remove common URL patterns and link shorteners
testdata['split_text'] = testdata['split_text'].str.replace(r'http[s]?://\S+|www\.\S+|\S+\.(com|org|net|co|ly)|pictwittercom|httpstco|bitly', '', regex=True)
# Remove photo/site credit words (getty, flickr, wikimedia, etc.)
credit_patterns = r'getty|flickr|wikimedia|belowfeatured|somodevillagetty|mcnameegetty|angerergetty|wiretv|acr|cdata|filessupport'
testdata['split_text'] = testdata['split_text'].str.replace(credit_patterns, '', regex=True)
# Remove common code snippets and internal tags
code_patterns = r'var|js|dgetelementsbytagnames|dcreateelements|dgetelementbyidid|jssrc|jsid|wfb|featured|screenshot|raedle|gage|donnell|whinedr|src|xfbml|parentnodeinsertbefore|versionv|screengrab|subscribing|nyp'
testdata['split_text'] = testdata['split_text'].str.replace(code_patterns, ' ', regex=True)
# Find all words that consist only of letters (a-z) and more than 2 characters long to get rid of fx 21Wire
testdata['split_text'] = testdata['split_text'].str.findall(r'[a-z]{2,}')
# Join the tokenized words into single string again
testdata['split_text'] = testdata['split_text'].str.join(' ')




NameError: name 'testdata' is not defined

In [12]:
import pandas as pd
import numpy as np
import string
import re
from collections.abc import Iterable
# A. Initial Split (We use the original 'text' column)
data['sentence_list'] = data['text'].str.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', expand=False)
# NOTE: Using a more sophisticated regex split above to better handle titles/abbreviations like 'U.S.'

def clean_and_filter_sentences(sentence_list):
    """Strips whitespace and filters out empty/short strings."""
    try:
        if pd.isna(sentence_list).all():
            return []
    except AttributeError:
        # Fallback for single-element NaNs (which don't have a .all() method)
        if pd.isna(sentence_list):
            return []
    
    # --------------------------------------------------------------------------
    
    # 1. Force array-like objects into a Python list
    if isinstance(sentence_list, (np.ndarray, pd.Series)):
        sentence_list = sentence_list.tolist()
        
    # 2. Handle non-iterable inputs (should only be a single string if it passed NaN check)
    if not isinstance(sentence_list, Iterable) or isinstance(sentence_list, str):
        # Wrap single non-list items into a list (coerces to string)
        sentence_list = [str(sentence_list)]
    # 1. Strip leading/trailing whitespace
    cleaned_list = [s.strip() for s in sentence_list]
    
    # 2. Filter out elements that are empty or too short (e.g., just 'U.')
    final_sentences = [s for s in cleaned_list if len(s) > 5]
    
    return final_sentences

# Apply the cleaning and filtering
data['sentences'] = data['sentence_list'].apply(clean_and_filter_sentences)


# --- STAGE 2: IN-SENTENCE CLEANING (Apply rules to the list content) ---

# Define all cleaning patterns
punctuation_to_remove = string.punctuation + '“”‘’' 
credit_patterns = r'getty|flickr|wikimedia|belowfeatured|somodevillagetty|mcnameegetty|angerergetty|wiretv|acr|cdata|filessupport'
code_patterns = r'var|js|dgetelementsbytagnames|dcreateelements|dgetelementbyidid|jssrc|jsid|wfb|featured|screenshot|raedle|gage|donnell|whinedr|src|xfbml|parentnodeinsertbefore|versionv|screengrab|subscribing|nyp'
url_patterns = r'http[s]?://\S+|www\.\S+|\S+\.(com|org|net|co|ly)|pictwittercom|httpstco|bitly'

def apply_text_cleaning(sentence_list):
    """Applies all cleaning rules to every string inside the list."""
    cleaned_sentences = []
    
    for sentence in sentence_list:
        text = str(sentence)
        
        # Lowercasing and strip
        text = text.strip().lower()

        # Remove URLs and links
        text = re.sub(url_patterns, '', text)
        
        # Remove credit/code patterns
        text = re.sub(credit_patterns, '', text)
        text = re.sub(code_patterns, ' ', text)
        
        # Remove punctuation (This should happen LAST)
        text = text.translate(str.maketrans('', '', punctuation_to_remove))

        # IMPORTANT: We DO NOT use .findall(r'[a-z]{2,}') and .join(' ') here.
        # The Sentence Transformer needs the sentences as full strings.
        
        cleaned_sentences.append(text)
        
    return cleaned_sentences

# Apply the complex cleaning function
data['text_sentence_tokens'] = data['sentences'].apply(apply_text_cleaning)


In [14]:
# Cleaning up dataframe 
data = data.drop(columns=['subject', 'sentences', 'date', 'sentence_list'])


In [15]:
import numpy as np
from itertools import chain
from sentence_transformers import SentenceTransformer

# 1. Flatten the list of lists in the 'cleaned_sentences' column 
# into one single list of all sentences.
all_sentences = list(chain.from_iterable(data['text_sentence_tokens'].dropna()))

print(f"Total number of individual sentences to embed: {len(all_sentences)}")

Total number of individual sentences to embed: 588610


In [17]:
# Initialize the Sentence Transformer model
sent_encoder = SentenceTransformer('distilbert-base-uncased')

# Run the encoding on the flattened list
print("Starting embedding process...")
X_embeddings = sent_encoder.encode(
    all_sentences, 
    show_progress_bar=True,
    # Use convert_to_tensor=True if you need PyTorch tensors later, 
    # but NumPy is fine for initial processing:
    convert_to_tensor=False 
)

print(f"Embedding complete. Final embedding shape: {X_embeddings.shape}")

No sentence-transformers model found with name distilbert-base-uncased. Creating a new one with mean pooling.


Starting embedding process...


Batches: 100%|██████████| 18395/18395 [15:05<00:00, 20.31it/s]


Embedding complete. Final embedding shape: (588610, 768)


In [18]:
import numpy as np
import pandas as pd

# The dimension of your embedding vectors (e.g., 768 for DistilBERT)
embedding_dimension = X_embeddings.shape[1] 

mean_embeddings = []
current_index = 0

print("Starting aggregation of sentence embeddings...")

# Iterate through each row in your DataFrame
for index, sentence_list in data['text_sentence_tokens'].items():
    num_sentences = len(sentence_list)
    
    if num_sentences == 0:
        # Handle rows where cleaning resulted in zero valid sentences
        article_embedding = np.zeros(embedding_dimension)
    else:
        # 1. Slice: Extract the sentences belonging to the current article
        article_sentence_embeddings = X_embeddings[current_index : current_index + num_sentences]
        
        # 2. Mean: Calculate the average vector across all sentences (axis=0 averages down the rows)
        article_embedding = np.mean(article_sentence_embeddings, axis=0)
        
        # 3. Advance: Move the pointer to the start of the next article's embeddings
        current_index += num_sentences
        
    mean_embeddings.append(article_embedding)

print("Aggregation complete.")
# Verification: The total number of new embeddings should match the number of rows in your DataFrame
print(f"Total number of aggregated article embeddings created: {len(mean_embeddings)}")

Starting aggregation of sentence embeddings...
Aggregation complete.
Total number of aggregated article embeddings created: 39081


In [19]:
# Convert the list of mean embeddings into a NumPy array
X_features = np.stack(mean_embeddings)

# Store the final aggregated feature vector in the DataFrame
# Storing as a list of NumPy arrays is often best for Pandas
data['aggregated_text_embedding'] = list(X_features)

print(f"Final feature matrix shape for classification/clustering: {X_features.shape}")

Final feature matrix shape for classification/clustering: (39081, 768)


In [20]:
# Saving once more so i can easily load without having to run everything
# DONT USE CSV it ruins everything


file_path = 'data.pkl'

# 3. Save the DataFrame using pickle
data.to_pickle(file_path)

In [18]:
# Use a pipeline as a high-level helper
from transformers import pipeline

text_block = 'dennis michael lynch has made some shocking and eyeopening movies that address our failure as a nation to secure our borders  they come to america iii the cost of obama s legacy is his most compelling and shocking movie to date there are some terrifying moments when dennis goes to and exposes muslim training camps in america he travels to both our southern and northern us borders to show just how easily they can be crossed by anyone who cares to cross over into the united states his interviews with an undercover us border agent is shocking and exposes the coverups and lies we are being fed by our government about our border crisiswatch this shocking trailer as it shows how vulnerable our nation is to terror attacks from the norththey come to america iii is frightening and should be showing in movie theaters across america if every americans saw this movie they would be screaming from their rooftops and demanding that congress get our borders under controlif you buy one they come to america iii dvd at 1999 he will give you 2 free dvd s to share for a total of 3 moviesorder now while this special offer is still availablethey come to america iii the cost of obama s legacy price 1999 shipping 395 order comes with two additional copies for sharing 3 total in orderyou can pay by credit card online banking or by using pay pal simply by clicking the buy now button below    order your movies now to get your dvd s in time for christmas and hanukah gifts you ll want to share this movie with all of your relatives and friendshere is a very interesting interview with dennis michael lynch on fox and friends as they discuss how dennis exposes how easily muslim terrorists can make their way into our nation in his  they come to america iii  movie.'
pipe = pipeline("summarization", model="Falconsai/topic_change_point")
res1 = pipe(text_block, max_length=1024, min_length=512, do_sample=False)
print(res1)


Device set to use mps:0
Your max_length is set to 1024, but your input_length is only 408. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=204)


[{'summary_text': "Topics: 1) Steven Stevenson 2) Movies 3) Ubama's Legacy 4) Muslim Training Camps 5) Borders 6) Undercover Us Border Agent 7) Coverups and Lünes 8) Border Crisiswatch 9) Vulnerability to Terror Attacks from the Nord 10) Movie Theaters across America 11) Screaming from Their Roofs 12) Demand for Border Control 13) Credit Card Online Banking 14) Pay Pal 15) Buy Now Movies 16) Christmas and Hanukah Gifts 17) Interview with Michael Llynch 18) fox and Friends 19) Muslim Terrorists 20) Border Patrolmanship Topic changes and Locations: Sentence 1: Stevenson to Movies SentENCE 2: Movies to Ubami's Promise Sentential Cost to Muslim Education Camps Sentestence 3: Muslim Learning Camps to Borders to Undercover U. Surgeoning Borders SenTence 4: Undercover Out Out of U.S. to Border Patrol Agent Sentiment to Coverup Screams from The Dwarf to Border Critic Watch Sentent to Vegetableness to Torture Attacks From the Nord Senttence 5: Border Crisis Watch to Movie TheaterScreaming of Th

In [11]:
# Computing mean of title, text and dimensionality reduced embeddings.

mean_title = []
for i in X_train['title_dimreduced_embedding']:
    m = np.mean(i, axis=0)
    mean_title.append(m)

mean_text = []
for i in X_train['text_dimreduced_embedding']:
    m = np.mean(i, axis=0)
    mean_text.append(m)

In [12]:
# adding back to original dataframe
# data['mean_title_embedding'] = pd.Series(list(mean_title), index=data.index)
# data['mean_text_embedding'] = pd.Series(list(mean_text), index=data.index)

In [17]:


class_means_title = data.groupby('label')['mean_title_embedding'].apply(lambda x: np.mean(np.stack(x.values), axis=0))
class_means_text = data.groupby('label')['mean_text_embedding'].apply(lambda x: np.mean(np.stack(x.values), axis=0))


mean_true_title = class_means_title[1]
mean_fake_title = class_means_title[0]

mean_true_text = class_means_text[1]
mean_fake_text = class_means_text[0]

In [19]:
print(mean_true_text)
print(mean_fake_text)

6.25369
6.559492


In [20]:
print(mean_true_title)
print(mean_fake_title)

2.805622
2.3256469


In [5]:
# Creating new columns to contain tokenized words 
data['title_words'] = data['title_standard'].str.split()
data['text_words'] = data['text_standard'].str.split()



In [6]:
# Creating sentence tokens
# Removing stopwords from token columns

#nltk.download('stopwords')
from nltk.corpus import stopwords

stopwords = stopwords.words('english')


# Function to remove stopwords
def remove_stopwords(tokenized_list):
    text_without_stopwords = [word for word in tokenized_list if word not in stopwords]
    return text_without_stopwords


data['title_token'] = data['title_words'].apply(lambda x: remove_stopwords(x))
data['text_token'] = data['text_words'].apply(lambda x: remove_stopwords(x))

