# Loading the data

In [1]:
import pandas as pd

true_data = pd.read_csv('data/True.csv')
fake_data = pd.read_csv('data/Fake.csv')



## Creating a single, clean dataset with labels for true = 1 and fake = 0

In [2]:
# Creating a single dataset with labels for true = 1 and false = 0

true_data['label'] = 1
fake_data['label'] = 0

# Cleaning true_data before concatinating
## Removing journal identifier
true_data['text'] = true_data['text'].str.partition('- ')[2]

# Removing duplicates rows of titles where text is empty
# Create a boolean placeholder to check if text is empty
is_text_functionally_empty = (
    true_data['text'].isna() | 
    true_data['text'].astype(str).str.strip().eq('')
)
# Drop rows if true and keep first instance of title if there are duplicates
rows_to_drop = true_data[is_text_functionally_empty].duplicated(subset=['title'], keep='first')
drop_indices = true_data[is_text_functionally_empty][rows_to_drop].index
cleaned_true = true_data.drop(index=drop_indices)
# Doing the same for fake articles
fake_is_text_functionally_empty = (
    fake_data['text'].isna() | 
    fake_data['text'].astype(str).str.strip().eq('')
)
rows_to_drop = fake_data[fake_is_text_functionally_empty].duplicated(subset=['title'], keep='first')
drop_indices = fake_data[fake_is_text_functionally_empty][rows_to_drop].index
cleaned_fake = fake_data.drop(index=drop_indices)

print(f"Removed {len(true_data)-len(cleaned_true)} duplicate title rows from true articles")
print(f"Removed {len(fake_data)-len(cleaned_fake)} duplicate title rows from fake articles")
print("")

# Removing duplicates of text but keeping unique rows of title
has_content = ~(
    cleaned_fake['text'].isna() |
    cleaned_fake['text'].astype(str).str.strip().eq('')
)
rows_to_remove = cleaned_fake[has_content].duplicated(subset=['text'], keep='first')

drop_indices = cleaned_fake[has_content][rows_to_remove].index

new_cleaned_fake = cleaned_fake.drop(index=drop_indices)

# Doing the same for true articles

has_content = ~(
    cleaned_true['text'].isna() |
    cleaned_true['text'].astype(str).str.strip().eq('')
)
rows_to_remove = cleaned_true[has_content].duplicated(subset=['text'], keep='first')

drop_indices = cleaned_true[has_content][rows_to_remove].index

new_cleaned_true = cleaned_true.drop(index=drop_indices)

print(f"Removed {len(cleaned_true)-len(new_cleaned_true)} duplicate text rows from true articles")
print(f"Removed {len(cleaned_fake)-len(new_cleaned_fake)} duplicate text rows from fake articles")
print("______________________________________________________________________")
print(f"Removed {len(true_data)-len(new_cleaned_true)} duplicate rows from true articles in total")
print(f"Removed {len(fake_data)-len(new_cleaned_fake)} duplicate rows from fake articles in total")

# Concatinating dataframes
data = pd.concat([new_cleaned_true, new_cleaned_fake])
data = data.reset_index(drop=True, inplace=False)

Removed 6 duplicate title rows from true articles
Removed 184 duplicate title rows from fake articles

Removed 229 duplicate text rows from true articles
Removed 5398 duplicate text rows from fake articles
______________________________________________________________________
Removed 235 duplicate rows from true articles in total
Removed 5582 duplicate rows from fake articles in total


# Text standardizing

In [3]:
# Text standardizing
import string

# Creating new columns to preserve original text
data['title_standard'] = data['title']
data['text_standard'] = data['text']

# Removing punctuations including special letter which didn't get picked up by string.punctuation
punctuation_and_special = string.punctuation + '“”‘’' 
punctuation = str.maketrans('', '', punctuation_and_special)

data['title_standard'] = data['title'].astype(str).str.translate(punctuation)
data['text_standard'] = data['text'].astype(str).str.translate(punctuation)

# Lowercasing 
data['title_standard'] = data['title_standard'].astype(str).str.strip().str.lower()
data['text_standard'] = data['text_standard'].astype(str).str.strip().str.lower()


# Aiming for a baseline with naive bayes

In [4]:
# Baseline naive bayes model for article texts
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


# Define Features (X) and Target (y)
X = data['text_standard']  
y = data['label']

# Splitting the data and making sure we get the same number of labels in each set with stratify=y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y
)

# Setting max_df to 0.7 we exclude words that appears in more than 70% of all articles in the training set so we should get more unique words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the training data
X_train_counts = count_vectorizer.fit_transform(X_train)

# Transform the test data
X_test_counts = count_vectorizer.transform(X_test)

# Train Multinomial Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_counts, y_train)

# Predict and Evaluate
y_pred_counts = nb_classifier.predict(X_test_counts)

# Get the feature names to see which words are helping us predict
feature_names = count_vectorizer.get_feature_names_out()

# Getting log coefficients 
log_probs_fake = nb_classifier.feature_log_prob_[0]
log_probs_true = nb_classifier.feature_log_prob_[1]

# Create a DataFrame so we can sort them 
feature_df = pd.DataFrame({
    'feature': feature_names,
    'log_prob_fake': log_probs_fake,
    'log_prob_true': log_probs_true
})

# Calculate the difference between coefficients 
# A larger positive difference means the word is highly associated with fake news
feature_df['fake_score'] = feature_df['log_prob_fake'] - feature_df['log_prob_true']

# Get top 20 for Fake (highest positive scores)
top_fake_features = feature_df.sort_values(by='fake_score', ascending=False).head(20)

# Get top 20 for True (lowest negative scores)
top_true_features = feature_df.sort_values(by='fake_score', ascending=True).head(20)


# Print results
print("--- Naive Bayes Classification Results")
print(f"Accuracy: {accuracy_score(y_test, y_pred_counts):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_counts))

# Print important classifying words
print("--- Top 20 Words Predicting FAKE News (Label 0) ---")
print(top_fake_features[['feature', 'fake_score']].to_markdown(index=False))

print("\n--- Top 20 Words Predicting TRUE News (Label 1) ---")
print(top_true_features[['feature', 'fake_score']].to_markdown(index=False))

--- Naive Bayes Classification Results (Using Raw Word Counts) ---
Accuracy: 0.9424

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.91      0.94      3580
           1       0.93      0.97      0.95      4237

    accuracy                           0.94      7817
   macro avg       0.94      0.94      0.94      7817
weighted avg       0.94      0.94      0.94      7817

--- Top 20 Words Predicting FAKE News (Label 0) ---
| feature             |   fake_score |
|:--------------------|-------------:|
| 2017realdonaldtrump |      7.0647  |
| 21wire              |      6.35848 |
| belowfeatured       |      6.28281 |
| getty               |      6.2585  |
| 2017the             |      6.08819 |
| flickr              |      6.06944 |
| 21wiretv            |      5.91799 |
| 2016realdonaldtrump |      5.82926 |
| somodevillagetty    |      5.74693 |
| screenshot          |      5.74319 |
| acr                 |      5.70112 |
| cda

## Do the same for article titles

In [5]:
# We do the same for the article titles

X = data['title_standard']
y = data['label']


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
count_vectorizer = CountVectorizer(stop_words='english')


X_train_counts = count_vectorizer.fit_transform(X_train)


X_test_counts = count_vectorizer.transform(X_test)


nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_counts, y_train)


y_pred_counts = nb_classifier.predict(X_test_counts)

feature_names = count_vectorizer.get_feature_names_out()

log_probs_fake = nb_classifier.feature_log_prob_[0]
log_probs_true = nb_classifier.feature_log_prob_[1]


feature_df = pd.DataFrame({
    'feature': feature_names,
    'log_prob_fake': log_probs_fake,
    'log_prob_true': log_probs_true
})

feature_df['fake_score'] = feature_df['log_prob_fake'] - feature_df['log_prob_true']
top_fake_features = feature_df.sort_values(by='fake_score', ascending=False).head(20)
top_true_features = feature_df.sort_values(by='fake_score', ascending=True).head(20)

print("--- Naive Bayes Classification Results (Using Raw Word Counts) ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_counts):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_counts))
print("--- Top 20 Words Predicting FAKE News (Label 0) ---")
print(top_fake_features[['feature', 'fake_score']].to_markdown(index=False))
print("\n--- Top 20 Words Predicting TRUE News (Label 1) ---")
print(top_true_features[['feature', 'fake_score']].to_markdown(index=False))

--- Naive Bayes Classification Results (Using Raw Word Counts) ---
Accuracy: 0.9364

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93      3580
           1       0.95      0.93      0.94      4237

    accuracy                           0.94      7817
   macro avg       0.94      0.94      0.94      7817
weighted avg       0.94      0.94      0.94      7817

--- Top 20 Words Predicting FAKE News (Label 0) ---
| feature     |   fake_score |
|:------------|-------------:|
| hillarys    |      5.25011 |
| wow         |      5.09675 |
| video       |      5.09181 |
| heres       |      5.09121 |
| hilarious   |      4.89547 |
| busted      |      4.62564 |
| bombshell   |      4.51338 |
| hilariously |      4.49337 |
| epic        |      4.47297 |
| sarah       |      4.45213 |
| gop         |      4.44685 |
| supporter   |      4.43085 |
| fck         |      4.32907 |
| brilliant   |      4.32907 |
| awesome     | 

### Added cleaning step to handle noise observed from naive bayes model

In [8]:
# Added cleaning step to handle noise in fake articles
data['text_cleaned'] = data['text_standard']
# Remove common URL patterns and link shorteners
data['text_cleaned'] = data['text_cleaned'].str.replace(r'http[s]?://\S+|www\.\S+|\S+\.(com|org|net|co|ly)|pictwittercom|httpstco|bitly', '', regex=True)
# Remove photo/site credit words (getty, flickr, wikimedia, etc.)
credit_patterns = r'getty|flickr|wikimedia|belowfeatured|somodevillagetty|mcnameegetty|angerergetty|wiretv|acr|cdata|filessupport'
data['text_cleaned'] = data['text_cleaned'].str.replace(credit_patterns, '', regex=True)
# Remove common code snippets and internal tags
code_patterns = r'var|js|dgetelementsbytagnames|dcreateelements|dgetelementbyidid|jssrc|jsid|wfb|featured|screenshot|raedle|gage|donnell|whinedr|src|xfbml|parentnodeinsertbefore|versionv|screengrab|subscribing|nyp'
data['text_cleaned'] = data['text_cleaned'].str.replace(code_patterns, ' ', regex=True)
# Find all words that consist only of letters (a-z) and more than 2 characters long to get rid of fx 21Wire
data['text_cleaned'] = data['text_cleaned'].str.findall(r'[a-z]{2,}')
# Join the tokenized words into single string again
data['text_cleaned'] = data['text_cleaned'].str.join(' ')

## Re-run of bayes on newly cleaned article text since text had higher accuracy

In [10]:

# Define Features (X) and Target (y)
X = data['text_cleaned']  
y = data['label']

# Splitting the data and making sure we get the same number of labels in each set with stratify=y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y
)

# Setting max_df to 0.7 we exclude words that appears in more than 70% of all articles in the training set so we should get more unique words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the training data
X_train_counts = count_vectorizer.fit_transform(X_train)

# Transform the test data
X_test_counts = count_vectorizer.transform(X_test)

# Train Multinomial Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_counts, y_train)

# 5. Predict and Evaluate
y_pred_counts = nb_classifier.predict(X_test_counts)

# Get the feature names to see which words are helping us predict
feature_names = count_vectorizer.get_feature_names_out()

# Getting log coefficients 
log_probs_fake = nb_classifier.feature_log_prob_[0]
log_probs_true = nb_classifier.feature_log_prob_[1]

# Create a DataFrame so we can sort them 
feature_df = pd.DataFrame({
    'feature': feature_names,
    'log_prob_fake': log_probs_fake,
    'log_prob_true': log_probs_true
})

# Calculate the difference between coefficients 
# A larger positive difference means the word is highly associated with fake news
feature_df['fake_score'] = feature_df['log_prob_fake'] - feature_df['log_prob_true']

# 5. Get top 20 for Fake (highest positive scores)
top_fake_features = feature_df.sort_values(by='fake_score', ascending=False).head(20)

# 6. Get top 20 for True (lowest negative scores)
top_true_features = feature_df.sort_values(by='fake_score', ascending=True).head(20)


# Print results
print("--- Naive Bayes Classification Results (Using Raw Word Counts) ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_counts):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_counts))

# Print important classifying words
print("--- Top 20 Words Predicting FAKE News (Label 0) ---")
print(top_fake_features[['feature', 'fake_score']].to_markdown(index=False))

print("\n--- Top 20 Words Predicting TRUE News (Label 1) ---")
print(top_true_features[['feature', 'fake_score']].to_markdown(index=False))

--- Naive Bayes Classification Results (Using Raw Word Counts) ---
Accuracy: 0.9418

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.91      0.94      3580
           1       0.93      0.96      0.95      4237

    accuracy                           0.94      7817
   macro avg       0.94      0.94      0.94      7817
weighted avg       0.94      0.94      0.94      7817

--- Top 20 Words Predicting FAKE News (Label 0) ---
| feature        |   fake_score |
|:---------------|-------------:|
| reilly         |      5.38192 |
| finicum        |      5.36565 |
| fcking         |      5.23757 |
| henningsen     |      5.18661 |
| whined         |      5.09064 |
| bundy          |      5.05214 |
| hammonds       |      4.81013 |
| behar          |      4.81013 |
| fck            |      4.7909  |
| shit           |      4.7844  |
| somodevilla    |      4.77129 |
| watters        |      4.75129 |
| elizabethforma |      4.74114 |
| 

# Classifying with cosine and k-means clustering

## Tokenization of text sentences

In [None]:
import pandas as pd
import numpy as np
import string
import re
from collections.abc import Iterable

# Cleaning process to split on "." before removing other punctuation
# Initial Split (We use the original 'text' column)
data['sentence_list'] = data['text'].str.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', expand=False)


def clean_and_filter_sentences(sentence_list):
    """Strips whitespace and filters out empty/short strings."""
    try:
        if pd.isna(sentence_list).all():
            return []
    except AttributeError:
        # handle nans
        if pd.isna(sentence_list):
            return []
    
    # array-like objects into a Python list
    if isinstance(sentence_list, (np.ndarray, pd.Series)):
        sentence_list = sentence_list.tolist()
        
    # Handle non-iterable inputs
    if not isinstance(sentence_list, Iterable) or isinstance(sentence_list, str):
        # Wrap into a list
        sentence_list = [str(sentence_list)]
    # Strip leading/trailing whitespace after "." from sentences
    cleaned_list = [s.strip() for s in sentence_list]
    
    # Filter out elements that are empty or too short (like just "U. ")
    final_sentences = [s for s in cleaned_list if len(s) > 5]
    
    return final_sentences

# Apply the cleaning and filtering
data['sentences'] = data['sentence_list'].apply(clean_and_filter_sentences)



# Clean sentences like we did before with the entire article text now just on our sentence tokens
# Define all cleaning patterns
punctuation_to_remove = string.punctuation + '“”‘’' 
credit_patterns = r'getty|flickr|wikimedia|belowfeatured|somodevillagetty|mcnameegetty|angerergetty|wiretv|acr|cdata|filessupport'
code_patterns = r'var|js|dgetelementsbytagnames|dcreateelements|dgetelementbyidid|jssrc|jsid|wfb|featured|screenshot|raedle|gage|donnell|whinedr|src|xfbml|parentnodeinsertbefore|versionv|screengrab|subscribing|nyp'
url_patterns = r'http[s]?://\S+|www\.\S+|\S+\.(com|org|net|co|ly)|pictwittercom|httpstco|bitly'

def apply_text_cleaning(sentence_list):
    """Applies all cleaning rules to every string inside the list."""
    cleaned_sentences = []
    
    for sentence in sentence_list:
        text = str(sentence)
        
        # Lowercasing and strip
        text = text.strip().lower()

        # Remove URLs and links
        text = re.sub(url_patterns, '', text)
        
        # Remove credit/code patterns
        text = re.sub(credit_patterns, '', text)
        text = re.sub(code_patterns, ' ', text)
        
        # Remove punctuation
        text = text.translate(str.maketrans('', '', punctuation_to_remove))
        
        cleaned_sentences.append(text)
        
    return cleaned_sentences

# Apply the complex cleaning function to get our clean sentence tokens
data['text_sentence_tokens'] = data['sentences'].apply(apply_text_cleaning)


### Embed sentence tokens

#### Note, embedding the words of the sentences takes very long and requires too much memory making the kernel crash. So we stick with embedding the sentences of the articles

In [None]:
# CREATING EMBEDDINGS FOR TITLES
import pandas as pd
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from sentence_transformers import SentenceTransformer

# Initialize the encoder
title_sent_encoder = SentenceTransformer('all-MiniLM-L12-v2')

# Extract columns and convert to lists (using existing column names)
title_list = data['title_standard'].tolist()

# Generate embeddings for titles which consist of one sentence 
title_embeddings = title_sent_encoder.encode(title_list, show_progress_bar=True)

# Correctly store the 2D arrays back into the DataFrame
data['title_embedding'] = pd.Series(list(title_embeddings), index=data.index)

In [None]:
from itertools import chain

# Flatten the list of lists in the 'text_sentence_tokens' column 
# into one single list of all sentences.
all_sentences = list(chain.from_iterable(data['text_sentence_tokens'].dropna()))

print(f"Total number of individual sentences to embed: {len(all_sentences)}")

In [None]:
# This part takes a while

# Initialize the Sentence Transformer model
from sentence_transformers import SentenceTransformer

# Initialize the Sentence Transformer model
sent_encoder = SentenceTransformer('all-MiniLM-L12-v2')

# Run the encoding on the flattened list
print("Starting embedding process...")
X_embeddings = sent_encoder.encode(
    all_sentences, 
    show_progress_bar=True,
    convert_to_tensor=False 
)

print(f"Embedding complete. Final embedding shape: {X_embeddings.shape}")

In [None]:
import numpy as np
# The dimension of our embedding vectors 384 for all-miniLM transformer
embedding_dimension = X_embeddings.shape[1] 

mean_embeddings = []
current_index = 0

# Iterate through each row in our DataFrame
for index, sentence_list in data['text_sentence_tokens'].items():
    num_sentences = len(sentence_list)
    
    if num_sentences == 0:
        # Handle rows where cleaning resulted in zero valid sentences
        article_embedding = np.zeros(embedding_dimension)
    else:
        # Extract the sentences belonging to the current article
        article_sentence_embeddings = X_embeddings[current_index : current_index + num_sentences]
        
        # Calculate the average vector across all sentences (axis=0 averages down the rows)
        article_embedding = np.mean(article_sentence_embeddings, axis=0)
        
        # Move the pointer to the start of the next article embedding
        current_index += num_sentences
        
    mean_embeddings.append(article_embedding)

# The total number of new embeddings should match the number of rows in our DataFrame
print(f"Total number of aggregated article embeddings created: {len(mean_embeddings)}")

# Classifying with cosine

In [None]:
from sklearn.model_selection import train_test_split

#-------- Uncomment one of the text_features to choose between the title sentence embeddings or the text embeddings------------

# ------ Title embeddings ------------
#text_features = np.stack(data['title_embedding'].to_numpy())
# ------------------------------------

# ------ text embeddings  ------------
text_features = np.stack(data['aggregated_text_embedding'].to_numpy())
# ------------------------------------

X = text_features
y = data['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [None]:
df_train = pd.DataFrame({'embeddings': list(X_train), 'label': y_train})

title_mean = df_train.groupby('label')['embeddings'].apply(
    lambda x: np.mean(np.stack(x.values), axis=0)
)

title_mean_true = title_mean[1]
title_mean_fake = title_mean[0]

In [None]:
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics import classification_report, accuracy_score


mean_matrix = np.stack([title_mean_fake, title_mean_true])


distance_matrix = cosine_distances(X_test, mean_matrix)


y_pred = np.argmin(distance_matrix, axis=1)


accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Fake (0)', 'True (1)'])

print(f"Accuracy on unseen Test Set: {accuracy:.4f}")
print(report)

# K-means clustering

In [None]:
import umap
import numpy as np 


title_features = data['title_embedding']

# I use umap, since t-sne was problematic for me, to reduce embedding dimensions to the 3 most important or the ones that explain the most variance.
reducer = umap.UMAP(n_components=3, random_state=20)

# Convert the series of embedding arrays into a 2D numpy array
X_features = np.stack(title_features.to_numpy())

X_umap = reducer.fit_transform(X_features)

print("UMAP transformation complete.")
print(f"UMAP Output Shape: {X_umap.shape}")



In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib.pyplot as plt
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

silhouette_values = []

for k in range(2, 10):
    print(f'fitting k = {k}')
    kmeans = KMeans(n_clusters=k, random_state=20, n_init=10)
    cluster_labels = kmeans.fit_predict(X_umap)
    
    # Calculate silhouette score
    score = silhouette_score(X_umap, cluster_labels)
    silhouette_values.append(score)

# plotting
plt.plot(range(2, 10), silhouette_values, marker='x')
plt.title('Silhouette Score vs Number of Clusters')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

In [None]:
from sklearn.cluster import KMeans

import numpy as np
import matplotlib.pyplot as plt

elbow_point = 5

# Refit with optimal clusters
kmeans = KMeans(n_clusters=elbow_point, random_state=20, n_init = 10)
kmeans.fit(X_umap) 

# Re-assign cluster labels based on the optimal clustering
cluster_assignments = kmeans.labels_

paragraphs = data['title'].tolist() 

central_sentences = []

for cluster_id in range(elbow_point): 
 cluster_indices = (cluster_assignments == cluster_id)
 cluster_X_paragraphs = np.array(paragraphs)[cluster_indices]
 X_umap_cluster = X_umap[cluster_indices]

 if len(cluster_X_paragraphs) > 0:

  cluster_umap_points = X_umap[cluster_indices]
  if len(cluster_umap_points) > 0:
      centroid_umap = kmeans.cluster_centers_[cluster_id]

      distances_to_centroid = np.sum((cluster_umap_points - centroid_umap)**2, axis=1)
      closest_point_index_in_cluster_umap = np.argmin(distances_to_centroid)


      central_sentence = cluster_X_paragraphs[closest_point_index_in_cluster_umap]
      central_sentences.append(central_sentence)

# Visualize clusters in 2D using X_umap
scatter = plt.scatter(X_umap[:, 0], X_umap[:, 1], c=cluster_assignments, cmap='viridis', alpha=0.5)
plt.legend(*scatter.legend_elements(), title="Clusters")
plt.title(f'Clusters with sentence-level transformer embeddings {elbow_point}')
plt.show()

# Print central sentences of clusters
print("\nCentral Sentences:")
for i, sentence in enumerate(central_sentences):
 print(f"Cluster {i}: {sentence}\n{'-' * 50}")

In [None]:
# We add cluster assignments to our df
data['cluster_id'] = cluster_assignments

# Group by cluster_id and label to see the distribution of true/fake articles in each cluster
cluster_label_distribution = data.groupby(['cluster_id', 'label']).size().unstack(fill_value=0)

cluster_label_distribution.rename(columns={0: 'Fake Articles', 1: 'True Articles'}, inplace=True)

display(cluster_label_distribution)

# Topic modelling

In [None]:
from transformers import pipeline
from tqdm.auto import tqdm
tqdm.pandas()

topic_classifier = pipeline("text-classification", model="classla/multilingual-IPTC-news-topic-classifier", device=0, max_length=512, truncation=True)

print(f"Applying topic classifier to {len(data['text_cleaned'])} articles. This might take a while...")
data['topic_predictions'] = data['text_cleaned'].progress_apply(lambda x: topic_classifier(x)[0] if pd.notna(x) and x != '' else None)

## Subsetting dataframe with overlapping clusters

In [None]:
subset_df = data[data['cluster_id'].isin([1, 2])]

In [None]:
# Top 3 topics in subset
for label in subset_df['label'].unique():
    class_subset = subset_df[subset_df['label'] == label]
    

    topic_series = class_subset['topic_predictions'].apply(lambda x: x['label'] if isinstance(x, dict) and 'label' in x else None)
    
    top_3_topics = topic_series.value_counts().head(3)
    
    label_name = "True" if label == 1 else "Fake"
    print(f"Top 3 topics for {label_name} (label {label}):")
    print(top_3_topics)
    print("-" * 30)

In [None]:
from transformers import pipeline

# Subjectivity model
classify = pipeline(
    task="text-classification",
    model="cffl/bert-base-styleclassification-subjective-neutral",
    return_all_scores=True,
)

results = classify(subset_df['title_standard'].tolist(), truncation=True)

def get_top_style(scores):
    return max(scores, key=lambda x: x['score'])['label']

subset_df['style'] = [get_top_style(res) for res in results]

for label in [0, 1]:
    label_name = "True" if label == 1 else "Fake"
    print(f"Style/Subjectivity for {label_name} (label {label}):")
    print(subset_df[subset_df['label'] == label]['style'].value_counts())
    print("-" * 30)

In [None]:
# ALL ARTICLES TONE

from transformers import pipeline

# Model
classify = pipeline(
    task="text-classification",
    model="cffl/bert-base-styleclassification-subjective-neutral",
    return_all_scores=True,
)

results = classify(data['title_standard'].tolist(), truncation=True)

def get_top_style(scores):
    return max(scores, key=lambda x: x['score'])['label']

data['style'] = [get_top_style(res) for res in results]


for label in [0, 1]:
    label_name = "True" if label == 1 else "Fake"
    print(f"Style/Subjectivity for {label_name} (label {label}):")
    print(data[data['label'] == label]['style'].value_counts())
    print("-" * 30)

In [None]:
# EMOTIONS
from transformers import pipeline
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=2)

results = classifier(data['title_standard'].tolist(), truncation=True)

def get_top_style(scores):
    return max(scores, key=lambda x: x['score'])['label']

data['emotions'] = [get_top_style(res) for res in results]

for label in [0, 1]:
    label_name = "True" if label == 1 else "Fake"
    print(f"Emotions for {label_name} (label {label}):")
    print(data[data['label'] == label]['emotions'].value_counts())
    print("-" * 30)

# Decision tree

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Use clusters since these are made based on embeddings.
features_embeddings = pd.DataFrame(data['cluster_id']).reset_index(drop=True)


features_meta = pd.get_dummies(data[['style', 'emotions']]).reset_index(drop=True)

# Combine
X = pd.concat([features_embeddings, features_meta], axis=1)
y = data['label']

X.columns = X.columns.astype(str)
# Split to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Start tree
tree_clf = DecisionTreeClassifier(max_depth=4, random_state=42)
tree_clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = tree_clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Plotting

from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(25,12))
plot_tree(tree_clf, 
          feature_names=X.columns, 
          class_names=['Fake', 'True'], 
          filled=True, 
          impurity=False, 
          proportion=True, 
          precision=2
          )
 
plt.show()