In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import networkx as nx
import matplotlib.pyplot as plt


In [2]:
# Load cleaned dataset
df = pd.read_csv("../data/preprocessed data/cleaned_fakenews.csv")

print("Dataset shape:", df.shape)
print(df['target'].value_counts())


Dataset shape: (16310, 4)
target
1    8454
0    7856
Name: count, dtype: int64


In [3]:
# Ensure text column exists
df['final_text'] = df['final_text'].astype(str)

# Domain A: Fake news
fake_news = df[df['target'] == 0]['final_text']

# Domain B: True news
true_news = df[df['target'] == 1]['final_text']

print("Fake news count:", len(fake_news))
print("True news count:", len(true_news))


Fake news count: 7856
True news count: 8454


In [4]:
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english'
)

tfidf_all = vectorizer.fit_transform(
    pd.concat([fake_news, true_news])
)

fake_vecs = tfidf_all[:len(fake_news)]
true_vecs = tfidf_all[len(fake_news):]

In [5]:
similarity_matrix = cosine_similarity(fake_vecs, true_vecs)

print("Similarity matrix shape:", similarity_matrix.shape)

Similarity matrix shape: (7856, 8454)


In [6]:
TOP_K = 3
THRESHOLD = 0.3

links = []

for i in range(similarity_matrix.shape[0]):
    top_indices = np.argsort(similarity_matrix[i])[::-1][:TOP_K]
    
    for j in top_indices:
        score = similarity_matrix[i][j]
        if score >= THRESHOLD:
            links.append({
                "Fake_Index": fake_news.index[i],
                "True_Index": true_news.index[j],
                "Similarity": score
            })

links_df = pd.DataFrame(links)
print("Total cross-domain links:", len(links_df))
links_df.head()


Total cross-domain links: 11536


Unnamed: 0,Fake_Index,True_Index,Similarity
0,3,9817,0.582936
1,3,5254,0.527638
2,3,1256,0.522317
3,10,8572,0.367338
4,10,15999,0.330947


In [7]:
links_df.to_csv("../outputs/cross_domain_links.csv", index=False)
print("✅ cross_domain_links.csv saved")

✅ cross_domain_links.csv saved


In [8]:
G = nx.Graph()

# Add nodes
for idx in links_df['Fake_Index'].unique():
    G.add_node(f"Fake_{idx}", domain="Fake")

for idx in links_df['True_Index'].unique():
    G.add_node(f"True_{idx}", domain="True")

# Add edges
for _, row in links_df.iterrows():
    G.add_edge(
        f"Fake_{row['Fake_Index']}",
        f"True_{row['True_Index']}",
        weight=row['Similarity']
    )

plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, seed=42)

node_colors = [
    "skyblue" if G.nodes[n]["domain"] == "Fake" else "lightgreen"
    for n in G.nodes()
]

nx.draw(
    G, pos,
    with_labels=False,
    node_color=node_colors,
    node_size=300,
    edge_color="gray"
)

plt.title("Cross-Domain Linking: Fake ↔ True News")
plt.show()


KeyboardInterrupt: 

<Figure size 1200x800 with 0 Axes>