In [2]:
import re

import numpy as np
from scipy.sparse import csr_matrix, save_npz
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

from networkx.algorithms.community import greedy_modularity_communities
import networkx as nx

import nltk
import string 
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import warnings
warnings.filterwarnings('ignore')

# About the raw data:

### You can get it from here:
*https://www.kaggle.com/mdepak/fakenewsnet?select=BuzzFeedUserUser.txt*

### Then unzip it and put the extracted files into the "Data/" folder

# BuzzFeed News:

## News-User Engagement Matrix:

In [3]:
# Build the BuzzFeed news-user engagement matrix from a text file holding
# whitespace-separated integer triples, one per line. The first two fields are
# shifted by -1 to become row / column indices (presumably 1-based news and
# user ids in the file -- confirm against the dataset description); the third
# field is stored as the matrix value.
row = []
col = []
val = []
with open("Data/BuzzFeedNewsUser.txt", "r") as f:
    for line in f:
        triple = [int(tok) for tok in line.split()]
        if not triple:
            # A blank / whitespace-only line carries no entry. Skipping it
            # avoids an IndexError on triple[0] (e.g. trailing empty lines).
            continue
        row.append(triple[0] - 1)
        col.append(triple[1] - 1)
        val.append(triple[2])

# No explicit shape is given, so the matrix dimensions are inferred from the
# largest row / column index that appears in the file.
U = csr_matrix((val, (row, col))) # news-user mtx
print(U.shape)
save_npz('prep_data/U_bf.npz', U)

(182, 15257)


## N-Gram Count Matrix:

In [5]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own (a one-element list handed to
    ``nltk.pos_tag``), and only the upper-cased first letter of the resulting
    tag is looked at: J -> adjective, N -> noun, V -> verb, R -> adverb.
    Any other leading letter falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    full_tag = tagged[0][1]
    initial = full_tag[0].upper()

    pos_by_initial = dict(
        J=wordnet.ADJ,
        N=wordnet.NOUN,
        V=wordnet.VERB,
        R=wordnet.ADV,
    )
    if initial in pos_by_initial:
        return pos_by_initial[initial]
    return wordnet.NOUN

def preproc(text):
    """Normalise one document into a space-joined string of lemmas.

    Steps, in order: lower-case the text, delete every run of ASCII digits,
    tokenize with ``nltk.word_tokenize``, drop tokens that occur inside
    ``string.punctuation``, and lemmatize each remaining token using the POS
    returned by ``get_wordnet_pos``.
    """
    cleaned = re.sub('[0-9]+', '', text.lower())
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token in nltk.word_tokenize(cleaned):
        # `in` on a str is a substring test, so this drops any token that is a
        # contiguous piece of string.punctuation.
        if token in string.punctuation:
            continue
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return " ".join(lemmas)


In [6]:
# Load the BuzzFeed article tables and stack their "text" columns into one
# series: all real articles first, then all fake ones, with a fresh 0..n-1 index.
real_content = pd.read_csv("Data/BuzzFeed_real_news_content.csv")
fake_content = pd.read_csv("Data/BuzzFeed_fake_news_content.csv")
content = pd.concat(
    [real_content["text"], fake_content["text"]],
    ignore_index=True,
)

# Count unigrams and bigrams over the preprocessed text, keeping at most the
# 3000 most frequent features.
vectorizer = CountVectorizer(
    preprocessor=preproc,
    stop_words="english",
    ngram_range=(1, 2),
    max_features=3000,
)
N = vectorizer.fit_transform(content)

save_npz('prep_data/N_bf.npz', N)

## Labels:

In [10]:
# Build the label vector aligned with the rows of N. `content` was assembled as
# real articles followed by fake articles, so label 0 marks the real block and
# label 1 the fake block.
n_news = N.shape[0]

# Take the class sizes from the source tables instead of assuming an exact
# 50/50 split: n_news // 2 gives the wrong boundary when the classes are
# unequal and a too-short vector when n_news is odd.
n_real = len(real_content)
n_fake = len(fake_content)
assert n_real + n_fake == n_news, (
    "real_content / fake_content do not match N; re-run the N-gram cell first"
)

y_labels = np.array([0]*n_real + [1]*n_fake)
np.savez('prep_data/labels_bf.npz', y_labels)

## User-Community Matrix:

In [None]:
# Build the BuzzFeed user-community membership matrix C.
#
# Step 1: read the user-user edge list. Each non-empty line is split into
# integers; the first two are shifted by -1 to become indices (presumably
# 1-based user ids in the file -- confirm) and every listed pair gets value 1.
row = []
col = []
val = []
with open("Data/BuzzFeedUserUser.txt", "r") as f:
    for line in f:
        triple = [int(tok) for tok in line.split()]
        if not triple:
            # Skip blank / whitespace-only lines instead of failing on triple[0].
            continue
        row.append(triple[0] - 1)
        col.append(triple[1] - 1)
        val.append(1)

# Step 2: force a square adjacency matrix. With an inferred shape the matrix is
# non-square whenever the largest row index and column index differ, and
# networkx refuses to build a graph from a non-square adjacency matrix.
# NOTE(review): users whose id exceeds every id in this edge list still get no
# row in C -- confirm this lines up with the user axis of U downstream.
n_users = max(max(row), max(col)) + 1
C0 = csr_matrix((val, (row, col)), shape=(n_users, n_users)) # user-user mtx

# Step 3: detect communities. nx.from_scipy_sparse_matrix was removed in
# NetworkX 3.0; prefer its replacement from_scipy_sparse_array when present and
# fall back to the old name on older installations.
to_graph = getattr(nx, "from_scipy_sparse_array", None) or nx.from_scipy_sparse_matrix
G = to_graph(C0)
communities = greedy_modularity_communities(G)

# Step 4: one-hot encode membership -- entry (user, i) is 1 when the user
# belongs to the i-th community returned above.
row = []
col = []
val = []

for i, cc in enumerate(communities):
    users = list(cc)
    row.extend(users)
    val.extend([1]*len(users))
    col.extend([i]*len(users))

C = csr_matrix((val, (row, col)))

save_npz('prep_data/C_bf.npz', C)

# PolitiFact:

## News-User Engagement Matrix:

In [11]:
# Build the PolitiFact news-user engagement matrix from a text file holding
# whitespace-separated integer triples, one per line. The first two fields are
# shifted by -1 to become row / column indices (presumably 1-based news and
# user ids in the file -- confirm against the dataset description); the third
# field is stored as the matrix value.
row = []
col = []
val = []
with open("Data/PolitiFactNewsUser.txt", "r") as f:
    for line in f:
        triple = [int(tok) for tok in line.split()]
        if not triple:
            # A blank / whitespace-only line carries no entry. Skipping it
            # avoids an IndexError on triple[0] (e.g. trailing empty lines).
            continue
        row.append(triple[0] - 1)
        col.append(triple[1] - 1)
        val.append(triple[2])

# No explicit shape is given, so the matrix dimensions are inferred from the
# largest row / column index that appears in the file.
U = csr_matrix((val, (row, col))) # news-user mtx
print(U.shape)
save_npz('prep_data/U_pf.npz', U)

(240, 23865)


## N-Gram Count Matrix:

In [12]:
# Load the PolitiFact article tables and stack their "text" columns into one
# series: all real articles first, then all fake ones, with a fresh 0..n-1 index.
real_content = pd.read_csv("Data/PolitiFact_real_news_content.csv")
fake_content = pd.read_csv("Data/PolitiFact_fake_news_content.csv")
content = pd.concat(
    [real_content["text"], fake_content["text"]],
    ignore_index=True,
)

# Count unigrams and bigrams over the preprocessed text, keeping at most the
# 3000 most frequent features.
vectorizer = CountVectorizer(
    preprocessor=preproc,
    stop_words="english",
    ngram_range=(1, 2),
    max_features=3000,
)
N = vectorizer.fit_transform(content)

save_npz('prep_data/N_pf.npz', N)

## Labels:

In [16]:
# Build the label vector aligned with the rows of N. `content` was assembled as
# real articles followed by fake articles, so label 0 marks the real block and
# label 1 the fake block.
n_news = N.shape[0]

# Take the class sizes from the source tables instead of assuming an exact
# 50/50 split: n_news // 2 gives the wrong boundary when the classes are
# unequal and a too-short vector when n_news is odd.
n_real = len(real_content)
n_fake = len(fake_content)
assert n_real + n_fake == n_news, (
    "real_content / fake_content do not match N; re-run the N-gram cell first"
)

y_labels = np.array([0]*n_real + [1]*n_fake)
np.savez('prep_data/labels_pf.npz', y_labels)

## User-Community Matrix:

In [17]:
# Build the PolitiFact user-community membership matrix C.
#
# Step 1: read the user-user edge list. Each non-empty line is split into
# integers; the first two are shifted by -1 to become indices (presumably
# 1-based user ids in the file -- confirm) and every listed pair gets value 1.
row = []
col = []
val = []
with open("Data/PolitiFactUserUser.txt", "r") as f:
    for line in f:
        triple = [int(tok) for tok in line.split()]
        if not triple:
            # Skip blank / whitespace-only lines instead of failing on triple[0].
            continue
        row.append(triple[0] - 1)
        col.append(triple[1] - 1)
        val.append(1)

# Step 2: force a square adjacency matrix. With an inferred shape the matrix is
# non-square whenever the largest row index and column index differ, and
# networkx refuses to build a graph from a non-square adjacency matrix.
# NOTE(review): users whose id exceeds every id in this edge list still get no
# row in C -- confirm this lines up with the user axis of U downstream.
n_users = max(max(row), max(col)) + 1
C0 = csr_matrix((val, (row, col)), shape=(n_users, n_users)) # user-user mtx

# Step 3: detect communities. nx.from_scipy_sparse_matrix was removed in
# NetworkX 3.0; prefer its replacement from_scipy_sparse_array when present and
# fall back to the old name on older installations.
to_graph = getattr(nx, "from_scipy_sparse_array", None) or nx.from_scipy_sparse_matrix
G = to_graph(C0)
communities = greedy_modularity_communities(G)

# Step 4: one-hot encode membership -- entry (user, i) is 1 when the user
# belongs to the i-th community returned above.
row = []
col = []
val = []

for i, cc in enumerate(communities):
    users = list(cc)
    row.extend(users)
    val.extend([1]*len(users))
    col.extend([i]*len(users))

C = csr_matrix((val, (row, col)))

save_npz('prep_data/C_pf.npz', C)