In [2]:
# System tools
import os

# Data analysis
import pandas as pd
from collections import Counter
from itertools import combinations 
from tqdm import tqdm

# NLP
import spacy
# Load spaCy's "en_core_web_sm" pipeline once; the cells below call `nlp(text)`
# and read `doc.ents` to get the named entities for each text.
nlp = spacy.load("en_core_web_sm")

# drawing
import networkx as nx
import matplotlib.pyplot as plt
# Default size for every matplotlib figure created after this point.
plt.rcParams["figure.figsize"] = (20,20)

## All data

In [None]:
"""
--------------------- KING JAMES BIBLE ------------------------------
"""
# Shared helpers for the three corpora processed in this cell. The same
# pipeline (NER -> co-occurrence pairs -> weighted edgelist) is applied to
# each corpus, so it lives in one place instead of being copy-pasted.

def select_texts(frame, column="text"):
    """Return the non-null texts stored in ``frame[column]``.

    Iterating over a DataFrame directly yields its column labels, not its
    rows, so the text column has to be selected explicitly before the texts
    are handed to spaCy.

    Parameters
    ----------
    frame : pandas.DataFrame
        The table read from the corpus CSV.
    column : str
        Name of the column holding the raw text.
        NOTE(review): "text" is assumed because fake_or_real_news.csv uses
        it; confirm the header of the other CSV files.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``frame``.
    """
    if column not in frame.columns:
        raise KeyError(
            f"Column {column!r} not found; available columns: {list(frame.columns)}"
        )
    return frame[column].dropna()


def build_edgelist(texts, entity_label, output_file):
    """Build a weighted co-occurrence edgelist and write it to ``output_file``.

    Parameters
    ----------
    texts : iterable of str
        One document per element.
    entity_label : str
        spaCy entity label to keep (e.g. "PERSON" or "GPE").
    output_file : str
        Path of the CSV to write; its parent directory is created if missing.

    Returns
    -------
    pandas.DataFrame
        Columns "nodeA", "nodeB", "weight"; one row per distinct pair.
    """
    # One list of matching entity mentions per document
    text_entities = []
    for text in tqdm(texts):
        doc = nlp(text)
        text_entities.append(
            [entity.text for entity in doc.ents if entity.label_ == entity_label]
        )

    # Every pair of mentions inside a document is an edge. Sorting the pair
    # makes (A, B) and (B, A) count as the same undirected edge.
    # Note: a name mentioned twice in one document yields an (A, A) pair,
    # exactly as in the original loop.
    edge_counts = Counter(
        tuple(sorted(pair))
        for entities in text_entities
        for pair in combinations(entities, 2)
    )

    counted_edges = [
        (source, target, weight)
        for (source, target), weight in edge_counts.items()
    ]
    edges_df = pd.DataFrame(counted_edges, columns=["nodeA", "nodeB", "weight"])

    # Make sure the output directory exists before writing
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    edges_df.to_csv(output_file, index=False)
    return edges_df


input_file = os.path.join("..", "data", "raw_data", "king_james_bible.csv")
data = pd.read_csv(input_file)
edges_df = build_edgelist(
    select_texts(data), "PERSON", "../data/edgelists/bible_edgelist.csv"
)

"""
------------------ 100 English Novels ------------------------
"""
input_file = os.path.join("..", "data", "raw_data", "100_english_novels.csv")
data = pd.read_csv(input_file)
# GPE = geopolitical entities (countries, cities, states), not persons.
# NOTE(review): the other corpora use "PERSON"; confirm "GPE" is intended here.
edges_df = build_edgelist(
    select_texts(data), "GPE", "../data/edgelists/100_english_novels_edgelist.csv"
)

"""
------------------------------------------ FAKE NEWS ---------------------
"""
# The label filter below only exists in the news corpus, so read that file
# (the original read king_james_bible.csv here by mistake).
input_file = os.path.join("..", "data", "raw_data", "fake_or_real_news.csv")
data = pd.read_csv(input_file)
data = data[data["label"]=="FAKE"]["text"]
edges_df = build_edgelist(
    data, "PERSON", "../data/edgelists/fake_news_edgelist.csv"
)


## King James Bible

In [None]:
# Build a PERSON co-occurrence edgelist for the King James Bible corpus.

# Column holding the raw text.
# NOTE(review): "text" is assumed because fake_or_real_news.csv uses it;
# confirm against the header of king_james_bible.csv.
TEXT_COLUMN = "text"

input_file = os.path.join("..", "data", "raw_data", "king_james_bible.csv")
data = pd.read_csv(input_file)

# Iterating over a DataFrame yields its column labels, not its rows, so the
# text column must be selected explicitly before running NER.
if TEXT_COLUMN not in data.columns:
    raise KeyError(
        f"Column {TEXT_COLUMN!r} not found in {input_file}; "
        f"available columns: {list(data.columns)}"
    )
texts = data[TEXT_COLUMN].dropna()

# One list of PERSON mentions per text
text_entities = []
for text in tqdm(texts):
    doc = nlp(text)
    text_entities.append(
        [entity.text for entity in doc.ents if entity.label_ == "PERSON"]
    )

# Every pair of mentions within a text is an edge; sorting the pair makes
# (A, B) and (B, A) count as the same undirected edge.
edgelist = [
    tuple(sorted(edge))
    for entities in text_entities
    for edge in combinations(entities, 2)
]

# Collapse repeated edges into (source, target, weight) rows
counted_edges = [
    (source, target, weight)
    for (source, target), weight in Counter(edgelist).items()
]

edges_df = pd.DataFrame(counted_edges, columns=["nodeA", "nodeB", "weight"])

edges_df.to_csv("../data/edgelists/bible_edgelist.csv", index=False)

## 100 English novels

In [None]:
# Build a GPE co-occurrence edgelist for the 100 English novels corpus.

# Column holding the raw text.
# NOTE(review): "text" is assumed because fake_or_real_news.csv uses it;
# confirm against the header of 100_english_novels.csv.
TEXT_COLUMN = "text"

input_file = os.path.join("..", "data", "raw_data", "100_english_novels.csv")
data = pd.read_csv(input_file)

# Iterating over a DataFrame yields its column labels, not its rows, so the
# text column must be selected explicitly before running NER.
if TEXT_COLUMN not in data.columns:
    raise KeyError(
        f"Column {TEXT_COLUMN!r} not found in {input_file}; "
        f"available columns: {list(data.columns)}"
    )
texts = data[TEXT_COLUMN].dropna()

# One list of GPE mentions per text. GPE = geopolitical entities (countries,
# cities, states), not persons.
# NOTE(review): the other corpora filter on "PERSON"; confirm "GPE" is intended.
text_entities = []
for text in tqdm(texts):
    doc = nlp(text)
    text_entities.append(
        [entity.text for entity in doc.ents if entity.label_ == "GPE"]
    )

# Every pair of mentions within a text is an edge; sorting the pair makes
# (A, B) and (B, A) count as the same undirected edge.
edgelist = [
    tuple(sorted(edge))
    for entities in text_entities
    for edge in combinations(entities, 2)
]

# Collapse repeated edges into (source, target, weight) rows
counted_edges = [
    (source, target, weight)
    for (source, target), weight in Counter(edgelist).items()
]

edges_df = pd.DataFrame(counted_edges, columns=["nodeA", "nodeB", "weight"])

edges_df.to_csv("../data/edgelists/100_english_novels_edgelist.csv", index=False)

## Real news

In [4]:
# Read the news corpus and keep only the body text of articles labelled REAL
input_file = os.path.join("..", "data", "raw_data", "fake_or_real_news.csv")
data = pd.read_csv(input_file)
data = data.loc[data["label"] == "REAL", "text"]

# One list of PERSON mentions per article (tqdm reports progress over articles)
text_entities = [
    [entity.text for entity in nlp(text).ents if entity.label_ == "PERSON"]
    for text in tqdm(data)
]

# Each pair of mentions inside an article is an edge; sorting the pair makes
# (A, B) and (B, A) count as the same undirected edge
edgelist = [
    tuple(sorted(pair))
    for mentions in text_entities
    for pair in combinations(mentions, 2)
]

# Collapse repeated edges into (source, target, weight) rows
counted_edges = [
    (source, target, weight)
    for (source, target), weight in Counter(edgelist).items()
]

edges_df = pd.DataFrame(counted_edges, columns=["nodeA", "nodeB", "weight"])
edges_df.to_csv("../data/edgelists/real_news_edgelist.csv", index=False)

  2%|▏         | 52/3171 [00:08<08:21,  6.22it/s]


KeyboardInterrupt: 

## Fake news

In [2]:
# Load the news corpus and narrow it down to the text of the FAKE articles
input_file = os.path.join("..", "data", "raw_data", "fake_or_real_news.csv")
data = pd.read_csv(input_file)
is_fake = data["label"] == "FAKE"
data = data[is_fake]["text"]

# Run NER on each article and keep the names of the PERSON entities
text_entities = []
for article in tqdm(data):
    people = [ent.text for ent in nlp(article).ents if ent.label_ == "PERSON"]
    text_entities.append(people)

# Pair up every two mentions from the same article; the pair is sorted so
# that the order in which two names appear does not create distinct edges
edgelist = []
for people in text_entities:
    edgelist.extend(tuple(sorted(pair)) for pair in combinations(people, 2))

# The number of times a pair occurs becomes the weight of that edge
edge_weights = Counter(edgelist)
counted_edges = [(pair[0], pair[1], edge_weights[pair]) for pair in edge_weights]

edges_df = pd.DataFrame(counted_edges, columns=["nodeA", "nodeB", "weight"])

edges_df.to_csv("../data/edgelists/fake_news_edgelist.csv", index=False)

100%|██████████| 3164/3164 [06:46<00:00,  7.79it/s]
