## We start by importing all of the libraries and functions we'll need

In [None]:
# System tools
import os

# Data analysis
import pandas as pd
from collections import Counter
from itertools import combinations 
from tqdm import tqdm

# NLP
import spacy
nlp = spacy.load("en_core_web_sm")

# Network analysis tools
import networkx as nx
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,20)

## Define input file and read data to dataframe

In [None]:
# Location of the news dataset: one directory up, under data/tabular_data
data_dir = os.path.join("..", "data", "tabular_data")
input_file = os.path.join(data_dir, "fake_or_real_news.csv")

In [None]:
# Load the CSV at `input_file` into a pandas DataFrame
data = pd.read_csv(input_file)

In [None]:
# Bare expression: in a notebook cell this displays the DataFrame for inspection
data

## Extract all named individuals

We use ```spaCy``` to extract named entities from the texts and keep only those labelled ```PERSON``` (the same approach works for other labels, such as ```LOC```).

NB: See final comment below under ```Problems```!

In [None]:
# Select the "text" column for the rows whose label is exactly "REAL"
real_text = data.loc[data["label"] == "REAL", "text"]

In [None]:
# Build one list of PERSON mentions per document (a list of lists).
# Repeated mentions within a document are kept as-is.
person_list = []

# Stream the texts through the spaCy pipeline in batches of 500,
# with tqdm reporting progress
for doc in tqdm(nlp.pipe(real_text, batch_size=500)):
    # keep only the surface text of entities labelled PERSON
    tmp_list = [entity.text for entity in doc.ents if entity.label_ == "PERSON"]
    person_list.append(tmp_list)

## Create edgelist using ```itertools.combinations()```

In [None]:
# Small toy list; presumably for trying out combinations() by hand.
# It is not referenced by the other cells shown in this notebook.
test = ["A", "B", "C", "D"]

In [None]:
# Flat list of every 2-combination of mentions, document by document.
# No filtering is applied here, so pairs of identical names are included.
edgelist2 = [
    pair
    for sublist in person_list
    for pair in combinations(sublist, 2)
]

In [None]:
# create output edgelist: one (nodeA, nodeB) tuple per co-mention
edgelist = []

# go over each list or "document" one at a time
for sublist in person_list:
    # every pairing of mentions in this doc
    for node_a, node_b in combinations(sublist, 2):
        # a name paired with another mention of itself is not an edge
        if node_a == node_b:
            continue
        # Store the pair in sorted order. The network is undirected, so
        # ("A", "B") and ("B", "A") are the same edge; without a canonical
        # order they would be counted as two separate keys and the edge
        # weight would be split between them.
        edgelist.append(tuple(sorted((node_a, node_b))))

## Count occurrences using ```Counter()```

Create DF from Counter object, showing each node pair and the edge weight.

In [None]:
# Weighted edgelist: (nodeA, nodeB, weight) triples, where weight is the
# number of times that exact pair appears in `edgelist`
weighted_edges = [
    (pair[0], pair[1], count)
    for pair, count in Counter(edgelist).items()
]

In [None]:
# Tabulate the (nodeA, nodeB, weight) triples, one row per node pair
edges_df = pd.DataFrame(weighted_edges, columns=["nodeA", "nodeB", "weight"])

In [None]:
# Bare expression: in a notebook cell this displays the edge table for inspection
edges_df

__Filter based on edgeweight__

In [None]:
# Keep only the rows whose weight is strictly greater than 100
filtered = edges_df[edges_df["weight"].gt(100)]

## Create network

Create a graph object called ```G```

In [None]:
# Build the graph from the filtered table: nodeA/nodeB are the edge
# endpoints and "weight" is carried over as an edge attribute
G = nx.from_pandas_edgelist(
    filtered,
    source="nodeA",
    target="nodeB",
    edge_attr=["weight"],
)

We're just going to use the simplest plotting algorithm. But feel free to experiment with different approaches and see how they perform differently:

https://networkx.org/documentation/stable/reference/drawing.html

In [None]:
# Draw G with node labels, using small nodes (size 20) and 10pt label text
nx.draw_networkx(G, with_labels=True, node_size=20, font_size=10)

Make sure the folder ```../viz``` exists already for saving the image

In [None]:
# Write the current matplotlib figure to ../viz/network.png.
# The filename must not start with a space, otherwise the file on disk is
# literally named " network.png".
# NOTE(review): plt.savefig saves the *current* figure. Depending on the
# notebook backend, the figure drawn in an earlier cell may already have been
# closed, which would give an empty image; confirm, or draw and save in the
# same cell.
outpath_viz = os.path.join('..', 'viz', 'network.png')
plt.savefig(outpath_viz, dpi=300, bbox_inches="tight")

## Centrality measures

In [None]:
# Eigenvector centrality for every node in G.
# NOTE(review): no `weight` argument is passed, so the "weight" edge
# attribute is presumably not used here; confirm this is intended.
ev = nx.eigenvector_centrality(G)

In [None]:
# Turn the (node, score) pairs into a two-column DataFrame; no column names
# are given, so the columns are labelled 0 (node) and 1 (score)
eigenvector_df = pd.DataFrame(ev.items())

In [None]:
# Display the nodes ordered by score (column 1), highest first.
# The sorted result is not assigned, so eigenvector_df itself is unchanged.
eigenvector_df.sort_values(1, ascending=False)

In [None]:
# Betweenness centrality for every node in G
bc = nx.betweenness_centrality(G)

# Tabulate the (node, score) pairs, then order by score (column 1), highest first
betweenness_df = pd.DataFrame(bc.items())
betweenness_df = betweenness_df.sort_values(1, ascending=False)

## Problems

- How much of an issue is coreference in the data?

- We've said that we're basing this on document co-occurrence. But then why are there some node pairs with a greater edge weight than the number of documents?

- We could resolve this by changing the final line of our ```spaCy``` loop (the one that appends each document's entities to ```person_list```) to be something like ```person_list.append(set(sorted(tmp_list)))```.
    - What does this code do?