In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import networkx as nx
from community import community_louvain

import pycountry
import seaborn as sns

import collections
import jellyfish

import bs4 as bs  
import urllib.request  
import re  
import nltk

#from py_translator import Translator
from googletrans import Translator
from gensim.models import Word2Vec

# Importing the notebook with all the methods definition
import importlib
import GKG

In [3]:
# Load the Switzerland actor export (tab-separated, Latin-1 encoded).
# Only `sep` is passed: `delimiter` is an alias of `sep`, and newer pandas
# releases raise a ValueError when both are supplied.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0
# (superseded by `on_bad_lines`); it is kept to match the pandas version this
# notebook was written against -- switch it when upgrading pandas.
switzerland_news = pd.read_csv(
    'switzerland_actors.csv',
    sep='\t',
    decimal='.',
    error_bad_lines=False,  # skip malformed rows instead of aborting the load
    engine='c',
    encoding='latin-1',
)

In [4]:
# Extraction of the themes mentioned in the Switzerland actor dataframe
theme_list = GKG.theme_list_extraction(switzerland_news)

In [5]:
# Per-theme statistics returned by GKG.tf_idf_computation over every extracted theme
# (presumably term frequency, document frequency and TF-IDF -- confirm in the GKG module).
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt; the cell has to
# run to completion before any cell that reads idf_total or tf_idf_total.
tf_total, idf_total, tf_idf_total = GKG.tf_idf_computation(dataframe=switzerland_news, themes_of_interest=theme_list)

Num:  100889


KeyboardInterrupt: 

In [None]:
# Cleaning the idf dictionary to keep only the most common themes in the news,
# then plotting how many times each kept theme occurs.

# Themes whose value in idf_total is not strictly above this threshold are dropped.
MIN_THEME_OCCURRENCES = 300

# One row per occurrence, so the count plot below shows each theme's total.
# The rows are collected in a list and the frame is built once: inserting with
# .loc[...] inside the loop grows the frame one row at a time, which is very slow
# for the hundreds of thousands of rows produced here.
repeated_themes = [
    theme
    for theme, occurrences in idf_total.items()
    if occurrences > MIN_THEME_OCCURRENCES
    for _ in range(occurrences)
]
# dtype=object keeps the column type stable even when no theme passes the threshold.
idf_total_clean = pd.DataFrame({'theme': pd.Series(repeated_themes, dtype=object)})

descending_order = idf_total_clean['theme'].value_counts().sort_values(ascending=False).index

fig, ax = plt.subplots(figsize=(8, 9))
g = sns.countplot(y=idf_total_clean['theme'], order=descending_order, ax=ax)
g.set_ylabel('Themes')
g.set_title('Occurrences of themes in the Switzerland GDELT Dataframe')
plt.show()

In [6]:
# Actor network creation: GKG.social_graph_creation fills the empty graph in place
# from the news dataframe, then the size of the resulting network is reported.
G_actor = nx.Graph()
GKG.social_graph_creation(G_actor, switzerland_news)

list_actor = list(G_actor.nodes)
nodes_actor = len(list_actor)
edges_actor = len(G_actor.edges)

print("--- Switzerland News ---")
for template, total in (
    ("A total of %d actors have been detected", nodes_actor),
    ("A total of %d edges have been drawn", edges_actor),
):
    print(template % total)


--- Switzerland News ---
A total of 7311 actors have been detected
A total of 34271 edges have been drawn


In [7]:
# Computation of the PageRank of each actor. The score is then stored on the node
# as the 'pageRank' attribute.
pagerank_actor = nx.pagerank(G_actor)

# G.nodes[...] replaces the G.node[...] alias, which was removed in networkx 2.4.
for actor, score in pagerank_actor.items():
    # Cast to a plain Python float so the attribute has a uniform, serialisable type.
    G_actor.nodes[actor]['pageRank'] = float(score)

In [8]:
# Rank the actors by decreasing 'pageRank' attribute and hand the ordered mapping to
# GKG.topk_actor_extraction to pull out the 10 most influential actors of the network.

def _pagerank_of(item):
    """Return the 'pageRank' attribute of a (node, attributes) pair."""
    _, attributes = item
    return attributes['pageRank']


actor_dict = dict(G_actor.nodes)
ranked_actors = sorted(actor_dict.items(), key=_pagerank_of, reverse=True)
sorted_actor_dict = dict(ranked_actors)

topk_actor = GKG.topk_actor_extraction(sorted_actor_dict, k=10)

In [9]:
# Study of the communities in the actor social network: the Louvain partition assigns
# a community id to every actor, which is stored as the 'community' node attribute
# before the graph is exported for Gephi.

partition = community_louvain.best_partition(G_actor)
# G.nodes[...] replaces the G.node[...] alias, which was removed in networkx 2.4.
for node_actor, community_id in partition.items():
    G_actor.nodes[node_actor]['community'] = community_id

filepath_actor = '../Gephi Files/switzerland_news.gexf'
nx.write_gexf(G_actor, filepath_actor, version="1.2draft")

In [12]:
# Theme network generation

# Start from the actor nodes only (no edges). The former `G_themes = nx.Graph()`
# was dead code: it was overwritten by this copy on the very next line.
G_themes = nx.create_empty_copy(G_actor)
list_actor = list(G_themes.nodes)
GKG.theme_network_creation(G_themes, list_actor, switzerland_news, theme_list, tf_idf_total)

# Every node beyond the original actors was added by the theme network creation.
nodes_themes = len(G_themes.nodes) - nodes_actor
edges_themes = len(G_themes.edges)
print("--- Switzerland Themes ---")
print("A total of %d themes have been detected" % nodes_themes)
print("A total of %d edges have been drawn" % edges_themes)

# `themes_of_interest` was read here without ever being assigned in the notebook
# (NameError on a fresh kernel). It is bound to the extracted theme list, the same
# value passed as `themes_of_interest=` to the TF-IDF computation.
# NOTE(review): confirm no narrower theme selection was intended.
themes_of_interest = theme_list

# Tag every node with its role so themes and actors can be told apart in Gephi.
# G.nodes[...] replaces the G.node[...] alias, which was removed in networkx 2.4.
for node in list(G_themes.nodes):
    G_themes.nodes[node]['nodeType'] = 'Theme' if node in themes_of_interest else 'Actor'

filepath_theme = '../Gephi Files/switzerland_themes.gexf'
nx.write_gexf(G_themes, filepath_theme, version='1.2draft')

--- Tunisia Themes ---
A total of 40 themes have been detected
A total of 12158 edges have been drawn


In [13]:
# Definition of a new graph in which only the theme nodes and the most
# influential actors are kept; every other node is dropped from the copy.

G_influence = G_themes.copy()

# A node is removed when it is neither a theme of interest nor a top-k actor.
nodes_to_remove = [
    candidate
    for candidate in G_influence.nodes
    if not (candidate in themes_of_interest or candidate in topk_actor)
]
G_influence.remove_nodes_from(nodes_to_remove)

filepath_influence = '../Gephi Files/switzerland_influence.gexf'
nx.write_gexf(G_influence, filepath_influence, version='1.2draft')

In [14]:
# For every influential actor, list the themes it is most strongly linked to
# (largest edge weights first) and record them in `strongest_themes`.

# Number of themes reported per actor.
TOP_THEMES_PER_ACTOR = 5

# actor -> {theme: edge weight} for that actor's strongest themes.
strongest_themes = {}

for node in G_influence.nodes:
    # G.nodes[...] replaces the G.node[...] alias, which was removed in networkx 2.4.
    if G_influence.nodes[node]['nodeType'] != 'Actor':
        continue

    # Cell-local names on purpose: this loop used to rebind `theme_list`, clobbering
    # the notebook-wide list of themes that the theme-network cell reads when re-run.
    neighbour_themes = []
    edge_weights = []
    for _, neighbour, attributes in G_influence.edges(node, data=True):
        neighbour_themes.append(neighbour)
        edge_weights.append(attributes['weight'])

    # Positions of the heaviest edges, in decreasing weight order.
    index_themes = np.flip(np.argsort(edge_weights))[0:TOP_THEMES_PER_ACTOR]
    top_themes = [neighbour_themes[index] for index in index_themes]

    # Previously this entry was created empty and never filled.
    strongest_themes[node] = {
        neighbour_themes[index]: edge_weights[index] for index in index_themes
    }

    print("--- ", node, " ---")
    print(top_themes)

---  Youssef Chahed  ---
['GENERAL_GOVERNMENT', 'LEADER', 'ELECTION', 'LEGISLATION', 'ALLIANCE']
---  Muhammad Karki  ---
['LEADER', 'GENERAL_GOVERNMENT', 'ELECTION', 'RELIGION', 'TERROR']
---  El Sebsi  ---
['LEADER', 'GENERAL_GOVERNMENT', 'ELECTION', 'BORDER', 'MEDIA_MSM']
---  Rashid Ghannouchi  ---
['LEADER', 'GENERAL_GOVERNMENT', 'ELECTION', 'ALLIANCE', 'BORDER']
---  Baji Kaid Essebsi  ---
['LEADER', 'GENERAL_GOVERNMENT', 'ELECTION', 'LEGISLATION', 'BORDER']
---  Ben Ali  ---
['LEADER', 'GENERAL_GOVERNMENT', 'LEGISLATION', 'TOURISM', 'RELIGION']
---  El Abidine Ben Ali  ---
['LEADER', 'GENERAL_GOVERNMENT', 'ELECTION', 'LEGISLATION', 'ALLIANCE']
---  Hafedh Essebssi  ---
['LEADER', 'GENERAL_GOVERNMENT', 'ELECTION', 'LEGISLATION', 'MEDIA_MSM']
---  Kanzler Saad Hariri  ---
['GENERAL_GOVERNMENT', 'LEADER', 'ELECTION', 'DEMOCRACY', 'RELIGION']
---  Kanzler Ali  ---
['GENERAL_GOVERNMENT', 'LEADER', 'ELECTION', 'MEDIA_MSM', 'TERROR']
