In [57]:
import ast
import itertools
import json
from copy import deepcopy

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import psycopg2
from networkx.readwrite import json_graph

In [17]:
# Load the Twitter/Facebook match table and key it by Twitter account id.
# The CSV's "ID" column is read as the index and then replaced by "twitter_ID".
# Later cells handle several rows sharing one twitter id, so the index is not
# assumed to be unique.
sehir_matches_by_id = (
    pd.read_csv('datasets/twitter_fb_matches.csv', index_col="ID")
    .set_index("twitter_ID")
)
sehir_matches_by_id.head()

Unnamed: 0_level_0,sehir_matches,twitter_screen_name,profile_description,followers_count,friends_count,favourites_count,statuses_count,lang,twitter_name,fb_ID,full_name,membership
twitter_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
185410794,muhammed caki,m_sysL,iühf 🐎,349,323,1852,7807,tr,muhammed mucahit,10211725757681071,muhammed cak,Sehir Dersler&Hocalar
849869543522656256,muhammed caki,Muhamme74015968,Haber Gündem Spor Futbol Müzik,2,57,1,7,tr,muhammed ali,10211725757681071,muhammed cak,Sehir Dersler&Hocalar
938059400744456192,muhammed caki,Muhammed4166,,335,2220,1068,114,tr,muhammed ali,10211725757681071,muhammed cak,Sehir Dersler&Hocalar
401477209,muhammed caki,maliustun,a değil.,366,271,1251,1186,tr,muhammed ali,10211725757681071,muhammed cak,Sehir Dersler&Hocalar
218821230,muhammed caki,MuhammedCiya21,,173,290,85,0,tr,muhammed ciya,10211725757681071,muhammed cak,Sehir Dersler&Hocalar


In [22]:
# Number of rows (candidate Twitter/Facebook matches) in the match table.
len(sehir_matches_by_id)

1800

## Connect to the SQL database

In [10]:
# Open the PostgreSQL connection used to load the crawled Twitter data.
# The password is deliberately NOT written in the notebook: libpq (and therefore
# psycopg2) reads it from the PGPASSWORD environment variable or from ~/.pgpass.
# Set one of those before running this cell.
connection = psycopg2.connect('dbname=twitter_accounts_new host=localhost user=postgres')

# All crawled accounts, with columns renamed to the names used in the match table.
twitter_users = (
    pd.read_sql("SELECT * FROM twitter_user", connection)
    .rename(columns={'id': 'GUID',
                     'name': 'twitter_name',
                     'description': 'profile_description',
                     'screen_name': 'twitter_screen_name'})
)

# Connection rows between accounts; the table's own 'id' column is dropped
# because only the from_user_id / to_user_id pair is used below.
user_connections = pd.read_sql("SELECT * FROM twitter_connection", connection).drop('id', axis=1)

## Constructing the network

In [184]:
# Undirected graph whose nodes are small integer ids, one per matched name.
G = nx.Graph()
# Two lookup tables kept in sync while the edges are added:
users_ids = {}  # matched name -> node id
ids_users = {}  # node id -> (twitter_ID, matched name)

In [185]:
def _matched_names(twitter_id):
    """Return the names matched to ``twitter_id`` that may become graph nodes.

    Looks ``twitter_id`` up in ``sehir_matches_by_id`` and returns its
    ``sehir_matches`` value(s) as a list of strings. A twitter id that appears
    on several rows yields one entry per row. Names containing "sehir" and
    non-string values (e.g. NaN) are dropped. An id without any row in the
    table yields an empty list.
    """
    try:
        matched = sehir_matches_by_id.loc[twitter_id]["sehir_matches"]
    except (KeyError, TypeError):
        return []
    # A repeated index label makes .loc return a Series instead of a scalar.
    candidates = matched.tolist() if isinstance(matched, pd.Series) else [matched]
    # Filter name by name: `"sehir" in <Series>` would test the index labels
    # (twitter ids), not the names, so the check must run on each value.
    return [name for name in candidates
            if isinstance(name, str) and "sehir" not in name]


# Add one edge per (from-name, to-name) pair of every connection whose two
# endpoints both have at least one usable match. Only the two id columns are
# iterated; iterrows() would coerce each row to a single dtype, which could
# turn the large integer ids into floats if the table held a float column.
for from_, to in zip(user_connections['from_user_id'],
                     user_connections['to_user_id']):
    # An empty list on either side makes the product empty, i.e. the
    # connection is skipped.
    for f, t in itertools.product(_matched_names(from_), _matched_names(to)):
        # Assign the next free node id the first time a name is seen.
        users_ids.setdefault(f, len(users_ids))
        # A name reached through several twitter ids keeps the id of the
        # connection processed last.
        ids_users[users_ids[f]] = (from_, f)

        users_ids.setdefault(t, len(users_ids))
        ids_users[users_ids[t]] = (to, t)

        G.add_edge(users_ids[f], users_ids[t])

In [186]:
# Sanity check: every name that received a node id was also added to the graph
# as an edge endpoint, so the two counts should agree.
len(users_ids),G.number_of_nodes()

(459, 459)

In [187]:
# Number of distinct undirected edges between matched names.
G.number_of_edges()

699

## Augmenting the nodes with information

In [214]:
# Copy the match-table columns of each node's (twitter id, name) pair onto the
# node as attributes.
for ix in G.nodes:
    twitter_id, sehir_name = ids_users[ix]
    match = sehir_matches_by_id.loc[twitter_id]
    if isinstance(match, pd.DataFrame):
        # Several rows share this twitter id. Keep the first row that belongs
        # to this node's name (there can be more than one when the same name
        # matched different fb ids); fall back to the first row overall if the
        # name is not among them. Either way `match` ends up as a single row.
        same_name = match[match["sehir_matches"] == sehir_name]
        match = (same_name if len(same_name) else match).iloc[0]
    for k, v in match.items():
        try:
            # Plain int instead of np.int64 so the graph can be dumped to JSON.
            G.nodes[ix][k] = int(v)
        except ValueError:
            # Non-numeric text and NaN are stored as strings (NaN becomes 'nan').
            G.nodes[ix][k] = str(v)

## Adding network metrics

In [215]:
# Store each node's degree and a parity flag (1 for even degree, 0 for odd).
# G.nodes is used instead of G.node, which newer networkx releases removed.
for ix, deg in G.degree():
    G.nodes[ix]['degree'] = deg
    G.nodes[ix]['parity'] = 1 - deg % 2

In [216]:
# Three centrality measures over the whole graph. Each result is a mapping
# from node id to score and is attached to the nodes in the following cells.
evc = nx.eigenvector_centrality(G)
closeness = nx.closeness_centrality(G)
betweenness = nx.betweenness_centrality(G)

In [217]:
# Node-attribute name -> per-node scores, in the order they are attached.
metrics = dict(
    eigenvector_centrality=evc,
    closeness_centrality=closeness,
    betweenness=betweenness,
)

In [218]:
# Attach every centrality score to its node under the metric's name.
# G.nodes is used instead of G.node, which newer networkx releases removed.
for metric_name, metric in metrics.items():
    for ix, v in metric.items():
        G.nodes[ix][metric_name] = v

In [219]:
# Show the first node with its attribute dict, without building the full list.
next(iter(G.nodes(data=True)))

(0,
 {'betweenness': 0.0,
  'closeness_centrality': 0.281499692685925,
  'degree': 1,
  'eigenvector_centrality': 0.013644429140367793,
  'favourites_count': 0,
  'fb_ID': 597073473973832,
  'followers_count': 0,
  'friends_count': 9,
  'full_name': 'ahmet sehir',
  'lang': 'tr',
  'membership': 'Sehir Dersler&Hocalar',
  'parity': 0,
  'profile_description': 'nan',
  'sehir_matches': 'ahmet emir',
  'statuses_count': 8,
  'twitter_name': 'ahmet demir',
  'twitter_screen_name': 'ahmetde43241739'})

In [220]:
# Serialise the annotated graph in node-link form and write it to disk.
# `json` is imported in the notebook's import cell at the top.
data = nx.node_link_data(G)
with open('twitter_fb.json', 'w') as f:
    json.dump(data, f, indent=4)