In [1]:
import psycopg2

import pandas as pd
import numpy as np

from copy import deepcopy

import ast
import random
import networkx as nx
import time, unicodedata
import itertools

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from joblib import Parallel, delayed
from datetime import datetime

In [2]:
rc = pd.read_csv("../REST/static/random_connections.csv",index_col="Unnamed: 0")
rc.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation
4570,928678200,745389817,{'2018.05.22': True}
12321,19869574,141529642,{'2018.05.17': True}
5024,188561280,314762385,{'2018.05.30': True}
3337,202625973,368390845,{'2018.05.21': True}
6969,836326702,913846233,{'2018.05.13': True}


In [53]:
import ast
str2dict = lambda d : ast.literal_eval(d)
truncate = lambda x: int(str(int(x))[:9])

In [4]:
rc.formation = rc.formation.apply(str2dict)

In [5]:
def present_in_date(changes_dates, queried_date):
    """
    checking if a connection is present in a queried date
    changes_dates: {d1:True, d2:False, d3:True} connection added or removed
    queried_date: e.g. "2018.05.08"
    """
    str2date = lambda strdate: datetime.strptime(strdate, '%Y.%m.%d')  # 2018.05.08
    changes = sorted(changes_dates,key=lambda d: str2date(d))
    queried_date = datetime.strptime(queried_date, '%Y.%m.%d')
    present = False
    for d in changes:
        if queried_date < str2date(d):
            break
        present = changes_dates[d]
    return present

In [33]:
d1 = "2018.05.01"
d2 = "2018.05.02"
def calculate_new_edges2(d1=d1, d2=d2):
    nw = deepcopy(rc)
    for_col = nw.formation.apply(lambda dates: present_in_date(dates,d2) and not present_in_date(dates,d1))
    return rc[for_col == True]
    

In [34]:
s = time.time()
c1 = calculate_new_edges2()
print("took: ",time.time()-s)

took:  0.3612182140350342


In [14]:
c1.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation
5524,827075093,233548506,{'2018.05.02': True}
5365,799681692,767669089,{'2018.05.02': True}
10864,867510112,574630629,{'2018.05.02': True}
4934,265839474,286693555,{'2018.05.02': True}
9653,166347556,742686779,{'2018.05.02': True}


In [15]:
def get_connections_by_date(cons, date, present=True):
    nw = deepcopy(cons)
    for_col = nw.formation.apply(lambda dates: present_in_date(dates,date))
    return cons[for_col == present]

In [35]:
def calculate_new_edges(d1=d1, d2=d2):
    return get_connections_by_date(get_connections_by_date(rc, d2), d1, False)

In [36]:
s = time.time()
new_edges = calculate_new_edges()
print("took: ",time.time()-s)

took:  0.3839700222015381


In [37]:
twitter_users = pd.read_csv("../REST/static/twitter_users.csv", index_col="id")
twitter_users.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org,community
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
513696454,muharrem kazak,muharremkazak,95,513696500.0,Av. Muharrem Kazak,tr,34.0,52.0,False,2
230850831,mehmet kara,darulselam,100,230850800.0,Mehmet Kara,tr,93.0,336.0,False,25
360065011,hassan khiara,HKhyara,92,3600650000.0,Hassan khyara,tr,38.0,46.0,False,19
169021655,pinar batova,batovapinar,92,169021700.0,Pınar Batova,tr,32.0,593.0,False,33
258833444,hanife ergun,ergun_hanife,96,2588334000.0,Hanife Ergün,tr,38.0,67.0,False,27


### Categorizing edges (user-user vs user-foci)

In [38]:
orgs = set(twitter_users[twitter_users.is_org==True].index)

In [39]:
is_focal = lambda from_,to, orgs: (0,0) if (from_ in orgs and to in orgs)\
                                        or not(from_ in orgs or to in orgs)\
                                            else ((to,from_) if to in orgs else (from_,to))

In [40]:
new_edges["foci"] = new_edges.apply(lambda row:is_focal(row['from_user_id'], row['to_user_id'], orgs), axis=1)
new_edges["nonfoci"] = new_edges.foci.apply(lambda f: f[1])
new_edges["foci"] = new_edges.foci.apply(lambda f: f[0])
new_edges[new_edges.foci!=0]

Unnamed: 0,from_user_id,to_user_id,formation,foci,nonfoci
2777,154837447,819269230,{'2018.05.02': True},819269230,154837447
5148,888378156,430605010,{'2018.05.02': True},430605010,888378156
8967,746201372,271718939,{'2018.05.02': True},271718939,746201372
9877,900469680,283216865,{'2018.05.02': True},283216865,900469680
10734,111082356,386276047,{'2018.05.02': True},111082356,386276047
12256,306164644,754963839,{'2018.05.02': True},306164644,754963839
13585,306164644,550121465,{'2018.05.02': True},306164644,550121465
14824,594766612,847741204,{'2018.05.02': True},847741204,594766612


# Calculating membership closures
### if a user u1 just got a new connection with foci f1 at time t2, get friends of u1 who had connections with f1 at time t1

In [41]:
t1 = get_connections_by_date(rc, d1)
t1["foci"] = t1.apply(lambda row:is_focal(row['from_user_id'], row['to_user_id'], orgs), axis=1)
t1["nonfoci"] = t1.foci.apply(lambda f: f[1])
t1["foci"] = t1.foci.apply(lambda f: f[0])
t1[t1.foci!=0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,from_user_id,to_user_id,formation,foci,nonfoci
697,581066517,347250524,{'2018.05.01': True},581066517,347250524
4193,856911339,74959905,{'2018.05.01': True},856911339,74959905
6091,118400394,271718939,{'2018.05.01': True},271718939,118400394
9466,284399096,131150936,{'2018.05.01': True},131150936,284399096
10415,152042796,819269230,{'2018.05.01': True},819269230,152042796
12308,441584584,803581588,{'2018.05.01': True},803581588,441584584
12366,218228766,111082356,{'2018.05.01': True},111082356,218228766
12708,306672777,841682908,{'2018.05.01': True},841682908,306672777
13396,856911339,946886773,{'2018.05.01': True},856911339,946886773
13581,847741204,110320190,{'2018.05.01': True},847741204,110320190


In [42]:
# foci'es present at t1 and t2
mutual_foci = c2[c2.foci!=0].merge(t1[t1.foci!=0], on="foci")
mutual_foci

Unnamed: 0,from_user_id_x,to_user_id_x,formation_x,foci,nonfoci_x,from_user_id_y,to_user_id_y,formation_y,nonfoci_y
0,154837447,819269230,{'2018.05.02': True},819269230,154837447,152042796,819269230,{'2018.05.01': True},152042796
1,746201372,271718939,{'2018.05.02': True},271718939,746201372,118400394,271718939,{'2018.05.01': True},118400394
2,111082356,386276047,{'2018.05.02': True},111082356,386276047,218228766,111082356,{'2018.05.01': True},218228766
3,594766612,847741204,{'2018.05.02': True},847741204,594766612,847741204,110320190,{'2018.05.01': True},110320190


## check for membership closure

In [43]:
def membership_closure(cons, u1, u2):
    truncate = lambda x: int(str(int(x))[:9])
    u1_friends = set(
        cons.apply(lambda row: truncate(row['to_user_id'])if truncate(u1) == truncate(row['from_user_id'])\
                 else (truncate(row['from_user_id']) if truncate(u1) == truncate(row['to_user_id'])\
                   else np.NaN), axis=1).dropna())
    return u2 in u1_friends

In [44]:
mutual_foci['membership_closure'] = mutual_foci.apply(lambda row: 
                                                      membership_closure(t1,
                                                                         row['nonfoci_x'],
                                                                         row['nonfoci_y']),axis=1)
mutual_foci

Unnamed: 0,from_user_id_x,to_user_id_x,formation_x,foci,nonfoci_x,from_user_id_y,to_user_id_y,formation_y,nonfoci_y,membership_closure
0,154837447,819269230,{'2018.05.02': True},819269230,154837447,152042796,819269230,{'2018.05.01': True},152042796,False
1,746201372,271718939,{'2018.05.02': True},271718939,746201372,118400394,271718939,{'2018.05.01': True},118400394,False
2,111082356,386276047,{'2018.05.02': True},111082356,386276047,218228766,111082356,{'2018.05.01': True},218228766,False
3,594766612,847741204,{'2018.05.02': True},847741204,594766612,847741204,110320190,{'2018.05.01': True},110320190,False


# Calculating focal closures
### if a user u1 just got a new connection with another user u2 at time t2, and u1 and u2 have been connected to foci f1 at time t1

In [145]:
# Get all new user-user connections at t2
new_u2u_edges = new_edges[new_edges.foci==0]
new_u2u_edges.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation,foci,nonfoci
10616,843246206,276471646,{'2018.05.02': True},0,0
11011,291611186,170786149,{'2018.05.02': True},0,0
5559,156083842,226322675,{'2018.05.02': True},0,0
14588,722364482,173968248,"{'2018.05.02': True, '2018.05.05': False}",0,0
9916,941947974,195631629,{'2018.05.02': True},0,0


In [72]:
foci_t1 = t1[t1.foci!=0]
foci_t1

Unnamed: 0,from_user_id,to_user_id,formation,foci,nonfoci
697,581066517,347250524,{'2018.05.01': True},581066517,347250524
4193,856911339,74959905,{'2018.05.01': True},856911339,74959905
6091,118400394,271718939,{'2018.05.01': True},271718939,118400394
9466,284399096,131150936,{'2018.05.01': True},131150936,284399096
10415,152042796,819269230,{'2018.05.01': True},819269230,152042796
12308,441584584,803581588,{'2018.05.01': True},803581588,441584584
12366,218228766,111082356,{'2018.05.01': True},111082356,218228766
12708,306672777,841682908,{'2018.05.01': True},841682908,306672777
13396,856911339,946886773,{'2018.05.01': True},856911339,946886773
13581,847741204,110320190,{'2018.05.01': True},847741204,110320190


In [124]:
users_with_mut_foci = foci_t1.groupby("foci")['nonfoci'].apply(lambda x: set(x)).reset_index()
users_with_mut_foci = users_with_mut_foci[users_with_mut_foci.nonfoci.apply(lambda x: len(x)>1)]
users_with_mut_foci

Unnamed: 0,foci,nonfoci
8,856911339,"{74959905, 946886773}"


In [148]:
def is_fclosure(f, t, users_with_mut_foci):
    for _, focal in users_with_mut_foci.iterrows():
        if f in focal["nonfoci"] and t in focal["nonfoci"]:
            return True
    return False

In [151]:
new_u2u_edges.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation,foci,nonfoci
8858,334697739,9434942,{'2018.05.02': True},0,0
13155,314344118,277394939,{'2018.05.02': True},0,0
1039,235114569,631426132,{'2018.05.02': True},0,0
3148,323999505,342358307,{'2018.05.02': True},0,0
15874,198172639,276471646,{'2018.05.02': True},0,0


In [160]:
new_u2u_edges["is_focal_closure"] = new_u2u_edges.apply(
    lambda row: is_fclosure(
        row["from_user_id"], row["to_user_id"], users_with_mut_foci), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [162]:
new_u2u_edges.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation,foci,nonfoci,is_focal_closure
7667,378536512,480728930,{'2018.05.02': True},0,0,False
8889,156506029,755378424,{'2018.05.02': True},0,0,False
15218,488335702,329880363,{'2018.05.02': True},0,0,False
3532,174575628,2425151,{'2018.05.02': True},0,0,False
11659,371206338,859103325,{'2018.05.02': True},0,0,False
