In [29]:
import pandas as pd
import numpy as np

# filtered_twitter_users.csv
#### https://github.com/AmmarRashed/EventOrient/blob/master/datasets/filtered_twitter_users.csv
### 2849 Twitter users crawled from a list of sehir accounts using snowball sampling
#### Id = Twitter ID
#### followers_count = followers count on Twitter
#### friends_count = friends count on Twitter
#### is_org = whether the account is a Şehir institutional account (True) or an individual user (False)
#### lang = the most used language by that twitter account
#### match_name = the name most similar to the Twitter name among the Gmail contacts of the sehir.edu.tr and std.sehir.edu.tr domains
#### match_ratio = the similarity score between the Twitter name and the most similar name from the Şehir-domain Gmail contacts, calculated using fuzzywuzzy WRatio (match_ratio > 90; id 291122559 was added manually)
#### name = Twitter name
#### screen_name = twitter screen name
#### truncated_id = first 9 digits of the Twitter Id, to fit in libraries like SnapPy
#### community = the community that the account belongs to, as calculated by the Girvan–Newman algorithm

In [55]:
# Load the crawled Twitter accounts; the CSV's "id" column becomes the row index.
# NOTE(review): the saved output of this cell showed the index as floats
# (e.g. 8.41817e+17). float64 cannot represent 18-digit Twitter ids exactly, so
# confirm how ids are stored in the CSV and consider an explicit dtype for "id"
# before joining against the connections table.
twitter_users = pd.read_csv("../datasets/filtered_twitter_users.csv", index_col="id")
# Fixed seed so the five-row preview is the same on every Restart & Run All.
twitter_users.sample(5, random_state=0)

Unnamed: 0_level_0,followers_count,friends_count,is_org,lang,match_name,match_ratio,name,screen_name,truncated_id,community
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
30725100.0,170.0,319.0,False,en,kaveh nematipour,100.0,Kaveh Nematipour,knematipour,30725104,1
178450000.0,174.0,362.0,False,tr,seyma inal,95.0,Seyma İnal,seyma_i,178450017,1
8.41817e+17,6.0,123.0,False,tr,salih salih,95.0,Salih Çiriş,salih_ciris,841817036,1
99095400.0,202921.0,45.0,False,tr,murat demir,96.0,Murat Özdemir,Murat_Ozdemir_,99095397,1
967897600.0,436.0,420.0,False,tr,ahmet kaya,100.0,Ahmet Kaya,HocazadeAhmed,967897650,3


In [54]:
# Row count of the loaded Twitter accounts table.
twitter_users.shape[0]

2849

# filtered_twitter_connections.csv
#### https://github.com/AmmarRashed/EventOrient/blob/master/datasets/filtered_twitter_connections.csv
### 9031 Twitter following connections crawled at two different dates
### from_user_id, to_user_id: the twitter id of the follower and the followed twitter account respectively in this following connection
### formation: the date of the changes in the connection state (e.g. {'2018.05.08':True, '2018.05.18':False} means this connection was _created_ on  2018.05.08, and _removed_ on 2018.05.18)

In [52]:
# Load the following connections; the CSV's "id" column becomes the row index.
grouped_cons = pd.read_csv(
    filepath_or_buffer="../datasets/filtered_twitter_connections.csv",
    index_col="id",
)
# Preview the last five connections.
grouped_cons.iloc[-5:]

Unnamed: 0_level_0,from_user_id,to_user_id,formation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9026,995255575398240256,737056442,{'2018.05.24': True}
9027,995255575398240256,1222491402,{'2018.05.24': True}
9028,995255575398240256,1666891914,{'2018.05.24': True}
9029,995255575398240256,3064906390,{'2018.05.24': True}
9030,997958654354755584,1536995378,{'2018.05.24': True}


# contacts.csv
#### https://github.com/AmmarRashed/EventOrient/blob/master/datasets/contacts.csv
## 10557 Gmail contacts from the std.sehir.edu.tr and sehir.edu.tr domains

In [51]:
# Load the Gmail contact directory from the shared datasets folder (the same
# location the Twitter CSVs are read from) so the cell does not depend on a
# local copy sitting in the working directory.
sehir_directory = pd.read_csv("../datasets/contacts.csv")
# Preview the last five contacts.
sehir_directory.tail(5)

Unnamed: 0,id,First Name,Last Name,Primary Email
10552,10552,Sukru,Olkun,sukruolkun@std.sehir.edu.tr
10553,10553,Zeynep Begum,Tanis,zeyneptanis@std.sehir.edu.tr
10554,10554,Anastasiya,Osipova,anastasiyaosipova@std.sehir.edu.tr
10555,10555,Omer,Koca,omerkoca@std.sehir.edu.tr
10556,10556,Muhammed Murat,Yilmaz,muratyilmaz@std.sehir.edu.tr


In [37]:
# Row count of the loaded contact directory.
sehir_directory.shape[0]

10557

# fb_users.csv
#### https://github.com/AmmarRashed/EventOrient/blob/master/datasets/fb_users.csv
### 2754 facebook accounts from Şehir-related groups
#### Sehir Dersler&Hocalar	2346
#### Sehir Duyuru	309
#### Sehir Lessons&Teachers 39
#### Sehir Mezunlari	35
#### Sehir Muhendislik 25
## .
#### id = Facebook account id
#### membership = group
#### full_name = concatenation of first_name, middle_name, last_name

In [57]:
# Load only the identity and group-membership columns of the Facebook accounts.
# NOTE(review): the notes above link to datasets/fb_users.csv, but this cell reads
# fb_users2.csv from the working directory — confirm which file is intended.
fb_users = pd.read_csv(
    "fb_users2.csv",
    usecols=["id", "membership", "first_name", "middle_name", "last_name", "full_name"],
)
# Preview the last five accounts.
fb_users.iloc[-5:]

Unnamed: 0,id,membership,first_name,middle_name,last_name,full_name
2749,1734068939990594,Sehir Muhendislik,zeynep,kezban,turgut,zeynep kezban turgut
2750,910830722417581,Sehir Muhendislik,reha,,karltekin,reha karltekin
2751,10155959451848548,Sehir Muhendislik,aysenur,,eser,aysenur eser
2752,769153509959215,Sehir Muhendislik,hakan,,gurler,hakan gurler
2753,1797189933648857,Sehir Muhendislik,esra,,uslupat,esra uslupat


In [58]:
# Non-null values per column within each Facebook group.
(
    fb_users
    .groupby(by="membership")
    .count()
)

Unnamed: 0_level_0,id,first_name,middle_name,last_name,full_name
membership,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Sehir Dersler&Hocalar,2346,2341,506,2341,2341
Sehir Duyuru,309,305,61,306,306
Sehir Lessons&Teachers,39,39,10,39,39
Sehir Mezunlari,35,35,7,34,35
Sehir Muhendislik,25,25,3,25,25
