In [2]:
import pandas as pd
import re
import numpy as np

def create_graph_df(batches):
    return pd.concat(batches,
        ignore_index=True)


def fs_to_dataframe(fs):
#     return pd.DataFrame({'User': fs})
    return pd.DataFrame({'User': re.split('\W+', fs)})


In [4]:
# load dataset

gh_users_allFollowers = pd.read_csv('temp_users_all-followers.csv')
gh_users_allFollowers.head()

Unnamed: 0,User,All_Followers,nF
0,tarruda,"Sannis,danielmahon,csjaba,FergusRedican,Victor...",570
1,mairatma,"brunocoelho,henvic,eduardolundgren,aperrelli,a...",363
2,joselitojunior1,"renatooliveira,jeffesonmaia,jotaefe,duartefq,J...",350
3,marcelcaraciolo,"thiagoarrais,brunojm,henriquebastos,macndesign...",330
4,luanfonceca,"brunohenrique,luizvarela,gladson,lucasbibiano,...",301


In [8]:
# normalizing
gh_users_allFollowers['All_Followers'] = gh_users_allFollowers['All_Followers'].fillna('')
gh_users_allFollowers['All_Followers'] = gh_users_allFollowers['All_Followers'].apply(fs_to_dataframe)

gh_users_allFollowers.head()

Unnamed: 0,User,All_Followers,nF
0,tarruda,User 0 S...,570
1,mairatma,User 0 brunocoelho ...,363
2,joselitojunior1,User 0 renatooliveira 1 ...,350
3,marcelcaraciolo,User 0 thiagoarrais 1 ...,330
4,luanfonceca,User 0 brunohenrique 1 ...,301


In [11]:
# Dataframe apenas com os seguidores recifenses

gh_users_Followers = gh_users_allFollowers.copy()

gh_users_Followers['Followers'] = gh_users_Followers['All_Followers'].apply(pd.DataFrame({'User':gh_users_Followers['User']}).merge)
gh_users_Followers['nFs'] = gh_users_Followers['Followers'].apply(len)

gh_users_Followers.drop(['All_Followers', 'nF'], axis=1, inplace=True)

gh_users_Followers.head()

Unnamed: 0,User,Followers,nFs
0,tarruda,User 0 henriquemenezes 1 paul...,5
1,mairatma,User 0 simoneas02 1 ...,19
2,joselitojunior1,User 0 luanfonceca 1 rena...,23
3,marcelcaraciolo,User 0 luanfonceca 1 ...,27
4,luanfonceca,User 0 deividazevedo2 1 ...,9


In [27]:
# numero de usuarios recifenses 
gh_users_Followers['User'].count()

1990

In [28]:
# numero de usuarios recifenses com pelo menos 1 seguidor (numero de nodes)
users_0_fs = [str(n) for n in gh_users_Followers.User[gh_users_Followers['nFs'] != 0]]
print(len(users_0_fs))
# print(users_0_fs[:50])

691


In [31]:
# total de seguidores por usuario (numero de edges)
gh_users_Followers.nFs.sum()

2318

In [13]:
# distribuicao do Grau
# numero de usuarios com 0 seguidores: value_counts==0 
gh_users_Followers.nFs.value_counts()

0     1299
1      309
2      143
3       67
4       41
5       26
7       20
6       19
8       13
9       13
11       6
14       4
12       3
10       3
15       3
23       3
20       2
13       2
18       2
24       2
25       2
29       1
37       1
35       1
16       1
27       1
19       1
40       1
48       1
Name: nFs, dtype: int64

In [29]:
# distribuicao do Grau - usuarios por ocorrencias da quantidade de seguidores 
groupby_nfs = gh_users_Followers.groupby('nFs')
groupby_nfs['User'].apply( ','.join)

nFs
0     caiorss,andrewesteves,Suburbanno,tarcisio-mari...
1     deividazevedo2,lmmenge,diegocarloslima,thiagoa...
2     zimmerle,andresmachado,kessiacastro,mauricioma...
3     adrielcafe,ktquez,pereiragislene,Juniorlimaivd...
4     ac-pm,matheusmariano,caiobsouza,herbertt,Abran...
5     tarruda,chocoelho,jonathanslima,douglaslira,ma...
6     dmesquita,eduardocruz,vanessa,paulolieuthier,i...
7     pcstl,henriquemenezes,rasoliveira,deyvisonroch...
8     paulorec,horaciojcfilho,dvro,tomersimis,iagobe...
9     luanfonceca,gallindo,ovictoraurelio,embs,victo...
10                roselmamendes,miguelarauj1o,Cisneiros
11    brunobasto,karlafalcao,irgmedeiros,lmarinho,le...
12           jordanamorais,marcellustavares,brunnogomes
13                            brunofarache,alexpessoajr
14         alsmoreira,victorlaerte,diegonvs,thiagodiniz
15                     dakerfp,interaminense,vinicius3w
16                                           pauloborba
18                                        lu

In [43]:
nodes = gh_users_Followers.loc[:,['User','nFs']]
nodes.columns = ['Id', 'nFollowers_recifenses']

nodes.head()

Unnamed: 0,Id,nFollowers_recifenses
0,tarruda,5
1,mairatma,19
2,joselitojunior1,23
3,marcelcaraciolo,27
4,luanfonceca,9


In [45]:
nodes.to_csv('nodes.csv', index=False)

In [37]:
pairs = zip(gh_users_Followers.User, gh_users_Followers.Followers)

edges = [pd.DataFrame([{'Source':str(f), 'Target':str(u)} for f in fs['User']]) for (u,fs) in pairs]
edges

[            Source   Target
 0  henriquemenezes  tarruda
 1   paulolieuthier  tarruda
 2        meiralins  tarruda
 3           dapine  tarruda
 4          spider8  tarruda,              Source    Target
 0        simoneas02  mairatma
 1            henvic  mairatma
 2   henriquemenezes  mairatma
 3       karlafalcao  mairatma
 4          diegonvs  mairatma
 5     andrewesteves  mairatma
 6          iagobelo  mairatma
 7        patrickrbc  mairatma
 8      alexpessoajr  mairatma
 9    pereiragislene  mairatma
 10        meiralins  mairatma
 11     pedroqueiroz  mairatma
 12    albertmourato  mairatma
 13          tcostam  mairatma
 14         filipewl  mairatma
 15        aperrelli  mairatma
 16       grodrigues  mairatma
 17              fmm  mairatma
 18         brunover  mairatma,             Source           Target
 0      luanfonceca  joselitojunior1
 1   renatooliveira  joselitojunior1
 2   talitaoliveira  joselitojunior1
 3            pcstl  joselitojunior1
 4      karlafalcao  

In [39]:
def create_edges_df(batches):
    return pd.concat(batches,
        ignore_index=True)

edges_df = create_edges_df(edges)
edges_df[:10]

Unnamed: 0,Source,Target
0,henriquemenezes,tarruda
1,paulolieuthier,tarruda
2,meiralins,tarruda
3,dapine,tarruda
4,spider8,tarruda
5,simoneas02,mairatma
6,henvic,mairatma
7,henriquemenezes,mairatma
8,karlafalcao,mairatma
9,diegonvs,mairatma


In [47]:
edges_df.to_csv('edges.csv', index=False)