In [1]:
import pandas as pd
import numpy as np

from numpy.linalg import norm

import matplotlib.pyplot as plt
%matplotlib inline

Extract the different network (with different parametrizations) and use them to compute similarity between 1 single seed, a <b>fashion brand</b>, and the candidates, its <b>followers</b>.

In [15]:
def candidatesRanking(candidates, seed, features):
    seed_v = np.array(seed[features])
    
    output = pd.DataFrame()
    output['distance'] = candidates.apply(lambda x: norm(np.array(x[features]) - seed_v), axis=1)
    output['username'] = candidates['username']
    output['similarity'] = output.apply(lambda x: 1/(1+x['distance']), axis=1)
    output['link'] = output.apply(lambda x: 'www.instagram.com/'+x['username']+'/', axis=1)
    
    output.sort_values(by='similarity', ascending=False, inplace=True)
    
    return output[['username', 'similarity', 'link']]

In [18]:
seeds = ['emporiosirenuse', 'daftcollectionofficial'] #,'athenaprocopiou','?'

In [21]:
header = ['1','2','3','4']
label = {'m': 'Mention', 'h': 'Hashtag'}
for s in seeds:
    
    print s.upper()
    
    path = '1-seed/{}/'.format(s)
    validation = pd.DataFrame()
    for graphtype in ['m','h']:
        userdata = pd.read_csv(path+'{}_reduced_nodes.csv'.format(graphtype), sep='\t') # match graph id and user instance
        # candidates == followers
        if graphtype == 'm':
            followers = pd.read_csv('data-followers/{}/user.csv'.format(s), sep='\t', dtype=object)
        elif graphtype == 'h':
            followers = pd.read_csv('data-followers/{}/user.csv'.format(s), sep='\t')
            
        for detection in ['community', 'roles']:
            print 'Graph Type: {} - {}'.format(label[graphtype], detection)
            data = pd.read_csv(path+'user_features_{}_{}.emb'.format(detection, graphtype), sep=' ', header=None, skiprows=1)
            
            data = userdata.merge(data, left_on='id', right_on=0)
            data.drop(0, inplace=True, axis=1)
            try:
                data.drop('username', inplace=True, axis=1)
            except:
                print 'Not found'
            
            seed = data[data['usertype'] == 'seed']
            print 'SEED:\n{}'.format(seed)
            seed.columns = ['id_graph','id_user','usertype']+header

            data = data.merge(followers, left_on='content', right_on='id_user')
            data.drop(['content','biography','profile_pic_url'], inplace=True, axis=1)

            data.columns = ['id_graph','usertype']+header+['followers_count','following_count','id_user',
                                                           'isPrivate','num_posts','username']
    
            r = candidatesRanking(data, seed, header)
            print r[['username','similarity']][:5]
            r.to_csv(path+'rankings/ranking_{}_{}.csv'.format(detection, graphtype), index=None)
            
            tovalidate = r[:10]
            tovalidate['source'] = '{}{}'.format(graphtype, detection[0])
            validation = pd.concat([validation, tovalidate])
    validation['validation'] = 0
    validation.to_csv(path+'validation.csv', index=None)

EMPORIOSIRENUSE
Graph Type: Mention - community
SEED:
        id     content usertype        1         2         3        4
1096  1096  2252447111     seed -1.01642 -0.624757 -0.897409 -1.30354
            username  similarity
1488      speronella    0.951180
18    nicolonovarese    0.946387
9        sgetiquette    0.913633
1489  summerrobertss    0.885518
245    villasolandra    0.879517
Graph Type: Mention - roles
SEED:
        id     content usertype         1         2         3         4
1096  1096  2252447111     seed  0.012651 -0.070971  0.027521 -0.148541


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


           username  similarity
187    elizabethvdg    0.950835
1179   fraaesposito    0.949422
379     nurus.sabah    0.948498
499   rachel_felson    0.945701
1310    bepivalerio    0.943341
Graph Type: Hashtag - community
Not found
SEED:
      id     content usertype        1       2         3       4
127  127  2252447111     seed  1.24798 -0.8559 -0.343738  2.4909
                username  similarity
516       viadeirufolo22    0.937065
1256      mariapatrikios    0.908165
778               ddferg    0.904245
1241  lessisamorefirenze    0.902548
26         serginopiazza    0.901756
Graph Type: Hashtag - roles
Not found
SEED:
      id     content usertype         1       2        3        4
127  127  2252447111     seed -0.021631  1.2016  1.28453 -2.50124
                 username  similarity
879  caitlinflynnramsdale    0.832278
178              kikuluci    0.816768
106              gustiini    0.805615
456            holycaftan    0.797110
351             raraborse    0.796618
DAFT