In [35]:
import os, json, gc
import pandas as pd
import numpy as np
from scipy.spatial import distance
from scipy.stats import pearsonr
from urllib2 import Request, urlopen
from pandas.io.json import json_normalize



BASE_URI = "http://0.0.0.0:3000/"
SOCIAL_INTERESTS_URI = BASE_URI + "api/v4/re/interests/social"
BUSINESS_INTERESTS_URI = BASE_URI + "api/v4/re/interests/business"
LIFESTYLE_INTERESTS_URI = BASE_URI + "api/v4/re/interests/lifestyle"
EVENT_TYPES_URI = BASE_URI + "api/v4/re/events/types"
CONNECTIONS_URI = BASE_URI + "api/v4/re/connections"
ACCOUNTS_URI = BASE_URI + "api/v4/re/accounts"

def _request_data(uri):
    print ("Sending request to:", uri)
    request=Request(uri)
    data = json.loads(urlopen(request).read())
    df = pd.DataFrame(data)
    print ("Data shape:", df.shape)
    return df

def _process_interest_similarity(uri):
#     df_accounts = get_accounts()
    df_profile = _request_data(uri)
    structured_df = _manipulate_profile_matrix(df_profile)
    del df_profile
    gc.collect()
    
    df_profile_t = structured_df[structured_df.columns[2:]]
    df_interest_sim = _calculate_similarity(df_profile_t)
#     print df_interest_sim
    for index, profile in structured_df[['account_id', 'location']].iterrows():
        if index == 1:
            
            sim_for_account = df_interest_sim[index].tolist()
            sim_list = pd.Series(sim_for_account)
            df = structured_df[['account_id', 'location']].copy()
            df['interest_similarity'] = 1 - sim_list.values
            df['interest_count'] = (1 - sim_list.values) * len(df_profile_t.columns)
            df['interest_count'] = df['interest_count'].astype(int)
            
            df_profile_f = df[(df.location == '') | (df.location == 'empty') | (df.location.str.contains(profile.location))]
            df_profile_r = df_profile_f.sort_values(by='interest_similarity', ascending=0)[1:20]
#         update_interest_similarity_for_account(profile.account_id, df, type)
            


def _manipulate_profile_matrix(df):
    df_profile = df.copy()
    df_profile[['account_id']] = df_profile[['account_id']].astype(int)
    df_profile['location'].fillna('empty', inplace=True)
    df_profile_t = pd.pivot_table(df_profile, index=['account_id', 'location'], columns=['social'], values='indicator')
    del df_profile
    gc.collect()

    df_profile_t = df_profile_t.reset_index()
    df_profile_t = df_profile_t.fillna(value=0)
    df_profile_f = df_profile_t.dropna()
    del df_profile_t
    gc.collect()

    df_profile_s = df_profile_f.sort_values(by='account_id', ascending=True)
    del df_profile_f
    gc.collect()

    df_profile_s.reset_index(drop=True, inplace=True)
    return df_profile_s





def _calculate_similarity(df):
    profile_d = distance.pdist(df, metric='matching')
    profile_D = distance.squareform(profile_d)

    del profile_d
    gc.collect()
    
    return profile_D

In [36]:
_process_interest_similarity(SOCIAL_INTERESTS_URI)

('Sending request to:', 'http://0.0.0.0:3000/api/v4/re/interests/social')
('Data shape:', (170758, 4))
social  account_id      location  interest_similarity  interest_count
1               11  New York, NY             1.000000              53
1759         13350  New York, NY             0.962264              51
3372         17891                           0.962264              51
1978         13784                           0.962264              51
1940         13727                           0.943396              50
77             118  New York, NY             0.943396              50
63             101  New York, NY             0.943396              50
55              86  New York, NY             0.943396              50
1861         13566                           0.943396              50
5388         25428  New York, NY             0.943396              50
3427         18001  New York, NY             0.943396              50
947           7104                           0.943396    