In [1]:
# Import load data
import sys
r_path_data = "../new_codebase/src/utils/load_data/"
sys.path.append(r_path_data)
from load_dataframes import *

# K-means
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
#from sklearn import metrics
from sklearn.preprocessing import StandardScaler


In [2]:
# Run parameters: loader user name, season and country.
# NOTE(review): none of the later cells visible here reads these variables -
# the driver call passes literal values instead; confirm which should win.
username = "ywang99587"
season = "winter"
country = "hungary"

In [4]:
# Candidate clustering inputs, grouped by theme. The group names match the
# boolean keyword arguments of choose_features, and the groups are listed in
# the same order as those arguments.
# NOTE(review): the joined output frame also shows a 'grosseto' column that is
# not part of the "location" group - confirm whether that omission is intended.
features = {
    "hrs": [
        "hrs_in_tusc",
        "hrs_outside_tuscany",
    ],
    "numlocs": [
        "num_loc_in_tusc",
        "num_unique_loc_in_tusc",
        "num_loc_in_italy",
        "num_unique_loc_in_italy",
    ],
    "location": [
        "forest", "water", "river", "park",
        "arezzo", "florence", "livorno", "lucca", "pisa", "pistoia", "siena",
        "coast", "num_attrs",
    ],
    "latlon": [
        "avg_lat", "avg_lon",
        "top_lat", "top_lon",
        "start_lat_tusc", "start_lon_tusc",
        "start_lat", "start_lon",
        "end_lat", "end_lon",
        "std_lat", "std_lon",
    ],
}

In [5]:
def choose_features(features, hrs=True, numlocs=True, location=True, latlon=True):
    """
    Collect the column names of every enabled feature group.

    features: a dictionary with keys equal to the arguments of the function
        ("hrs", "numlocs", "location", "latlon"), values list of df variables
    hrs, numlocs, location, latlon: switch the group of the same name on or off

    Returns a flat list of column names, ordered hrs, numlocs, location, latlon.
    A group that is enabled but absent from `features` contributes nothing.
    """
    # Look each group up by name. Pairing the flags with features.keys() by
    # position silently picks the wrong group when the dict is built in a
    # different order than the argument list.
    requested = (
        ("hrs", hrs),
        ("numlocs", numlocs),
        ("location", location),
        ("latlon", latlon),
    )
    final_features = []
    for group, enabled in requested:
        if enabled and group in features:
            final_features.extend(features[group])
    return final_features

In [6]:
def get_exluded_varaibles(all_features, excluded_features):
    """
    Return the entries of `all_features` that do not occur in `excluded_features`.

    The order (and any duplicates) of `all_features` is preserved.

    all_features: iterable of hashable items (column names in this notebook)
    excluded_features: iterable of hashable items to leave out
    """
    # A set makes each membership test constant-time instead of scanning the
    # whole exclusion list for every feature.
    to_skip = set(excluded_features)
    return [x for x in all_features if x not in to_skip]

In [7]:
def standardize_features(df_feature_all, features, hrs, numlocs, location, latlon):
    """
    Select the requested feature groups, standardize them and drop static rows.

    df_feature_all: DataFrame holding (at least) every column of the enabled groups
    features: dictionary mapping group name -> list of column names
    hrs, numlocs, location, latlon: switch the group of the same name on or off

    Every selected column whose name does not start with 'std' is passed through
    StandardScaler; the 'std*' columns are carried over unscaled. When both
    'std_lat' and 'std_lon' are selected, rows where either is not > 0 are removed.

    Returns a new DataFrame indexed like the surviving rows of df_feature_all.
    """
    # choose features - forward the caller's flags instead of hard-coding True,
    # otherwise the hrs/numlocs/location/latlon arguments have no effect
    final_features = choose_features(features, hrs=hrs, numlocs=numlocs,
                                     location=location, latlon=latlon)
    df_rel_features = df_feature_all[final_features]
    features_without_std = [f for f in final_features if f[:3] != 'std']  # excluding standard deviation for scaling

    # scale variables
    df_to_scale = df_rel_features[features_without_std]
    scaler = StandardScaler()
    scaled_feature_all = pd.DataFrame(scaler.fit_transform(df_to_scale),
                                      columns=df_to_scale.columns,
                                      index=df_to_scale.index)

    # add non-scaled variables back (skipped when no 'std*' column was selected)
    excluded_vars = get_exluded_varaibles(final_features, features_without_std)
    if excluded_vars:
        scaled_feature_all[excluded_vars] = df_rel_features[excluded_vars]

    # excluding those who do not move; the filter needs both std columns, which
    # are only present when the group containing them is enabled
    # TODO: DISCUSS IT
    if 'std_lat' in scaled_feature_all.columns and 'std_lon' in scaled_feature_all.columns:
        df_scaled = scaled_feature_all.query('std_lat > 0 & std_lon > 0')
    else:
        df_scaled = scaled_feature_all
    return df_scaled

In [8]:
def kmeans_model(df_kmeans, nc=5, write=False, path="", outfile="", random_state=None):
    """
    Fit k-means on a feature frame and return the cluster label of every row.

    Parameters:
    df_kmeans: DataFrame of numeric features (e.g. returned from
        standardize_features); it is left unmodified
    nc: number of clusters
    write: when true, also write the label column to CSV
    path, outfile: concatenated as-is (path + outfile) to form the CSV file name,
        so `path` must carry its own trailing separator
    random_state: forwarded to KMeans; pass an int for reproducible labels

    Returns:
    DataFrame with a single 'label' column, indexed like df_kmeans
    """
    # NOTE(review): the n_jobs argument was removed from KMeans in
    # scikit-learn 1.0 - drop it when running on a newer scikit-learn.
    kmeans = KMeans(n_clusters=nc, n_jobs=-1, random_state=random_state)
    kmeans.fit(df_kmeans)

    # Build the label frame on its own rather than adding a column to the
    # caller's frame: a 'label' column left in df_kmeans would be treated as a
    # feature the next time the same frame is clustered.
    df_kmeans_labeled = pd.DataFrame({'label': kmeans.labels_}, index=df_kmeans.index)
    if write:
        df_kmeans_labeled.to_csv(path+outfile)
    return df_kmeans_labeled

In [9]:
def calculate_cluster_size(kmeans_res):
    """
    Summarize how many rows fall into each cluster.

    kmeans_res: DataFrame with a 'label' column (one cluster id per row)

    Returns a DataFrame indexed by cluster id, largest cluster first, with
    'label' = number of rows in the cluster and 'ratio' = its share of all
    rows in percent, rounded to two decimals.
    """
    counts = kmeans_res['label'].value_counts()
    share = np.round(counts / counts.sum() * 100, 2)
    # Name both columns explicitly: value_counts() labels its result 'label' on
    # older pandas and 'count' on pandas >= 2.0, so relying on that name breaks
    # the summary on newer versions. The index name is cleared for the same reason.
    return pd.DataFrame(
        {'label': counts.values, 'ratio': share.values},
        index=counts.index.rename(None),
    )

In [10]:
def get_cluster_results(username, season, country, features, nc=5):
    """
    Load the feature table, cluster it with k-means and attach the labels.

    username, season: forwarded to the data loader
    country: a country name, or 'all' to load every country at once
    features: dictionary mapping group name -> list of column names
    nc: number of k-means clusters

    Prints the size of each cluster, then returns the loaded feature frame
    (NaN replaced by 0, indexed by 'customer_nr') joined with a 'label' column.
    Rows removed before clustering have no label after the join.
    """
    if country == 'all':
        df_feature_all = get_k_means_data_for_all_countries(username, season)
    else:
        df_feature_all = get_k_means_data(username, season, country)
    df_feature_all = df_feature_all.replace(np.nan, 0).set_index('customer_nr')

    # All four feature groups are used for the clustering input.
    df_kmeans = standardize_features(df_feature_all, features,
                                     hrs=True, numlocs=True, location=True, latlon=True)
    kmeans_res = kmeans_model(df_kmeans, nc=nc, write=False, path="", outfile="")
    print(calculate_cluster_size(kmeans_res))
    return df_feature_all.join(kmeans_res)

In [11]:
# Cluster the 'pre-summer' data of all countries and preview the first labelled rows.
# NOTE(review): the arguments are literals; the username/season/country variables
# assigned in an earlier cell are not used here - confirm that is intended.
get_cluster_results('ovasarhelyi','pre-summer', 'all', features).head()

    label  ratio
3  324002  47.26
4  190890  27.85
1   83555  12.19
0   83199  12.14
2    3867   0.56


Unnamed: 0_level_0,customer_id,mcc,hr_arvl_tusc,day_of_wk_arvl_tusc,mon_arvl_tusc,day_arvl_tusc,loc_arvl_tusc,hr_arvl_italy,day_of_wk_arvl_italy,mon_arvl_italy,...,grosseto,livorno,lucca,pisa,pistoia,siena,coast,num_attrs,country,label
customer_nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,3FF98D9872C30F206C5ABBDF91C84412,250.0,0.0,1.0,5.0,1.0,3656832.0,0.0,1.0,5.0,...,0.0,0.0,0.0,1173.0,0.0,0.0,0.0,0.0,Russian Federation,4.0
2.0,39A283A10BDEA8AE240EC94E2F2ABAC9,250.0,14.0,1.0,5.0,1.0,3656189.0,14.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,240.0,0.0,0.0,Russian Federation,3.0
3.0,8AFEF96F5150D4986B57BAA1EE6CAF00,226.0,0.0,1.0,5.0,1.0,57785013.0,0.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Romania,3.0
4.0,335CF8A4B9E6A92F833BF7127BD38458,219.0,4.0,1.0,5.0,1.0,59769012.0,4.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Croatia,3.0
5.0,4CBA82162789D501F2A95DB0F092957C,238.0,4.0,1.0,5.0,1.0,56427031.0,4.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Denmark,3.0
