In [1]:
import graphlab as gl
import numpy as np
import random as rnd
import math
from __future__ import division
import matplotlib.pyplot as plt
%matplotlib inline
gl.canvas.set_target('ipynb')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1483824882.log
INFO:graphlab.cython.cy_server:GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1483824882.log


This non-commercial license of GraphLab Create for academic use is assigned to joisimha@gmail.com and will expire on October 22, 2017.


In [231]:
# Column that names each city, and the numeric columns fed to every model below.
label = 'name_of_city'
simple_features = [
    'effective_literacy_rate_total',
    'sex_ratio',
    'total_graduates',
]

# Load the city-level table; column types are inferred from the CSV.
cities = gl.SFrame('India_500_Cities.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,int,str,int,int,int,int,int,int,int,int,int,int,int,int,float,float,float,str,int,int,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [232]:
# Keep only the city name plus the three model features.
kept_columns = 'name_of_city', 'effective_literacy_rate_total', 'sex_ratio', 'total_graduates'
cities1 = gl.SFrame(cities[kept_columns])

# Multiplying by 1. yields float columns (presumably these two load as
# integers -- confirm against the inferred column types).
for numeric_column in ('total_graduates', 'sex_ratio'):
    cities1[numeric_column] = cities1[numeric_column] * 1.

In [233]:
# Display the reduced table (city name plus the three model features).
cities1

name_of_city,effective_literacy_rate_t otal ...,sex_ratio,total_graduates
Abohar,79.86,890.0,16287.0
Achalpur,91.99,928.0,8863.0
Adilabad,80.51,982.0,10565.0
Adityapur,83.46,902.0,19225.0
Adoni,68.38,1013.0,11902.0
Agartala,93.88,1002.0,52711.0
Agra,63.44,853.0,185813.0
Ahmadabad,89.62,897.0,769858.0
Ahmadnagar,91.49,952.0,51661.0
Aizawl,98.8,1029.0,26832.0


# Simple model

In [276]:
# Two nearest-neighbour models over the raw (unscaled) features:
# one with the library's default distance, one with cosine distance.
simple_model = gl.nearest_neighbors.create(
    cities1,
    features=simple_features,
    label='name_of_city',
    verbose=False,
)
simple_model_cosine = gl.nearest_neighbors.create(
    cities1,
    features=simple_features,
    label='name_of_city',
    distance='cosine',
    verbose=False,
)

In [227]:
# Pull out the single-row frames used as query points below.
city_names = cities1['name_of_city']

chennai = cities1[city_names == 'Chennai']
hyd = cities1[city_names == 'Greater Hyderabad']
bang = cities1[city_names == 'Bengaluru']


## Chennai query

In [206]:
# Neighbours of Chennai: `simple_model` first, then the cosine-distance model.
for knn_model in (simple_model, simple_model_cosine):
    print(knn_model.query(chennai, verbose=False))

+-------------+-----------------+---------------+------+
| query_label | reference_label |    distance   | rank |
+-------------+-----------------+---------------+------+
|      0      |     Chennai     |      0.0      |  1   |
|      0      |     Kolkata     | 61219.0619021 |  2   |
|      0      |    Ahmadabad    |  109837.03606 |  3   |
|      0      |       Pune      |  223187.00377 |  4   |
|      0      |     Lucknow     | 282705.008971 |  5   |
+-------------+-----------------+---------------+------+
[5 rows x 4 columns]

+-------------+-------------------+-------------------+------+
| query_label |  reference_label  |      distance     | rank |
+-------------+-------------------+-------------------+------+
|      0      |      Chennai      |        0.0        |  1   |
|      0      |      Kolkata      | 2.59384069778e-10 |  2   |
|      0      |     Ahmadabad     | 1.07577446951e-09 |  3   |
|      0      | Greater Hyderabad | 4.82622495301e-08 |  4   |
|      0      |        P

## Bangalore query

In [207]:
# Neighbours of Bengaluru: `simple_model` first, then the cosine-distance model.
for knn_model in (simple_model, simple_model_cosine):
    print(knn_model.query(bang, verbose=False))

+-------------+-------------------+---------------+------+
| query_label |  reference_label  |    distance   | rank |
+-------------+-------------------+---------------+------+
|      0      |     Bengaluru     |      0.0      |  1   |
|      0      |   Greater Mumbai  | 211208.009101 |  2   |
|      0      | Greater Hyderabad | 427014.001177 |  3   |
|      0      |       Delhi       |  629974.00121 |  4   |
|      0      |      Chennai      | 711468.003644 |  5   |
+-------------+-------------------+---------------+------+
[5 rows x 4 columns]

+-------------+-------------------+-------------------+------+
| query_label |  reference_label  |      distance     | rank |
+-------------+-------------------+-------------------+------+
|      0      |     Bengaluru     |        0.0        |  1   |
|      0      |   Greater Mumbai  | 5.19197040916e-09 |  2   |
|      0      |       Delhi       | 1.64287702331e-08 |  3   |
|      0      | Greater Hyderabad | 2.82744007096e-08 |  4   |
|     

## Hyderabad query

In [208]:
# Neighbours of Greater Hyderabad: `simple_model` first, then the cosine-distance model.
for knn_model in (simple_model, simple_model_cosine):
    print(knn_model.query(hyd, verbose=False))

+-------------+-------------------+---------------+------+
| query_label |  reference_label  |    distance   | rank |
+-------------+-------------------+---------------+------+
|      0      | Greater Hyderabad |      0.0      |  1   |
|      0      |      Chennai      |  284454.00305 |  2   |
|      0      |      Kolkata      | 345673.003086 |  3   |
|      0      |     Ahmadabad     | 394291.002978 |  4   |
|      0      |     Bengaluru     | 427014.001177 |  5   |
+-------------+-------------------+---------------+------+
[5 rows x 4 columns]

+-------------+-------------------+--------------------+------+
| query_label |  reference_label  |      distance      | rank |
+-------------+-------------------+--------------------+------+
|      0      | Greater Hyderabad | -2.22044604925e-16 |  1   |
|      0      |     Bengaluru     | 2.82744007096e-08  |  2   |
|      0      |      Kolkata      | 4.16982666174e-08  |  3   |
|      0      |      Chennai      | 4.82622495301e-08  |  4   |

# Create some useful functions

# Feature scaling and query-row lookup helpers

In [270]:
def feature_scaling(data, features, scaling_type, label='name_of_city'):
    """Return a new SFrame with the label column and rescaled feature columns.

    Parameters
    ----------
    data : column-indexable table (an SFrame in this notebook)
        Must contain the ``label`` column and every column in ``features``.
    features : list of str
        Names of the numeric columns to rescale.
    scaling_type : int
        1 -- divide each column by its L2 (Euclidean) norm, so the column
             has unit length.
        2 -- standardize each column: subtract its mean, then divide by
             its standard deviation.
    label : str, optional
        Column copied through unchanged. Defaults to ``'name_of_city'``.

    Returns
    -------
    gl.SFrame
        Holds the ``label`` column plus one rescaled column per feature.

    Raises
    ------
    ValueError
        If ``scaling_type`` is neither 1 nor 2.
    """
    if scaling_type not in (1, 2):
        # Fail loudly instead of silently returning a frame with no features.
        raise ValueError(
            'scaling_type must be 1 (L2 norm) or 2 (mean/std), got %r' % (scaling_type,))

    scaled_data = {label: data[label]}

    for feature in features:
        column = data[feature]

        if scaling_type == 1:
            # The dot product of a column with itself is the *squared* L2
            # norm; take the square root to get the norm itself.
            scale = float(np.sqrt(np.dot(column, column)))
            numerator = column
        else:
            scale = float(column.std())
            numerator = column - column.mean()

        # A zero scale means an all-zero column (type 1) or a constant
        # column (type 2). The numerator is all zeros in both cases, so
        # divide by 1 rather than by 0.
        if scale == 0:
            scale = 1.0

        scaled_data[feature] = numerator * 1. / scale

    return gl.SFrame(scaled_data)

def get_query(query_name, data, label='name_of_city'):
    """Return the rows of ``data`` whose ``label`` column equals ``query_name``.

    Parameters
    ----------
    query_name : str
        Value to look for in the ``label`` column.
    data : table supporting boolean-mask row selection (an SFrame here)
        Must contain the ``label`` column.
    label : str, optional
        Column to match against. Defaults to ``'name_of_city'``.

    Returns
    -------
    Same type as ``data``
        All matching rows; empty when nothing matches.
    """
    matches = data[label] == query_name
    return data[matches]

In [272]:
# Scratch cell (disabled): preview raw vs. scaled rows and build per-city scaled query rows.
# print 'Raw data', cities1[0:2]
# norm_features = feature_scaling(cities1, simple_features, 1)
# print 'Normalized data', norm_features[0:2]
# std_features = feature_scaling(cities1, simple_features, 2)
# print 'Standardized data', std_features[0:2]
# chn_norm = get_query('Chennai', norm_features)
# hyd_norm = get_query('Greater Hyderabad', norm_features)
# bang_norm = get_query('Bengaluru', norm_features)


# Learn models

In [274]:
# Build the scaled copies of the data in this cell so it runs on a fresh
# kernel instead of relying on variables left over from an earlier session.
norm_features = feature_scaling(cities1, simple_features, 1)  # scaling_type 1
std_features = feature_scaling(cities1, simple_features, 2)   # scaling_type 2

# Raw (unscaled) features.
simple_model = gl.nearest_neighbors.create(cities1, features = simple_features, label = 'name_of_city', distance = 'euclidean', verbose=False)
simple_model_cosine = gl.nearest_neighbors.create(cities1, features = simple_features, label = 'name_of_city', distance = 'cosine',verbose=False)

# Features rescaled with scaling_type 1.
norm_model = gl.nearest_neighbors.create(norm_features, features = simple_features, label = 'name_of_city', distance = 'euclidean', verbose=False)
norm_model_cosine = gl.nearest_neighbors.create(norm_features, features = simple_features, label = 'name_of_city', distance = 'cosine', verbose=False)

# Features rescaled with scaling_type 2.
std_model = gl.nearest_neighbors.create(std_features, features = simple_features, label = 'name_of_city', distance = 'euclidean', verbose=False)
std_model_cosine = gl.nearest_neighbors.create(std_features, features = simple_features, label = 'name_of_city', distance = 'cosine', verbose=False)

## Chennai query

In [275]:
# Build each Chennai query row once; the original rebuilt each scaled
# frame twice inside the print arguments.
chennai_raw = get_query('Chennai', cities1)
chennai_norm = get_query('Chennai', feature_scaling(cities1, simple_features, 1))
chennai_std = get_query('Chennai', feature_scaling(cities1, simple_features, 2))

# (title, model, query row) in the order the results are printed.
chennai_runs = [
    ('Raw features Euclidean', simple_model, chennai_raw),
    ('Raw features Cosine', simple_model_cosine, chennai_raw),
    ('Normalized Euclidean features', norm_model, chennai_norm),
    ('Normalized Cosine features', norm_model_cosine, chennai_norm),
    ('Standardized Euclidean features', std_model, chennai_std),
    ('Standardized Cosine features', std_model_cosine, chennai_std),
]

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
for run_title, knn_model, query_row in chennai_runs:
    # '%s %s' gives the same text as the two-argument Python 2 print
    # statement: title, one space, then the result table.
    print('%s %s' % (run_title, knn_model.query(query_row, verbose=False)))


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Raw features Euclidean +-------------+-----------------+---------------+------+
| query_label | reference_label |    distance   | rank |
+-------------+-----------------+---------------+------+
|      0      |     Chennai     |      0.0      |  1   |
|      0      |     Kolkata     | 61219.0619021 |  2   |
|      0      |    Ahmadabad    |  109837.03606 |  3   |
|      0      |       Pune      |  223187.00377 |  4   |
|      0      |     Lucknow     | 282705.008971 |  5   |
+-------------+-----------------+---------------+------+
[5 rows x 4 columns]

Raw features Cosine +-------------+-------------------+-------------------+------+
| query_label |  reference_label  |      distance     | rank |
+-------------+-------------------+-------------------+------+
|      0      |      Chennai      |        0.0        |  1   |
|      0      |      Kolkata      | 2.59384069778e-10 |  2   |
|      0      |     Ahmadabad     | 1.07577446951e