# Exploring data on 500 Indian cities to tell a story

In [1]:
# Importing useful packages
import graphlab as gl
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
gl.canvas.set_target('ipynb')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1484071802.log


This non-commercial license of GraphLab Create for academic use is assigned to a.nandan@mun.ca and will expire on November 23, 2017.


## Importing data

In [2]:
# Load the city-level data set from a CSV in the working directory.
# Column types are inferred by read_csv (see the log printed below).
cities = gl.SFrame.read_csv('cities_r2.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,int,str,int,int,int,int,int,int,int,int,int,int,int,int,float,float,float,str,int,int,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


## Names of available features

In [3]:
# Column names of the data set as loaded, before any derived features are added.
original_features = cities.column_names()
# Parenthesized form prints the same under Python 2 and also parses under Python 3.
print(original_features)

['name_of_city', 'state_code', 'state_name', 'dist_code', 'population_total', 'population_male', 'population_female', '0-6_population_total', '0-6_population_male', '0-6_population_female', 'literates_total', 'literates_male', 'literates_female', 'sex_ratio', 'child_sex_ratio', 'effective_literacy_rate_total', 'effective_literacy_rate_male', 'effective_literacy_rate_female', 'location', 'total_graduates', 'male_graduates', 'female_graduates']


## Building a k nearest neighbour model:

### Standardize feature matrix

In [4]:
def standardize_matrix(data_sframe, features):
    """Return a copy of ``data_sframe`` with the given columns standardized.

    Each column named in ``features`` is rescaled as ``(x - mean) / std``,
    using the column's own ``mean()`` and ``std()``.

    Input params:
        data_sframe: the SFrame data set. It is left unmodified.
        features: names of the columns that need to be standardized.

    Output param:
        A new SFrame in which the listed columns are standardized and all
        other columns are carried over unchanged.
    """
    # Work on a copy so the caller's original values are not overwritten.
    standardized = data_sframe.copy()
    for feature in features:
        column = data_sframe[feature]
        centered = column - column.mean()
        spread = column.std()
        # A constant column has zero spread and dividing by it is undefined,
        # so such a column is only centered (it becomes all zeros).
        standardized[feature] = centered / spread if spread else centered
    return standardized

## Exploring nearest neighbours

#### Creating graduate sex ratio feature

In [5]:
# Derived feature: number of female graduates divided by number of male graduates, per row.
# NOTE(review): a row with zero male graduates would divide by zero — confirm none exist.
cities['graduates_sex_rate'] = cities['female_graduates']/cities['male_graduates']

In [6]:
# Columns fed to the first nearest-neighbour model: the two population
# columns, the sex ratios, the literacy rates and the derived graduate ratio.
new_features = [
    'population_total',
    '0-6_population_total',
    'sex_ratio',
    'child_sex_ratio',
    'effective_literacy_rate_total',
    'effective_literacy_rate_male',
    'effective_literacy_rate_female',
    'graduates_sex_rate',
]

#### Standardizing features

In [7]:
# Standardize the model features before any distances are computed.
# Note: this rebinds `cities`, so the unstandardized values are no longer
# reachable under this name from here on.
cities = standardize_matrix(cities, new_features)

#### Building k-nearest neighbors model

In [8]:
# Nearest-neighbour model over the columns in `new_features`, with each row
# labelled by its city name. No distance is passed, so the library default is used.
knn_cites = gl.nearest_neighbors.create(cities, features = new_features, label = 'name_of_city')

#### Creating queries

In [9]:
# Query frames: the rows of `cities` whose name matches each city of interest.
city_names = cities['name_of_city']
chennai = cities[city_names == 'Chennai']
trichy = cities[city_names == 'Tiruchirappalli']
siwan = cities[city_names == 'Siwan']  # a small city in Bihar
thrissur = cities[city_names == 'Thrissur']

#### Testing knn model

In [10]:
# Nearest cities to Chennai under the model that includes the population features.
knn_cites.query(chennai)

query_label,reference_label,distance,rank
0,Chennai,0.0,1
0,Kolkata,2.29587031793,2
0,Pune,2.43226091697,3
0,Nagpur,3.12376860327,4
0,Lucknow,3.28017195925,5


- Makes sense: the nearest neighbours of Chennai are all large cities, and these cities are known to have comparable social indicators.

In [11]:
# Nearest cities to Tiruchirappalli under the model that includes the population features.
knn_cites.query(trichy)

query_label,reference_label,distance,rank
0,Tiruchirappalli,0.0,1
0,Tirunelveli,0.599850089239,2
0,Madurai,0.704740479038,3
0,Ozhukarai,0.811360535528,4
0,Dindigul,0.898033258317,5


- Again makes sense: Nearest neighbours of Trichy are of comparable size; these cities are known to have comparable social indicators. Ozhukarai is in Pondicherry

In [12]:
# Nearest cities to Siwan under the model that includes the population features.
knn_cites.query(siwan)

query_label,reference_label,distance,rank
0,Siwan,0.0,1
0,Chas,0.509213328151,2
0,Botad,0.568018697308,3
0,Udgir,0.666522154752,4
0,Bokaro Steel City,0.678591721139,5


- A pattern is emerging: neighbours appear to be ranked first by city size and only then by social indicators.

In [13]:
# Nearest cities to Thrissur under the model that includes the population features.
knn_cites.query(thrissur)

query_label,reference_label,distance,rank
0,Thrissur,0.0,1
0,Kozhikode,0.441342059892,2
0,Alappuzha,0.447498376769,3
0,Kochi,1.05879656977,4
0,Kollam,1.42533404987,5


#### Population is playing a big role in ranking neighbours

## Removing population

In [14]:
# Same feature set as before minus the two population columns, so that
# city size no longer contributes to the distance.
quality_features = [
    'sex_ratio',
    'child_sex_ratio',
    'effective_literacy_rate_total',
    'effective_literacy_rate_male',
    'effective_literacy_rate_female',
    'graduates_sex_rate',
]

In [15]:
# Second nearest-neighbour model, built only on the columns in `quality_features`
# (no population columns); rows are again labelled by city name.
knn_cites_quality = gl.nearest_neighbors.create(cities, features = quality_features, label = 'name_of_city')

In [16]:
# Nearest cities to Chennai when the population columns are excluded.
knn_cites_quality.query(chennai)

query_label,reference_label,distance,rank
0,Chennai,0.0,1
0,Madavaram,0.24814912829,2
0,Jalpaiguri,0.465160750199,3
0,Karaikkudi,0.472054940735,4
0,Kalyani,0.511951760426,5


In [17]:
# Nearest cities to Tiruchirappalli when the population columns are excluded.
knn_cites_quality.query(trichy)

query_label,reference_label,distance,rank
0,Tiruchirappalli,0.0,1
0,Dindigul,0.36873827923,2
0,Tirunelveli,0.382053995756,3
0,Ozhukarai,0.438476437762,4
0,Kumbakonam,0.492676816662,5


In [18]:
# Nearest cities to Siwan when the population columns are excluded.
knn_cites_quality.query(siwan)

query_label,reference_label,distance,rank
0,Siwan,0.0,1
0,Chas,0.50916427896,2
0,Bokaro Steel City,0.546567877818,3
0,Botad,0.56792793759,4
0,Udgir,0.664453345259,5


In [19]:
# Nearest cities to Thrissur when the population columns are excluded.
knn_cites_quality.query(thrissur)

query_label,reference_label,distance,rank
0,Thrissur,0.0,1
0,Kozhikode,0.397294860013,2
0,Alappuzha,0.415599948025,3
0,Kochi,0.988438721707,4
0,Thiruvananthapuram,1.38178585093,5


## The model has no information about location or size, yet geographically nearby cities are forming clusters. This suggests that social indicators (sex ratios and literacy rates) alone carry enough information to predict geographic location.
### But the result for Chennai is strange. 