In [65]:
__author__ = "Philipp Kats"
__date__ = "2016_04_23"

%pylab inline
import pandas as pd
import numpy as np
import geopandas as gp

import pylab as plt
import os

from sklearn.svm import SVR
from sklearn.preprocessing import Imputer

from ipynotifyer import notifyOnComplete as nf

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


## Read DATA

In [66]:
# Relative path to the puma50 CSV.
path = '../../../../Data/refined_data/puma50.csv'

# The 'st' column becomes the row index; a bare '.' cell is parsed as NaN.
puma = pd.read_csv(
    path,
    index_col='st',
    na_values='.',
    low_memory=False,
)
puma.shape

(5777, 1966)

In [67]:
# Relative path to the CSV holding the prediction target.
yPath = '../../../../Data/refined_data/predict.csv'

# Same parsing options as the feature table; only the target column and the
# split label are kept.
ys = pd.read_csv(
    yPath,
    index_col='st',
    na_values='.',
    low_memory=False,
)[['internetP', 'label']]
ys.shape

(5777, 2)

In [68]:
# Count how many columns of `puma` there are per dtype.
puma.dtypes.value_counts()

float64    1516
int64       449
object        1
dtype: int64

## Impute data

In [69]:
def NormDF(df, axis=0, dl='label'):
    '''Standardize a dataframe: subtract the mean and divide by the std.

    Parameters
    ----------
    df : DataFrame that contains column `dl`; the remaining columns are
        expected to be numeric.
    axis : 0 / 'index' (default) uses each column's own mean and std;
        1 / 'columns' uses each row's own mean and std.
    dl : name of the column that is set aside before normalizing and
        re-attached, unchanged, as the last column of the result.

    Returns
    -------
    A new DataFrame; `df` itself is not modified. Where the std is zero the
    result is NaN (0/0).
    '''
    labels = df[dl]
    values = df.drop(dl, axis=1)

    center = values.mean(axis)
    spread = values.std(axis)

    # Statistics taken over axis 0 are indexed by column name, those taken
    # over axis 1 by row label. Plain `values - center` always aligns on
    # column names, which is only right for axis 0, so pick the alignment
    # axis explicitly.
    align_on = 'index' if axis in (1, 'columns') else 'columns'
    dfn = values.sub(center, axis=align_on).div(spread, axis=align_on)

    dfn[dl] = labels
    return dfn

In [70]:
def imputeDF(df):
    '''Replace missing values in every non-'label' column by the column mean.

    Parameters
    ----------
    df : DataFrame with a 'label' column; all other columns must be
        convertible to float.

    Returns
    -------
    A new DataFrame with the same index as `df`, float feature columns in
    their original relative order, and 'label' re-attached as the last
    column. Feature columns that contain no observed value at all are
    dropped, because there is no mean to fill them with. The shape of the
    result is printed as a side effect.
    '''
    labels = df['label']
    features = df.drop('label', axis=1).astype(float)

    # Drop fully-empty columns *by name* so the surviving columns keep their
    # own names no matter where 'label' or the empty columns sit in `df`.
    features = features.dropna(axis=1, how='all')

    # Series returned by mean() is keyed by column name, so each column is
    # filled with its own mean.
    puma_imp = features.fillna(features.mean())

    puma_imp['label'] = labels
    print(puma_imp.shape)

    return puma_imp

    

In [71]:
# pumaN = NormDF(puma)
# print pumaN.shape

In [72]:
# Impute the raw `puma` table with imputeDF; the call also prints the shape
# of the result.
puma_imp = imputeDF(puma)

(5777, 1966)


In [43]:
# Preview the first rows of the imputed frame.
# NOTE(review): this cell's execution count (43) is lower than that of the
# cells around it, so the saved output may come from an earlier kernel state
# -- re-run after "Restart & Run All" to confirm.
puma_imp.head()

Unnamed: 0_level_0,our2,our3,our4,our5,our6,our7,our8,our9,our10,our11,...,our41986,our42002,our42018,our42034,our42059,our42075,our42152,our42153,our42154,label
st,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
il_129,1.212748,1.233696,1.170192,1.243225,1.355621,1.222473,0.995936,1.035781,0.967715,1.164708,...,1.541282,-0.4183885,1.54294,2.325752,0.286276,2.860892,1.083466,1.574321,0.3022625,train
wi_57,0.923742,0.89902,1.164898,1.20022,1.051604,1.041713,1.346181,1.812326,1.137407,0.943402,...,0.3240344,0.06912739,0.3879691,0.08743878,0.4540903,-0.2383729,0.7264407,0.1067058,-0.05888708,train
tx_412,-0.374275,-0.382255,-0.354002,-0.280761,-0.401216,-0.327129,-0.521107,-0.3212,-0.467192,-0.293923,...,1.729015e-17,5.269378e-17,-4.940042e-18,3.046359e-17,6.586723e-18,-6.586723e-18,-9.880084e-18,-2.9640250000000004e-17,3.293361e-18,train
wa_14,-0.437582,-0.443403,-0.421037,-0.484573,-0.414124,-0.51568,-0.453853,-0.38668,-0.472838,-0.547727,...,1.729015e-17,5.269378e-17,-4.940042e-18,3.046359e-17,6.586723e-18,-6.586723e-18,-9.880084e-18,-2.9640250000000004e-17,3.293361e-18,train
tx_220,-0.307318,-0.306544,-0.374323,-0.272693,-0.242633,-0.149538,-0.239607,-0.096463,-0.460219,-0.371163,...,1.729015e-17,5.269378e-17,-4.940042e-18,3.046359e-17,6.586723e-18,-6.586723e-18,-9.880084e-18,-2.9640250000000004e-17,3.293361e-18,train


## Preprocessing

In [75]:
# Divide the 'internetP' column by 100.
# NOTE(review): presumably turns a percentage into a 0-1 fraction -- confirm
# the units. `ip` is not referenced again in the visible cells; the y split
# below reads ys['internetP'] directly.
ip = ys['internetP']/100

In [76]:
# Display the rescaled series.
# NOTE(review): this prints the whole Series; `ip.head()` or `ip.describe()`
# would keep the output short.
ip

st
il_129    0.675431
wi_57     0.621680
tx_412    0.695315
wa_14     0.617784
tx_220    0.804482
az_55     0.946457
nc_193    0.717003
tx_11     0.589776
pr_75     0.444703
mo_84     0.682694
ma_27     0.677482
in_83     0.723709
md_61     0.683660
il_69     0.899556
tx_182    0.782962
ia_41     0.695950
ok_41     0.647244
ca_46     0.830173
oh_184    0.779876
nj_93     0.823367
va_142    0.912370
wa_140    0.917566
me_30     0.795613
pa_44     0.837817
sc_12     0.548076
tn_36     0.611007
nc_65     0.768713
ca_86     0.746463
ca_517    0.870770
az_135    0.904938
            ...   
md_70     0.617973
ca_392    0.556120
tx_348    0.887758
la_101    0.707746
wv_14     0.665684
md_44     0.776905
ky_68     0.618865
ca_215    0.861307
tn_34     0.779812
ks_51     0.835672
ca_92     0.832227
fl_179    0.709896
mi_197    0.815815
ca_68     0.789294
mo_105    0.561197
in_51     0.681631
mo_20     0.736594
ca_14     0.766000
nj_120    0.630899
mn_20     0.777514
pa_112    0.788926
tn_88    

## Data split

In [77]:
# Split the imputed features by the 'label' column and drop that column from
# each piece. groupby yields groups in sorted key order (made explicit with
# sort=True), so the unpacking relies on exactly three label values that sort
# as test < train < val.
# NOTE(review): confirm the label values; any other number of groups raises
# ValueError here.
Xtest, Xtrain, Xval = (
    group.drop('label', axis=1)
    for _name, group in puma_imp.groupby('label', sort=True)
)

In [78]:
# Split the target by the 'label' column of `ys`. The previous grouping key,
# `L`, is only ever a local variable inside the helper functions, so it is
# undefined here on a fresh kernel.
# groupby yields groups in sorted key order (made explicit with sort=True), so
# the unpacking relies on exactly three label values that sort as
# test < train < val.
# NOTE(review): confirm ys['label'] matches the label column used for the
# feature split.
ytest, ytrain, yval = (
    group['internetP']
    for _name, group in ys.groupby('label', sort=True)
)

In [79]:
# Bundle the six splits under string keys; evaluate() looks them up by name.
datas = dict(
    Xtest=Xtest,
    Xtrain=Xtrain,
    Xval=Xval,
    ytest=ytest,
    ytrain=ytrain,
    yval=yval
)

## Set Model

In [80]:
# svr = SVR(kernel='rbf', 
#       degree=3, 
#       gamma='auto', 
#       coef0=0.0, 
#       tol=0.001, 
#       C=1.0, 
#       epsilon=0.1, 
#       shrinking=True, 
#       cache_size=200, 
#       verbose=False, 
#       max_iter=-1)

In [81]:
@nf
def evaluate(model, datas=datas):
    '''Fit `model` on the training split and print train / validation scores.

    Parameters
    ----------
    model : object with a fit(X, y) method whose return value has a
        score(X, y) method.
    datas : dict providing the keys 'Xtrain', 'ytrain', 'Xval' and 'yval'.
        The default is whatever the module-level `datas` was when this cell
        was executed, so re-run this cell after rebuilding `datas`.

    Returns
    -------
    None. Any exception raised while fitting or scoring is caught and its
    message is printed instead of being propagated.
    '''
    try:
        fitted = model.fit(datas['Xtrain'], datas['ytrain'])
        # Written as single-argument print calls and `except ... as` so the
        # cell parses on both Python 2.6+ and Python 3 with the same output
        # layout ("train: <score>").
        print('train: %s' % fitted.score(datas['Xtrain'], datas['ytrain']))
        print('valid: %s' % fitted.score(datas['Xval'], datas['yval']))
    except Exception as e:
        print(str(e))


In [83]:
# RBF-kernel SVR with C=2 and gamma=0.1.
evaluate(SVR(kernel='rbf', C=2, gamma=0.1))

train: 0.322843060736
valid: 0.0656133863563


In [None]:
# Linear-kernel SVR with C=1.
# NOTE(review): no execution count or output is recorded for this cell.
evaluate(SVR(kernel='linear', C=1))

In [61]:
# Degree-2 polynomial-kernel SVR with C=1.
# NOTE(review): execution count 61 is lower than that of the cell defining
# evaluate (81), so the saved scores may predate the current data split --
# re-run to confirm.
evaluate(SVR(kernel='poly', C=1, degree=2))

train: -1.03219627863
valid: -0.870161633213
