### Importing the required modules

In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier



### Loading the dataset

In [7]:
# Read the comma-separated data file; '?' entries are parsed as missing values (NaN).
crime = pd.read_csv('CommViolPredUnnormalizedData.txt', na_values='?')

In [9]:
# Positional indices of the 89 columns retained from the raw table.
columns_to_keep = [5, 6, *range(11, 26), *range(32, 103), 145]

# NOTE: `crime` is rebound to a subset of itself, so this cell is not
# idempotent -- reload the raw file before running it a second time.
crime = crime.iloc[:, columns_to_keep]
crime = crime.dropna()  # drop every row that still has a missing value

# The first 88 retained columns are the features; the target is looked up by name.
X_crime = crime.iloc[:, range(88)]
y_crime = crime.loc[:, 'ViolentCrimesPerPop']

In [10]:
# Preview the first five feature rows.
X_crime.head(n=5)

Unnamed: 0,population,householdsize,agePct12t21,agePct12t29,agePct16t24,agePct65up,numbUrban,pctUrban,medIncome,pctWWage,...,MedRentPctHousInc,MedOwnCostPctInc,MedOwnCostPctIncNoMtg,NumInShelters,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85
0,11980,3.1,12.47,21.44,10.93,11.33,11980,100.0,75122,89.24,...,23.8,21.1,14.0,11,0,10.66,53.72,65.29,78.09,89.14
1,23123,2.82,11.01,21.3,10.48,17.18,23123,100.0,47917,78.99,...,27.6,20.7,12.5,0,0,8.3,77.17,71.27,90.22,96.12
2,29344,2.43,11.36,25.88,11.01,10.28,29344,100.0,35669,82.0,...,24.1,21.7,11.6,16,0,5.0,44.77,36.6,61.26,82.85
3,16656,2.4,12.55,25.2,12.19,17.57,0,0.0,20580,68.15,...,28.7,20.6,14.5,0,0,2.04,88.71,56.7,90.17,96.24
5,140494,2.45,18.09,32.89,20.04,13.26,140494,100.0,21577,75.78,...,26.4,17.3,11.7,327,4,1.49,64.35,42.29,70.61,85.66


In [11]:
# Preview the last five target values.
y_crime.tail(n=5)

2210    545.75
2211    124.10
2212    353.83
2213    691.17
2214    918.89
Name: ViolentCrimesPerPop, dtype: float64

In [12]:
from sklearn.neighbors import KNeighborsRegressor

### Splitting the data into train and test

In [13]:
# Hold out a test set (no test_size given, so the library default applies);
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_crime,
    y_crime,
    random_state=0,
)

# Model 1: KNeighbors Regressor

In [25]:
# Fit a 5-nearest-neighbour regressor on the current training split.
knnreg = (
    KNeighborsRegressor(n_neighbors=5)
    .fit(X_train, y_train)
)

In [26]:
# Print the prediction for every test row, followed by the test-set R-squared.
knn_test_predictions = knnreg.predict(X_test)
knn_test_r2 = knnreg.score(X_test, y_test)

print(knn_test_predictions)
print('R-squared test score: {:.3f}'.format(knn_test_r2))

[ 235.974 1683.778  288.038 1058.878  992.714   92.032  298.38   182.462
  358.572  706.93   144.882 1817.366  161.736  481.892  264.468  597.794
  507.398  250.334  515.374 1807.16   706.268 1241.876  823.928 1004.81
  498.306  688.646  101.288  390.162  828.734 1216.092   65.538 1234.036
  182.802  224.612  327.442 1584.234  487.768  302.58   797.184  936.26
 1196.874  183.972  275.302 1426.184  156.352  309.028  210.548  807.692
  797.886  570.124  188.904  421.252  459.214  258.226  246.226  493.976
  114.58   325.854  696.428  512.578  353.446 1533.962 1225.87   840.686
 1387.638  831.63   908.312 1044.314  610.698  729.078  153.218  600.322
  209.222  609.808  679.134  565.336  784.304  329.286  285.418 1120.536
  737.71  1091.064  235.214 1894.464  419.84  1568.496  278.668  203.396
  171.064  670.28  1706.46  1085.366  330.388  145.928 1347.528 1219.364
  388.344  301.448  188.422  274.378  153.28   985.916  140.646  915.542
  567.846  718.996  455.78   363.344  270.47   480.57

# Model 2: Lasso Regression

In [28]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
# Min-max scaler instance; it is fitted on the training split in the next cell.
scaler = MinMaxScaler()

In [29]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test data never influences the scaling.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [30]:
# Lasso regression on the min-max-scaled training features.
linlasso = (
    Lasso(alpha=2.0, max_iter=10000)
    .fit(X_train_scaled, y_train)
)

In [32]:
# Report the fitted Lasso model: intercept, coefficient vector, how many
# coefficients are non-zero, and R-squared on both scaled splits.
lasso_coef = linlasso.coef_
lasso_r2_train = linlasso.score(X_train_scaled, y_train)
lasso_r2_test = linlasso.score(X_test_scaled, y_test)

print('lasso regression linear model intercept: {}'.format(linlasso.intercept_))
print('lasso regression linear model coeff:\n{}'.format(lasso_coef))
print('Non-zero features: {}'.format(np.count_nonzero(lasso_coef)))
print('R-squared score (training): {:.3f}'.format(lasso_r2_train))
print('R-squared score (test): {:.3f}\n'.format(lasso_r2_test))

lasso regression linear model intercept: 1186.6120619985795
lasso regression linear model coeff:
[    0.             0.            -0.          -168.18346054
    -0.            -0.             0.           119.6938194
     0.            -0.             0.          -169.67564456
    -0.             0.            -0.             0.
     0.             0.            -0.            -0.
     0.            -0.             0.             0.
   -57.52991966    -0.            -0.             0.
   259.32889226    -0.             0.             0.
     0.            -0.         -1188.7396867     -0.
    -0.            -0.          -231.42347299     0.
  1488.36512229     0.            -0.            -0.
    -0.             0.             0.             0.
     0.             0.            -0.             0.
    20.14419415     0.             0.             0.
     0.             0.           339.04468804     0.
     0.           459.53799903    -0.             0.
   122.69221826    -0.          

### Model 3: Polynomial features with Linear Regression

In [45]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [41]:
# Degree-2 polynomial expansion of the crime feature matrix.
# The name X_F1_poly is kept as-is because later cells reference it.
poly = PolynomialFeatures(degree=2)
X_F1_poly = poly.fit(X_crime).transform(X_crime)

In [42]:
# Split the polynomial design matrix. This rebinds X_train / X_test / y_train /
# y_test, replacing the raw-feature split created earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X_F1_poly,
    y_crime,
    random_state=0,
)

In [48]:
# Ordinary least squares on the current (polynomial-feature) training split.
linreg = (
    LinearRegression()
    .fit(X_train, y_train)
)

In [49]:
# Report the fitted linear model: coefficients, intercept, and R-squared on
# the training and test splits.
poly_r2_train = linreg.score(X_train, y_train)
poly_r2_test = linreg.score(X_test, y_test)

print('(poly deg 2) linear model coeff (w):\n{}'.format(linreg.coef_))
print('(poly deg 2) linear model intercept (b): {:.3f}'.format(linreg.intercept_))
print('(poly deg 2) R-squared score (training): {:.3f}'.format(poly_r2_train))
print('(poly deg 2) R-squared score (test): {:.3f}\n'.format(poly_r2_test))

(poly deg 2) linear model coeff (w):
[-4.71057406e-07 -2.93382949e-01 -1.39505006e-04 ... -9.00839007e-02
 -3.60042702e-01  3.58959895e-01]
(poly deg 2) linear model intercept (b): 15989.231
(poly deg 2) R-squared score (training): 1.000
(poly deg 2) R-squared score (test): -29667.160



### Model 4: Ridge Regression with Polynomial features

Adding many polynomial features often leads to overfitting,
so polynomial features are usually combined with a regression
model that has a regularization penalty, such as ridge
regression.

In [53]:
from sklearn.linear_model import Ridge

In [54]:
# Same inputs and seed as the polynomial split made earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X_F1_poly,
    y_crime,
    random_state=0,
)

# `linreg` is rebound here: from this point on it refers to the Ridge model
# (constructed with default settings), not the LinearRegression fit above.
# NOTE(review): X_F1_poly is derived from X_crime with no scaling step, and the
# recorded test R-squared for this cell is strongly negative -- consider
# scaling the features before the ridge fit; TODO confirm.
linreg = Ridge()
linreg.fit(X_train, y_train)

ridge_r2_train = linreg.score(X_train, y_train)
ridge_r2_test = linreg.score(X_test, y_test)

print('(poly deg 2 + ridge) linear model coeff (w):\n{}'.format(linreg.coef_))
print('(poly deg 2 + ridge) linear model intercept (b): {:.3f}'.format(linreg.intercept_))
print('(poly deg 2 + ridge) R-squared score (training): {:.3f}'.format(ridge_r2_train))
print('(poly deg 2 + ridge) R-squared score (test): {:.3f}'.format(ridge_r2_test))



(poly deg 2 + ridge) linear model coeff (w):
[ 0.00000000e+00  4.76316141e-06 -8.52988052e-10 ...  3.07813587e-05
  2.07119036e-05  9.64493136e-06]
(poly deg 2 + ridge) linear model intercept (b): 1569.967
(poly deg 2 + ridge) R-squared score (training): 0.787
(poly deg 2 + ridge) R-squared score (test): -19914.038
