In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model

### A. Linear Regression

#### 1. Fit the Model

In [2]:
# Load the FBI's 2013 New York "offenses known to law enforcement" table.
# The first four rows of the CSV are skipped so parsing starts at the header row.
ny13_csv_url = (
    'https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/'
    'New_York_offenses/'
    'NEW_YORK-Offenses_Known_to_Law_Enforcement_by_City_2013%20-%2013tbl8ny.csv'
)
df_ny13 = pd.read_csv(ny13_csv_url, skiprows=4)

In [3]:
# Rename the raw headers (several contain embedded newlines and footnote
# digits) to snake_case names. rename() returns a new frame, so this cell can
# be re-run safely and never mutates the loaded data in place.
df_ny13 = df_ny13.rename(columns={
    'City': 'city',
    'Population': 'population',
    'Violent\ncrime': 'violent_crime',
    'Murder and\nnonnegligent\nmanslaughter': 'murder',
    'Rape\n(legacy\ndefinition)2': 'rape',
    'Robbery': 'robbery',
    'Aggravated\nassault': 'aggravated_assault',
    'Property\ncrime': 'property_crime',
    'Burglary': 'burglary',
    'Larceny-\ntheft': 'larceny_theft',
    'Motor\nvehicle\ntheft': 'motor_vehicle_theft',
    'Arson3': 'arson',
})

# Keep only the modelling columns and drop rows with any missing value.
# The explicit copy() makes the subset an independent frame, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
df_ny13_ppmr = (
    df_ny13[['population', 'murder', 'robbery', 'property_crime']]
    .dropna()
    .copy()
)

In [4]:
# Strip thousands separators from the text (object-dtype) columns and convert
# them to numeric. The work is done column-wise with the vectorized string
# accessor instead of element-wise DataFrame.applymap, which is deprecated in
# pandas 2.1+.
cols_ppmr = df_ny13_ppmr.columns[df_ny13_ppmr.dtypes.eq('object')]


def convert_col(col_obj):
    """Remove commas from ``col_obj`` and convert the result to numeric.

    Parameters
    ----------
    col_obj : pandas.Series of str, or str
        A whole column of comma-formatted numbers, or a single such string.
        The scalar form is kept so element-wise callers continue to work.

    Returns
    -------
    pandas.Series or numeric scalar
        The numeric equivalent of the input.
    """
    if isinstance(col_obj, pd.Series):
        # regex=False: the comma is a literal character, not a pattern.
        return pd.to_numeric(col_obj.str.replace(',', '', regex=False))
    return pd.to_numeric(col_obj.replace(',', ''))


# DataFrame.apply passes each selected column to convert_col as a Series.
df_ny13_ppmr[cols_ppmr] = df_ny13_ppmr[cols_ppmr].apply(convert_col)

In [5]:
# Engineer the model features: a quadratic population term, plus 0/1
# indicators flagging whether any murder / any robbery was recorded.
df_ny13_ppmr['population_squared'] = df_ny13_ppmr['population'].mul(df_ny13_ppmr['population'])
for crime_col in ('murder', 'robbery'):
    df_ny13_ppmr[crime_col + '_category'] = np.where(df_ny13_ppmr[crime_col] > 0, 1, 0)

In [6]:
# Fit ordinary least squares on the full sample: property crime regressed on
# population, population squared, and the murder / robbery indicators.
X_ny13_ppmr = df_ny13_ppmr[[
    'population',
    'population_squared',
    'murder_category',
    'robbery_category',
]]
y_ny13_ppmr = df_ny13_ppmr['property_crime']
regr_ny13_ppmr = linear_model.LinearRegression()
regr_ny13_ppmr.fit(X_ny13_ppmr, y_ny13_ppmr)

# Report the fitted parameters and the in-sample score.
print('\nCoefficients: \n', regr_ny13_ppmr.coef_)
print('\nIntercept: \n', regr_ny13_ppmr.intercept_)
print('\nR-squared:', regr_ny13_ppmr.score(X_ny13_ppmr, y_ny13_ppmr), sep='\n')


Coefficients: 
 [  3.46570268e-02  -2.11108019e-09   1.51866535e+01  -9.62774363e+01]

Intercept: 
 -109.575335623

R-squared:
0.996124710499


In [7]:
# Refit after excluding rows whose population is 100,000 or more, which this
# notebook treats as outliers.
below_100k = df_ny13_ppmr['population'].lt(100000)
df_ny13_ppmr_or = df_ny13_ppmr[below_100k]

X_ny13_ppmr_or = df_ny13_ppmr_or[[
    'population',
    'population_squared',
    'murder_category',
    'robbery_category',
]]
y_ny13_ppmr_or = df_ny13_ppmr_or['property_crime']
regr_ny13_ppmr_or = linear_model.LinearRegression()
regr_ny13_ppmr_or.fit(X_ny13_ppmr_or, y_ny13_ppmr_or)

# Report the fitted parameters and the in-sample score for the reduced sample.
print('\nCoefficients: \n', regr_ny13_ppmr_or.coef_)
print('\nIntercept: \n', regr_ny13_ppmr_or.intercept_)
print('\nR-squared:', regr_ny13_ppmr_or.score(X_ny13_ppmr_or, y_ny13_ppmr_or), sep='\n')


Coefficients: 
 [  1.31899277e-02   1.36183482e-07   1.94770525e+02   8.05675060e+01]

Intercept: 
 -18.823557926

R-squared:
0.745964704737


#### 2. Train and Test the Model

##### a. Train/Test Split

In [8]:
# Hold out 30% of the outlier-removed sample for testing; the fixed
# random_state makes the split repeatable. (The dependent variable is not
# log-transformed.)
from sklearn.model_selection import train_test_split

(X_train_tts_ny13, X_test_tts_ny13,
 y_train_tts_ny13, y_test_tts_ny13) = train_test_split(
    X_ny13_ppmr_or,
    df_ny13_ppmr_or['property_crime'],
    test_size=0.3,
    random_state=42,
)

# Fit OLS on the training rows only; the return value of fit() is kept under
# a second name that later cells use for scoring.
regr_tts_ny13 = linear_model.LinearRegression()
lm_tts_ny13 = regr_tts_ny13.fit(X_train_tts_ny13, y_train_tts_ny13)

# Predict property crime for the held-out rows.
predictions_ttslr_ny13 = regr_tts_ny13.predict(X_test_tts_ny13)

# Show the first five predictions alongside the first five observed values.
print('Predicted:', predictions_ttslr_ny13[:5])
print('\n')
print('Actual:')
print(y_test_tts_ny13[:5])

Predicted: [ 536.96637503  -30.06354991  442.02069786  -25.77740268  -11.38346045]


Actual:
240    630
118      5
115    343
44       2
128      1
Name: property_crime, dtype: int64


In [9]:
# Score the train/test-split linear model on the held-out rows.
test_score_tts_ny13 = lm_tts_ny13.score(X_test_tts_ny13, y_test_tts_ny13)
print('Score:', test_score_tts_ny13)

Score: 0.770198446027


##### b. K-Folds Cross Validation

In [10]:
# Three-fold cross validation of the linear model on the outlier-removed data.
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the same helpers
# live in `sklearn.model_selection`. cv=3 is passed explicitly because the
# library default changed from 3 to 5 folds in scikit-learn 0.22, and relying
# on the default would silently change the results below.
from sklearn.model_selection import cross_val_score, cross_val_predict
scores_cv_ny13 = cross_val_score(lm_tts_ny13, X_ny13_ppmr_or, y_ny13_ppmr_or, cv=3)

# The score for each fold.
print('Cross validated scores:', scores_cv_ny13)

# Mean score across the folds.
print('Average score:', scores_cv_ny13.mean())

Cross validated scores: [ 0.76049813  0.68939028  0.59979129]
Average score: 0.683226566421




### B. KNN Regression 

#### 1. Fit the Model

In [11]:
# Fit two 5-nearest-neighbour regressors on the same training rows used for
# the linear model: one with the default weighting and one that weights
# neighbours by distance.
from sklearn import neighbors

knn = neighbors.KNeighborsRegressor(n_neighbors=5)
knn_tts_ny13 = knn.fit(X_train_tts_ny13, y_train_tts_ny13)

knn_w = neighbors.KNeighborsRegressor(n_neighbors=5, weights='distance')
knn_wtts_ny13 = knn_w.fit(X_train_tts_ny13, y_train_tts_ny13)

#### 2. Train and Test the Model

##### a. Train/Test Split

In [12]:
# Score both KNN models on the held-out rows. NOTE: for a regressor, score()
# reports R-squared rather than classification accuracy, despite the labels.
for knn_label, knn_model in (
    ('Unweighted Accuracy:', knn_tts_ny13),
    ('Weighted Accuracy:', knn_wtts_ny13),
):
    print(knn_label, knn_model.score(X_test_tts_ny13, y_test_tts_ny13))

Unweighted Accuracy: 0.70012995742
Weighted Accuracy: 0.657215890708


##### b. Cross Validation

In [15]:
# Three-fold cross validation of both KNN models on the outlier-removed data,
# reported as the mean fold score with a +/- two-standard-deviation spread.
from sklearn.model_selection import cross_val_score

target_ny13_or = df_ny13_ppmr_or['property_crime']
score = cross_val_score(knn_tts_ny13, X_ny13_ppmr_or, target_ny13_or, cv=3)
score_w = cross_val_score(knn_wtts_ny13, X_ny13_ppmr_or, target_ny13_or, cv=3)

print("Unweighted Accuracy: %0.3f (+/- %0.3f)" % (score.mean(), score.std() * 2))
print("Weighted Accuracy: %0.3f (+/- %0.3f)" % (score_w.mean(), score_w.std() * 2))

Unweighted Accuracy: 0.677 (+/- 0.023)
Weighted Accuracy: 0.675 (+/- 0.063)


Score|  OLS  |KNN UW | KNN W
-----|-------|-------|-------
TTS  |  0.770|  0.700| 0.657
CV   |  0.683|  0.677| 0.675

The train/test split produced a marked difference among the scores, whereas the cross-validated scores were nearly identical. In both cases, linear regression outperformed KNN. Because KNN is flexible, it is more susceptible to noise than the more rigid OLS, and it underperforms OLS when the noise-to-signal ratio is high. KNN also suffers from the curse of dimensionality: prediction accuracy can decrease significantly as the number of predictors increases, because in high dimensions even the nearest training points lie far from a test point. Although KNN can handle nonlinearity, it cannot determine which predictors are important, nor does it provide an interpretable estimate of each predictor's effect. Lastly, tuning K is critical to good performance.  

Linear regression, on the other hand, has a fixed number of parameters and is computationally faster, but makes strong assumptions about the data. The algorithm may work well if the assumptions turn out to be correct, but it may perform badly if the assumptions are wrong. 