# Linear Regression

In [1]:
import pandas as pd
import numpy as np

In [2]:
crime = pd.read_csv('Training_and_Test_Set.csv') # read in the csv %%file

In [3]:
crime.head()

Unnamed: 0,CMPLNT_FR_DT,Daytime,Day_Name,Month,Day,Year,Season,GeoCell,BORO_NM,PRCP,...,TMIN,TMAX,Population,PC_INCOME,Hm_Sls_Price_Range,Holiday,Event,is_Holiday,is_Event,count_cmplnt
0,11/26/2014,Morning,Wednesday,November,26.0,2014.0,Fall,66.0,QUEENS,1.24,...,34,51,2250002,40997,Medium,,,0,0,1
1,12/1/2014,Late Night,Monday,December,1.0,2014.0,Winter,60.0,QUEENS,0.09,...,42,65,2250002,40997,Medium,,,0,0,1
2,11/10/2015,Morning,Tuesday,November,10.0,2015.0,Fall,15.0,BROOKLYN,0.26,...,51,57,2552911,43915,High,,,0,0,2
3,2/4/2014,Morning,Tuesday,February,4.0,2014.0,Winter,48.0,QUEENS,0.0,...,22,35,2250002,40997,Medium,,,0,0,3
4,8/25/2015,Late Night,Tuesday,August,25.0,2015.0,Summer,35.0,BROOKLYN,0.0,...,73,90,2552911,43915,High,,,0,0,1


In [5]:
#evaluating a linear regression model
# create x explanatory and y response variables for regression
y = crime['count_cmplnt']

#Delete the response variable from our training dataset
X = crime.drop('count_cmplnt', axis=1)

#inspect data 
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170650 entries, 0 to 170649
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   CMPLNT_FR_DT        170650 non-null  object 
 1   Daytime             170650 non-null  object 
 2   Day_Name            170650 non-null  object 
 3   Month               170650 non-null  object 
 4   Day                 170650 non-null  float64
 5   Year                170650 non-null  float64
 6   Season              170650 non-null  object 
 7   GeoCell             170650 non-null  float64
 8   BORO_NM             170650 non-null  object 
 9   PRCP                170650 non-null  float64
 10  SNOW                170650 non-null  float64
 11  TMIN                170650 non-null  int64  
 12  TMAX                170650 non-null  int64  
 13  Population          170650 non-null  object 
 14  PC_INCOME           170650 non-null  int64  
 15  Hm_Sls_Price_Range  170650 non-nul

In [6]:
#Divide data into test and training splits
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=10, test_size=0.10, random_state=0)

In [7]:
#Use mean absolute error (MAE) to score the regression models created 
#(the scale of MAE is identical to the response variable)
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error

#Function for Root mean squared error
#https://stackoverflow.com/questions/17197492/root-mean-square-error-in-python
def rmse(y_actual, y_predicted):
    return np.sqrt(mean_squared_error(y_actual, y_predicted))

#Function for Mean Absolute Percentage Error (MAPE) - Untested
#Adapted from - https://stackoverflow.com/questions/42250958/how-to-optimize-mape-code-in-python
def mape(y_actual, y_predicted): 
    mask = y_actual != 0
    return (np.fabs(y_actual - y_predicted)/y_actual)[mask].mean() * 100

#Create scorers for rmse and mape functions
mae_scorer = make_scorer(score_func=mean_absolute_error, greater_is_better=False)
rmse_scorer = make_scorer(score_func=rmse, greater_is_better=False)
mape_scorer = make_scorer(score_func=mape, greater_is_better=False)

#Make scorer array to pass into cross_validate() function for producing mutiple scores for each cv fold.
errorScoring = {'MAE':  mae_scorer, 
                'RMSE': rmse_scorer,
                'MAPE': mape_scorer
               } 

In [8]:
from sklearn.model_selection import cross_validate

def EvaluateRegressionEstimator(regEstimator, X, y, cv):
    
    scores = cross_validate(regEstimator, X, y, scoring=errorScoring, cv=cv, return_train_score=True)

    #cross val score sign-flips the outputs of MAE
    # https://github.com/scikit-learn/scikit-learn/issues/2439
    scores['test_MAE'] = scores['test_MAE'] * -1
    scores['test_MAPE'] = scores['test_MAPE'] * -1
    scores['test_RMSE'] = scores['test_RMSE'] * -1

    #print mean MAE for all folds 
    maeAvg = scores['test_MAE'].mean()
    print_str = "The average MAE for all cv folds is: \t\t\t {maeAvg:.5}"
    print(print_str.format(maeAvg=maeAvg))

    #print mean test_MAPE for all folds
    scores['test_MAPE'] = scores['test_MAPE']
    mape_avg = scores['test_MAPE'].mean()
    print_str = "The average MAE percentage (MAPE) for all cv folds is: \t {mape_avg:.5}"
    print(print_str.format(mape_avg=mape_avg))

    #print mean MAE for all folds 
    RMSEavg = scores['test_RMSE'].mean()
    print_str = "The average RMSE for all cv folds is: \t\t\t {RMSEavg:.5}"
    print(print_str.format(RMSEavg=RMSEavg))
    print('*********************************************************')

    print('Cross Validation Fold Mean Error Scores')
    scoresResults = pd.DataFrame()
    scoresResults['MAE'] = scores['test_MAE']
    scoresResults['MAPE'] = scores['test_MAPE']
    scoresResults['RMSE'] = scores['test_RMSE']
    return scoresResults


In [9]:
#Make new estimator compatible for use with GridSearchCV() and cross_validate()
# -  Cap predict function for LinearRegression between 0 and 100
# -  See: Roll your own estimator links above for details. 
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import LinearRegression

class CappedLinearRegression(LinearRegression):

    def predict(self, X):
        return np.clip(super(CappedLinearRegression, self).predict(X), 0, 100) 

In [10]:
#Create a Linear Regression object and perform a grid search to find the best parameters
linreg = CappedLinearRegression()
parameters = {'normalize':(True,False), 'fit_intercept':(True,False)}

#Create a grid search object using the  
from sklearn.model_selection import GridSearchCV
regGridSearch = GridSearchCV(estimator=linreg
                   , verbose=1 # low verbosity
                   , param_grid=parameters
                   , cv=cv # KFolds = 10
                   , scoring=mae_scorer)

#Perform hyperparameter search to find the best combination of parameters for our data
regGridSearch.fit(X, y)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\linear_model\_base.py", line 505, in fit
    X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packag

Traceback (most recent call last):
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\linear_model\_base.py", line 505, in fit
    X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  Fil

Traceback (most recent call last):
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\linear_model\_base.py", line 505, in fit
    X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  Fil

Traceback (most recent call last):
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\linear_model\_base.py", line 505, in fit
    X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  Fil

Traceback (most recent call last):
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\linear_model\_base.py", line 505, in fit
    X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  Fil

Traceback (most recent call last):
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\linear_model\_base.py", line 505, in fit
    X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  Fil

Traceback (most recent call last):
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\linear_model\_base.py", line 505, in fit
    X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  Fil

ValueError: could not convert string to float: '11/26/2014'

In [11]:
#Print the parameterization of the best estimator
regGridSearch.best_estimator_

CappedLinearRegression(normalize=True)

In [12]:
#Create CappedLinearRegression predictions between 0 and 100% using the best parameters for our Linear Regression object
regEstimator = regGridSearch.best_estimator_

#Evaluate the regression estimator above using our pre-defined cross validation and scoring metrics. 
EvaluateRegressionEstimator(regEstimator, X, y, cv)

Traceback (most recent call last):
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\linear_model\_base.py", line 505, in fit
    X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  Fil

Traceback (most recent call last):
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\linear_model\_base.py", line 505, in fit
    X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  Fil

The average MAE for all cv folds is: 			 nan
The average MAE percentage (MAPE) for all cv folds is: 	 nan
The average RMSE for all cv folds is: 			 nan
*********************************************************
Cross Validation Fold Mean Error Scores


Traceback (most recent call last):
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\linear_model\_base.py", line 505, in fit
    X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\sabri\Anaconda3\envs\ML7331\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  Fil

Unnamed: 0,MAE,MAPE,RMSE
0,,,
1,,,
2,,,
3,,,
4,,,
5,,,
6,,,
7,,,
8,,,
9,,,
