# Linear Regression

In [23]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [24]:
crime = pd.read_csv('Training_and_Test_Set.csv') # read in the csv %%file

In [25]:
crime.head()

Unnamed: 0,CMPLNT_FR_DT,Daytime,Day_Name,Month,Day,Year,Season,GeoCell,BORO_NM,PRCP,...,TMIN,TMAX,Population,PC_INCOME,Hm_Sls_Price_Range,Holiday,Event,is_Holiday,is_Event,count_cmplnt
0,11/26/2014,Morning,Wednesday,November,26,2014,Fall,66,QUEENS,1.24,...,34,51,2250002,40997,Medium,,,0,0,1
1,12/1/2014,Late Night,Monday,December,1,2014,Winter,60,QUEENS,0.09,...,42,65,2250002,40997,Medium,,,0,0,1
2,11/10/2015,Morning,Tuesday,November,10,2015,Fall,15,BROOKLYN,0.26,...,51,57,2552911,43915,High,,,0,0,2
3,2/4/2014,Morning,Tuesday,February,4,2014,Winter,48,QUEENS,0.0,...,22,35,2250002,40997,Medium,,,0,0,3
4,8/25/2015,Late Night,Tuesday,August,25,2015,Summer,35,BROOKLYN,0.0,...,73,90,2552911,43915,High,,,0,0,1


In [26]:
# perform one-hot encoding of the categorical data 
tmp_df = pd.get_dummies(crime.BORO_NM,prefix='BORO_NM')
crime = pd.concat((crime,tmp_df),axis=1) # add back into the dataframe

tmp_df = pd.get_dummies(crime.Day_Name,prefix='Day_Name')
crime = pd.concat((crime,tmp_df),axis=1) # add back into the dataframe

tmp_df = pd.get_dummies(crime.Month,prefix='Month')
crime = pd.concat((crime,tmp_df),axis=1) # add back into the dataframe

tmp_df = pd.get_dummies(crime.Season,prefix='Season')
crime = pd.concat((crime,tmp_df),axis=1) # add back into the dataframe

tmp_df = pd.get_dummies(crime.Daytime,prefix='Daytime')
crime = pd.concat((crime,tmp_df),axis=1) # add back into the dataframe

tmp_df = pd.get_dummies(crime.Daytime,prefix='Hm_Sls_Price_Range')
crime = pd.concat((crime,tmp_df),axis=1) # add back into the dataframe

In [27]:
#Now we drop the original variables
if 'BORO_NM' in crime:    
    del crime['BORO_NM'] # get rid of the original category as it is now one-hot encoded
if 'Day_Name' in crime:    
    del crime['Day_Name'] # get rid of the original category as it is now one-hot encoded
if 'Month' in crime:    
    del crime['Month'] # get rid of the original category as it is now one-hot encoded
if 'Season' in crime:    
    del crime['Season'] # get rid of the original category as it is now one-hot encoded
if 'Daytime' in crime:    
    del crime['Daytime'] # get rid of the original category as it is now one-hot encoded
if 'Hm_Sls_Price_Range' in crime:    
    del crime['Hm_Sls_Price_Range'] # get rid of the original category as it is now one-hot encoded

In [28]:
#binary Y/N converted to Is Holiday or Is Event.  Can drop originals
if 'Holiday' in crime:    
    del crime['Holiday']
if 'Event' in crime:    
    del crime['Event']

In [29]:
#check datatypes for conversions needed
crime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170650 entries, 0 to 170649
Data columns (total 53 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   CMPLNT_FR_DT                      170650 non-null  object 
 1   Day                               170650 non-null  int64  
 2   Year                              170650 non-null  int64  
 3   GeoCell                           170650 non-null  int64  
 4   PRCP                              170650 non-null  float64
 5   SNOW                              170650 non-null  float64
 6   TMIN                              170650 non-null  int64  
 7   TMAX                              170650 non-null  int64  
 8   Population                        170650 non-null  int64  
 9   PC_INCOME                         170650 non-null  int64  
 10  is_Holiday                        170650 non-null  int64  
 11  is_Event                          170650 non-null  i

In [None]:
#split test and training set

# to use the cross validation object in scikit learn, we need to grab an instance
#    of the object and set it up. This object will be able to split our data into 
#    training and testing splits
num_cv_iterations = 3
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                         test_size  = 0.2)
                         
print(cv_object)

In [30]:
#evaluating a linear regression model
# create x explanatory and y response variables for regression
y = crime['count_cmplnt']

#Delete the response variable from our training dataset
X = crime.drop('count_cmplnt', axis=1)
X = crime.drop('CMPLNT_FR_DT', axis=1) #dropping this because I have day, month and year as columns

#inspect data 
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170650 entries, 0 to 170649
Data columns (total 52 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   Day                               170650 non-null  int64  
 1   Year                              170650 non-null  int64  
 2   GeoCell                           170650 non-null  int64  
 3   PRCP                              170650 non-null  float64
 4   SNOW                              170650 non-null  float64
 5   TMIN                              170650 non-null  int64  
 6   TMAX                              170650 non-null  int64  
 7   Population                        170650 non-null  int64  
 8   PC_INCOME                         170650 non-null  int64  
 9   is_Holiday                        170650 non-null  int64  
 10  is_Event                          170650 non-null  int64  
 11  count_cmplnt                      170650 non-null  i

In [36]:
y.describe()

count    170650.000000
mean          4.714122
std           5.140051
min           1.000000
25%           1.000000
50%           3.000000
75%           6.000000
max         132.000000
Name: count_cmplnt, dtype: float64

In [10]:
#Divide data into test and training splits
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=10, test_size=0.10, random_state=0)

In [11]:
#Use mean absolute error (MAE) to score the regression models created 
#(the scale of MAE is identical to the response variable)
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error

#Function for Root mean squared error
#https://stackoverflow.com/questions/17197492/root-mean-square-error-in-python
def rmse(y_actual, y_predicted):
    return np.sqrt(mean_squared_error(y_actual, y_predicted))

#Function for Mean Absolute Percentage Error (MAPE) - Untested
#Adapted from - https://stackoverflow.com/questions/42250958/how-to-optimize-mape-code-in-python
def mape(y_actual, y_predicted): 
    mask = y_actual != 0
    return (np.fabs(y_actual - y_predicted)/y_actual)[mask].mean() * 100

#Create scorers for rmse and mape functions
mae_scorer = make_scorer(score_func=mean_absolute_error, greater_is_better=False)
rmse_scorer = make_scorer(score_func=rmse, greater_is_better=False)
mape_scorer = make_scorer(score_func=mape, greater_is_better=False)

#Make scorer array to pass into cross_validate() function for producing mutiple scores for each cv fold.
errorScoring = {'MAE':  mae_scorer, 
                'RMSE': rmse_scorer,
                'MAPE': mape_scorer
               } 

In [12]:
from sklearn.model_selection import cross_validate

def EvaluateRegressionEstimator(regEstimator, X, y, cv):
    
    scores = cross_validate(regEstimator, X, y, scoring=errorScoring, cv=cv, return_train_score=True)

    #cross val score sign-flips the outputs of MAE
    # https://github.com/scikit-learn/scikit-learn/issues/2439
    scores['test_MAE'] = scores['test_MAE'] * -1
    scores['test_MAPE'] = scores['test_MAPE'] * -1
    scores['test_RMSE'] = scores['test_RMSE'] * -1

    #print mean MAE for all folds 
    maeAvg = scores['test_MAE'].mean()
    print_str = "The average MAE for all cv folds is: \t\t\t {maeAvg:.5}"
    print(print_str.format(maeAvg=maeAvg))

    #print mean test_MAPE for all folds
    scores['test_MAPE'] = scores['test_MAPE']
    mape_avg = scores['test_MAPE'].mean()
    print_str = "The average MAE percentage (MAPE) for all cv folds is: \t {mape_avg:.5}"
    print(print_str.format(mape_avg=mape_avg))

    #print mean MAE for all folds 
    RMSEavg = scores['test_RMSE'].mean()
    print_str = "The average RMSE for all cv folds is: \t\t\t {RMSEavg:.5}"
    print(print_str.format(RMSEavg=RMSEavg))
    print('*********************************************************')

    print('Cross Validation Fold Mean Error Scores')
    scoresResults = pd.DataFrame()
    scoresResults['MAE'] = scores['test_MAE']
    scoresResults['MAPE'] = scores['test_MAPE']
    scoresResults['RMSE'] = scores['test_RMSE']
    return scoresResults


In [15]:
#Create a Linear Regression object and perform a grid search to find the best parameters
linreg = LinearRegression()
parameters = {'normalize':(True,False), 'fit_intercept':(True,False)}

#Create a grid search object using the  
from sklearn.model_selection import GridSearchCV
regGridSearch = GridSearchCV(estimator=linreg
                   , verbose=1 # low verbosity
                   , param_grid=parameters
                   , cv=cv # KFolds = 10
                   , scoring=mae_scorer)

#Perform hyperparameter search to find the best combination of parameters for our data
regGridSearch.fit(X, y)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   32.2s finished


GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, train_size=None),
             estimator=LinearRegression(),
             param_grid={'fit_intercept': (True, False),
                         'normalize': (True, False)},
             scoring=make_scorer(mean_absolute_error, greater_is_better=False),
             verbose=1)

In [16]:
#Print the parameterization of the best estimator
regGridSearch.best_estimator_

LinearRegression(normalize=True)

In [17]:
regEstimator = regGridSearch.best_estimator_

#Evaluate the regression estimator above using our pre-defined cross validation and scoring metrics. 
EvaluateRegressionEstimator(regEstimator, X, y, cv)

The average MAE for all cv folds is: 			 9.5717e-15
The average MAE percentage (MAPE) for all cv folds is: 	 4.5023e-13
The average RMSE for all cv folds is: 			 1.2041e-14
*********************************************************
Cross Validation Fold Mean Error Scores


Unnamed: 0,MAE,MAPE,RMSE
0,1.284496e-14,5.772997e-13,1.57647e-14
1,6.592266e-15,3.180736e-13,8.188876e-15
2,1.109276e-14,5.282709e-13,1.369531e-14
3,1.75369e-14,8.245726e-13,2.263585e-14
4,6.25389e-15,3.140672e-13,8.066542e-15
5,6.586534e-15,3.012246e-13,8.326144e-15
6,8.580043e-15,4.013461e-13,1.068112e-14
7,7.607262e-15,3.488059e-13,9.69519e-15
8,7.988948e-15,3.748987e-13,1.00816e-14
9,1.063328e-14,5.137466e-13,1.327062e-14


In [39]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

cv = StratifiedShuffleSplit(test_size = 0.5, train_size=0.5, random_state = 1).split(X,y)

# fill in the training and testing data and save as separate variables
for trainXY, testXY in cv:
    # note that these are sparse matrices
    X_train = X[trainXY] 
    X_test = X[testXY] 
    y_train = y[trainXY]
    y_test = y[testXY]

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [40]:
from sklearn.linear_model import LinearRegression

In [42]:
reg = LinearRegression().fit(X, y)
reg.score(X, y)

1.0