# Linear Regression

In [23]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [24]:
crime = pd.read_csv('Training_and_Test_Set.csv') # read in the csv %%file

In [25]:
crime.head()

Unnamed: 0,CMPLNT_FR_DT,Daytime,Day_Name,Month,Day,Year,Season,GeoCell,BORO_NM,PRCP,...,TMIN,TMAX,Population,PC_INCOME,Hm_Sls_Price_Range,Holiday,Event,is_Holiday,is_Event,count_cmplnt
0,11/26/2014,Morning,Wednesday,November,26,2014,Fall,66,QUEENS,1.24,...,34,51,2250002,40997,Medium,,,0,0,1
1,12/1/2014,Late Night,Monday,December,1,2014,Winter,60,QUEENS,0.09,...,42,65,2250002,40997,Medium,,,0,0,1
2,11/10/2015,Morning,Tuesday,November,10,2015,Fall,15,BROOKLYN,0.26,...,51,57,2552911,43915,High,,,0,0,2
3,2/4/2014,Morning,Tuesday,February,4,2014,Winter,48,QUEENS,0.0,...,22,35,2250002,40997,Medium,,,0,0,3
4,8/25/2015,Late Night,Tuesday,August,25,2015,Summer,35,BROOKLYN,0.0,...,73,90,2552911,43915,High,,,0,0,1


In [26]:
# perform one-hot encoding of the categorical data 
tmp_df = pd.get_dummies(crime.BORO_NM,prefix='BORO_NM')
crime = pd.concat((crime,tmp_df),axis=1) # add back into the dataframe

tmp_df = pd.get_dummies(crime.Day_Name,prefix='Day_Name')
crime = pd.concat((crime,tmp_df),axis=1) # add back into the dataframe

tmp_df = pd.get_dummies(crime.Month,prefix='Month')
crime = pd.concat((crime,tmp_df),axis=1) # add back into the dataframe

tmp_df = pd.get_dummies(crime.Season,prefix='Season')
crime = pd.concat((crime,tmp_df),axis=1) # add back into the dataframe

tmp_df = pd.get_dummies(crime.Daytime,prefix='Daytime')
crime = pd.concat((crime,tmp_df),axis=1) # add back into the dataframe

tmp_df = pd.get_dummies(crime.Daytime,prefix='Hm_Sls_Price_Range')
crime = pd.concat((crime,tmp_df),axis=1) # add back into the dataframe

In [27]:
#Now we drop the original variables
if 'BORO_NM' in crime:    
    del crime['BORO_NM'] # get rid of the original category as it is now one-hot encoded
if 'Day_Name' in crime:    
    del crime['Day_Name'] # get rid of the original category as it is now one-hot encoded
if 'Month' in crime:    
    del crime['Month'] # get rid of the original category as it is now one-hot encoded
if 'Season' in crime:    
    del crime['Season'] # get rid of the original category as it is now one-hot encoded
if 'Daytime' in crime:    
    del crime['Daytime'] # get rid of the original category as it is now one-hot encoded
if 'Hm_Sls_Price_Range' in crime:    
    del crime['Hm_Sls_Price_Range'] # get rid of the original category as it is now one-hot encoded

In [28]:
#binary Y/N converted to Is Holiday or Is Event.  Can drop originals
if 'Holiday' in crime:    
    del crime['Holiday']
if 'Event' in crime:    
    del crime['Event']

In [29]:
#check datatypes for conversions needed
crime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170650 entries, 0 to 170649
Data columns (total 53 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   CMPLNT_FR_DT                      170650 non-null  object 
 1   Day                               170650 non-null  int64  
 2   Year                              170650 non-null  int64  
 3   GeoCell                           170650 non-null  int64  
 4   PRCP                              170650 non-null  float64
 5   SNOW                              170650 non-null  float64
 6   TMIN                              170650 non-null  int64  
 7   TMAX                              170650 non-null  int64  
 8   Population                        170650 non-null  int64  
 9   PC_INCOME                         170650 non-null  int64  
 10  is_Holiday                        170650 non-null  int64  
 11  is_Event                          170650 non-null  i

In [30]:
#evaluating a linear regression model
# create x explanatory and y response variables for regression
y = crime['count_cmplnt']

#Delete the response variable from our training dataset
X = crime.drop('count_cmplnt', axis=1)
X = crime.drop('CMPLNT_FR_DT', axis=1) #dropping this because I have day, month and year as columns

#inspect data 
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170650 entries, 0 to 170649
Data columns (total 52 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   Day                               170650 non-null  int64  
 1   Year                              170650 non-null  int64  
 2   GeoCell                           170650 non-null  int64  
 3   PRCP                              170650 non-null  float64
 4   SNOW                              170650 non-null  float64
 5   TMIN                              170650 non-null  int64  
 6   TMAX                              170650 non-null  int64  
 7   Population                        170650 non-null  int64  
 8   PC_INCOME                         170650 non-null  int64  
 9   is_Holiday                        170650 non-null  int64  
 10  is_Event                          170650 non-null  int64  
 11  count_cmplnt                      170650 non-null  i

In [36]:
y.describe()

count    170650.000000
mean          4.714122
std           5.140051
min           1.000000
25%           1.000000
50%           3.000000
75%           6.000000
max         132.000000
Name: count_cmplnt, dtype: float64

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [63]:
from sklearn.preprocessing import StandardScaler

# we want to normalize the features based upon the mean and standard deviation of each column. 
# However, we do not want to accidentally use the testing data to find out the mean and std (this would be snooping)
# to Make things easier, let's start by just using whatever was last stored in the variables:
##    X_train , y_train , X_test, y_test (they were set in a for loop above)

# scale attributes by the training set
scl_obj = StandardScaler()
scl_obj.fit(X_train) # find scalings for each column that make this zero mean and unit std
# the line of code above only looks at training data to get mean and std and we can use it 
# to transform new feature data

X_train_scaled = scl_obj.transform(X_train) # apply to training
X_test_scaled = scl_obj.transform(X_test) # apply those means and std to the test set (without snooping at the test set values)

In [64]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
lr = regressor.fit(X_train, y_train)

In [65]:
print(regressor.intercept_)

-1.8383529987175962e-07


In [66]:
coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])
coeff_df

Unnamed: 0,Coefficient
Day,1.775225e-14
Year,9.105538e-11
GeoCell,-1.219318e-13
PRCP,3.719858e-12
SNOW,-1.000935e-12
TMIN,4.452549e-13
TMAX,-4.752865e-13
Population,2.220446e-16
PC_INCOME,9.367507000000001e-17
is_Holiday,-8.762708e-12


In [67]:
print(regressor.coef_)

[ 1.77522512e-14  9.10553799e-11 -1.21931762e-13  3.71985776e-12
 -1.00093545e-12  4.45254944e-13 -4.75286477e-13  2.22044605e-16
  9.36750677e-17 -8.76270844e-12  4.49215014e-12  1.00000000e+00
 -5.99890797e-13 -3.53085616e-13  1.07514149e-12  4.85098073e-14
 -1.69993793e-13 -1.09532088e-12  2.16763239e-13  2.60099451e-13
 -2.38874285e-12  3.03157456e-13  1.52447759e-12  1.15965137e-12
 -3.57100981e-12  2.24171029e-12 -2.56424049e-12  2.92997831e-12
  6.14800095e-13  1.49776806e-12 -3.95095675e-12  2.71163994e-12
  5.75671021e-13 -9.76274183e-14  1.95846507e-12 -2.38082514e-12
 -5.06280781e-13 -2.85351603e-13 -1.90313043e-13  1.02451966e-12
  2.30730799e-13  1.47849961e-12 -1.10374036e-12 -8.08216341e-13
 -1.11068273e-13  3.01777700e-13  2.30695671e-13  1.47849961e-12
 -1.10374210e-12 -8.08218076e-13 -1.11052660e-13  3.01720454e-13]


In [68]:
y_pred = regressor.predict(X_test)

In [69]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df

Unnamed: 0,Actual,Predicted
29879,9,9.0
142968,12,12.0
58631,7,7.0
160497,1,1.0
57495,5,5.0
...,...,...
531,1,1.0
164879,1,1.0
105058,4,4.0
34210,3,3.0


In [70]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 1.298341313810322e-10
Mean Squared Error: 2.453997065027149e-20
Root Mean Squared Error: 1.5665238794947075e-10


In [71]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

In [72]:
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:           count_cmplnt   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 3.262e+23
Date:                Wed, 14 Oct 2020   Prob (F-statistic):               0.00
Time:                        21:38:12   Log-Likelihood:             3.3783e+06
No. Observations:              170650   AIC:                        -6.757e+06
Df Residuals:                  170612   BIC:                        -6.756e+06
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   