In [2]:
import pandas as pd
acs =pd.read_csv('acs_ny.csv')
print(acs.columns)

Index(['Acres', 'FamilyIncome', 'FamilyType', 'NumBedrooms', 'NumChildren',
       'NumPeople', 'NumRooms', 'NumUnits', 'NumVehicles', 'NumWorkers',
       'OwnRent', 'YearBuilt', 'HouseCosts', 'ElectricBill', 'FoodStamp',
       'HeatingFuel', 'Insurance', 'Language'],
      dtype='object')


In [3]:
from patsy import dmatrices

#sequential strings get concatenated together in Python

response,predictors= dmatrices(
   "FamilyIncome~NumBedrooms + NumChildren +NumPeople +"
   "NumRooms + NumUnits +NumVehicles +NumWorkers+OwnRent+"
   "YearBuilt +ElectricBill+FoodStamp+HeatingFuel +"
    "Insurance + Language",
     data= acs,
  
)

In [4]:
from sklearn.model_selection import train_test_split

X_train,X_test, y_train, y_test = train_test_split(
    predictors, response, random_state = 0
)



In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler 

lr= make_pipeline(
    StandardScaler(with_mean =False), LinearRegression()
)
lr = lr.fit(X_train,y_train)
print(lr)


Pipeline(steps=[('standardscaler', StandardScaler(with_mean=False)),
                ('linearregression', LinearRegression())])


In [6]:
model_coefs = pd.DataFrame(
    data=list(
        zip(
            predictors.design_info.column_names,
            lr.named_steps["linearregression"].coef_[0],
        )
    ),
    columns = ["variable", "coef_lr"],
)
print(model_coefs)

                       variable       coef_lr
0                     Intercept  6.682651e-14
1   NumUnits[T.Single attached]  8.936384e+03
2   NumUnits[T.Single detached]  7.833083e+03
3           OwnRent[T.Outright]  1.818126e+03
4             OwnRent[T.Rented]  1.613850e+03
5        YearBuilt[T.1940-1949]  2.204366e+03
6        YearBuilt[T.1950-1959]  5.775934e+03
7        YearBuilt[T.1960-1969]  4.345828e+03
8        YearBuilt[T.1970-1979]  3.911641e+03
9        YearBuilt[T.1980-1989]  6.042011e+03
10       YearBuilt[T.1990-1999]  7.411342e+03
11       YearBuilt[T.2000-2004]  5.382651e+03
12            YearBuilt[T.2005]  4.008482e+03
13            YearBuilt[T.2006]  3.226638e+03
14            YearBuilt[T.2007]  3.369052e+03
15            YearBuilt[T.2008]  1.276426e+03
16            YearBuilt[T.2009]  1.318172e+03
17            YearBuilt[T.2010]  2.792588e+03
18     YearBuilt[T.Before 1939]  3.959058e+03
19             FoodStamp[T.Yes] -6.988396e+03
20   HeatingFuel[T.Electricity]  2

In [7]:
#score on the training data 

print(lr.score(X_train, y_train))

#score on the testing data
print(lr.score(X_test, y_test))

0.27430887322923125
0.261070805023913


In [8]:
# R2 (coeeficient of determination)

#LASSO( Least Absolute Shrinkage and selection operator)  is one type of regualarization technique. Also known as
#L1 Regularization

In [9]:
from sklearn.linear_model import Lasso

lasso = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(max_iter= 10000, random_state=42)
)

lasso = lasso.fit(X_test,y_test)
print(lasso)


Pipeline(steps=[('standardscaler', StandardScaler(with_mean=False)),
                ('lasso', Lasso(max_iter=10000, random_state=42))])


In [10]:
coefs_lasso = pd.DataFrame(
    data = list(
        zip(
            predictors.design_info.column_names,
            lasso.named_steps["lasso"].coef_.tolist(),
        )
    ),
    columns = ["variable" , "coef_lasso"],
)
model_coefs = pd.merge(model_coefs, coefs_lasso, on='variable')
print(model_coefs)

                       variable       coef_lr    coef_lasso
0                     Intercept  6.682651e-14      0.000000
1   NumUnits[T.Single attached]  8.936384e+03   9868.623799
2   NumUnits[T.Single detached]  7.833083e+03   8924.389171
3           OwnRent[T.Outright]  1.818126e+03   4202.789333
4             OwnRent[T.Rented]  1.613850e+03   2401.078639
5        YearBuilt[T.1940-1949]  2.204366e+03  -9668.313004
6        YearBuilt[T.1950-1959]  5.775934e+03 -11094.600837
7        YearBuilt[T.1960-1969]  4.345828e+03 -10366.952862
8        YearBuilt[T.1970-1979]  3.911641e+03 -10221.594524
9        YearBuilt[T.1980-1989]  6.042011e+03  -6626.846652
10       YearBuilt[T.1990-1999]  7.411342e+03  -8764.121654
11       YearBuilt[T.2000-2004]  5.382651e+03  -4134.316213
12            YearBuilt[T.2005]  4.008482e+03  -2912.219235
13            YearBuilt[T.2006]  3.226638e+03  -1809.412093
14            YearBuilt[T.2007]  3.369052e+03  -2405.616328
15            YearBuilt[T.2008]  1.27642

In [33]:
print(lasso.score(X_train,y_train))
print(lasso.score(X_test, y_test))

0.2677491984844108
0.26786561861744373


In [12]:
#Ridge Regression , L2 Regularization 

In [13]:
from sklearn.linear_model import Ridge
ridge = make_pipeline(
    StandardScaler(with_mean=False),Ridge(random_state=42)
)
ridge = ridge.fit(X_train, y_train)
print(ridge)


Pipeline(steps=[('standardscaler', StandardScaler(with_mean=False)),
                ('ridge', Ridge(random_state=42))])


In [14]:
coefs_ridge =pd.DataFrame(
    data=list(
        zip(
            predictors.design_info.column_names,
            ridge.named_steps["ridge"].coef_.tolist(),
        )
    ),
    columns= ["variable","coef_ridge"],
)
model_coefs=pd.merge(model_coefs, coefs_ridge , on="variable")
print(model_coefs)

                       variable       coef_lr    coef_lasso    coef_ridge
0                     Intercept  6.682651e-14      0.000000      0.000000
1   NumUnits[T.Single attached]  8.936384e+03   9868.623799   8933.916923
2   NumUnits[T.Single detached]  7.833083e+03   8924.389171   7830.186855
3           OwnRent[T.Outright]  1.818126e+03   4202.789333   1817.081756
4             OwnRent[T.Rented]  1.613850e+03   2401.078639   1612.894064
5        YearBuilt[T.1940-1949]  2.204366e+03  -9668.313004   1872.387871
6        YearBuilt[T.1950-1959]  5.775934e+03 -11094.600837   5311.584955
7        YearBuilt[T.1960-1969]  4.345828e+03 -10366.952862   3944.776874
8        YearBuilt[T.1970-1979]  3.911641e+03 -10221.594524   3538.961340
9        YearBuilt[T.1980-1989]  6.042011e+03  -6626.846652   5693.363620
10       YearBuilt[T.1990-1999]  7.411342e+03  -8764.121654   7059.944934
11       YearBuilt[T.2000-2004]  5.382651e+03  -4134.316213   5125.923192
12            YearBuilt[T.2005]  4.008

In [15]:
#combines lasso n ridge
from sklearn.linear_model import ElasticNet
en= ElasticNet(random_state=42).fit(X_train, y_train)

coefs_en = pd.DataFrame(
    list(zip( predictors.design_info.column_names , en.coef_)),
    columns =["variable","coef_en"],
)

model_coefs= pd.merge(model_coefs, coefs_en, on="variable")
print(model_coefs)



                       variable       coef_lr    coef_lasso    coef_ridge  \
0                     Intercept  6.682651e-14      0.000000      0.000000   
1   NumUnits[T.Single attached]  8.936384e+03   9868.623799   8933.916923   
2   NumUnits[T.Single detached]  7.833083e+03   8924.389171   7830.186855   
3           OwnRent[T.Outright]  1.818126e+03   4202.789333   1817.081756   
4             OwnRent[T.Rented]  1.613850e+03   2401.078639   1612.894064   
5        YearBuilt[T.1940-1949]  2.204366e+03  -9668.313004   1872.387871   
6        YearBuilt[T.1950-1959]  5.775934e+03 -11094.600837   5311.584955   
7        YearBuilt[T.1960-1969]  4.345828e+03 -10366.952862   3944.776874   
8        YearBuilt[T.1970-1979]  3.911641e+03 -10221.594524   3538.961340   
9        YearBuilt[T.1980-1989]  6.042011e+03  -6626.846652   5693.363620   
10       YearBuilt[T.1990-1999]  7.411342e+03  -8764.121654   7059.944934   
11       YearBuilt[T.2000-2004]  5.382651e+03  -4134.316213   5125.923192   

In [16]:
#cross validation used to pick optimal parameters for regularization. it can be used to try out various 
#combinations of these hyper-parameters to pick the best model

from sklearn.linear_model import ElasticNetCV

en_cv =ElasticNetCV(cv=5, random_state=42).fit(
    X_train,y_train.ravel()  #ravel is used to remove the 1d warning
)
coefs_en_cv= pd.DataFrame(
    list(zip(predictors.design_info.column_names, en_cv.coef_)),
    columns=["variable","coef_en_cv"],
)
model=coefs = pd.merge(model_coefs, coefs_en_cv, on= "variable")
print(model_coefs)

                       variable       coef_lr    coef_lasso    coef_ridge  \
0                     Intercept  6.682651e-14      0.000000      0.000000   
1   NumUnits[T.Single attached]  8.936384e+03   9868.623799   8933.916923   
2   NumUnits[T.Single detached]  7.833083e+03   8924.389171   7830.186855   
3           OwnRent[T.Outright]  1.818126e+03   4202.789333   1817.081756   
4             OwnRent[T.Rented]  1.613850e+03   2401.078639   1612.894064   
5        YearBuilt[T.1940-1949]  2.204366e+03  -9668.313004   1872.387871   
6        YearBuilt[T.1950-1959]  5.775934e+03 -11094.600837   5311.584955   
7        YearBuilt[T.1960-1969]  4.345828e+03 -10366.952862   3944.776874   
8        YearBuilt[T.1970-1979]  3.911641e+03 -10221.594524   3538.961340   
9        YearBuilt[T.1980-1989]  6.042011e+03  -6626.846652   5693.363620   
10       YearBuilt[T.1990-1999]  7.411342e+03  -8764.121654   7059.944934   
11       YearBuilt[T.2000-2004]  5.382651e+03  -4134.316213   5125.923192   