In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from patsy import dmatrices
from sklearn.model_selection import KFold, cross_val_score

In [3]:
housing = pd.read_csv('DataFiles/housing_renamed.csv')
print(housing)
print(housing.columns)

              neighborhood            type  units  year_built   sq_ft  \
0                FINANCIAL  R9-CONDOMINIUM     42      1920.0   36500   
1                FINANCIAL  R4-CONDOMINIUM     78      1985.0  126420   
2                FINANCIAL  RR-CONDOMINIUM    500         NaN  554174   
3                FINANCIAL  R4-CONDOMINIUM    282      1930.0  249076   
4                  TRIBECA  R4-CONDOMINIUM    239      1985.0  219495   
...                    ...             ...    ...         ...     ...   
2621              ROSEBANK  R4-CONDOMINIUM     52         NaN   62391   
2622  ARROCHAR-SHORE ACRES  R4-CONDOMINIUM    102      1987.0   90618   
2623            GRANT CITY  R4-CONDOMINIUM    100      1986.0   78903   
2624            GRANT CITY  R4-CONDOMINIUM    159      1961.0  166712   
2625           GREAT KILLS  R4-CONDOMINIUM     67      1965.0  108864   

        income  income_per_sq_ft  expense  expense_per_sq_ft  net_income  \
0      1332615             36.51   342005      

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
 pd.get_dummies(
 housing[["units", "sq_ft", "boro"]], drop_first=True
 ),

 housing["value_per_sq_ft"],
 test_size=0.20,
 random_state=42,
)

In [5]:
lr = LinearRegression().fit(X_train, y_train)
print(lr.score(X_test, y_test))

0.6137125285030869


In [6]:
y, X = dmatrices(
    "value_per_sq_ft ~ units + sq_ft + boro",
    housing,
    return_type="dataframe",
)
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.20, random_state=42
)

lr = LinearRegression().fit(X_train, y_train)
print(lr.score(X_test, y_test))

0.613712528503469


In [8]:
housing = pd.read_csv('DataFiles/housing_renamed.csv')
print(housing)

              neighborhood            type  units  year_built   sq_ft  \
0                FINANCIAL  R9-CONDOMINIUM     42      1920.0   36500   
1                FINANCIAL  R4-CONDOMINIUM     78      1985.0  126420   
2                FINANCIAL  RR-CONDOMINIUM    500         NaN  554174   
3                FINANCIAL  R4-CONDOMINIUM    282      1930.0  249076   
4                  TRIBECA  R4-CONDOMINIUM    239      1985.0  219495   
...                    ...             ...    ...         ...     ...   
2621              ROSEBANK  R4-CONDOMINIUM     52         NaN   62391   
2622  ARROCHAR-SHORE ACRES  R4-CONDOMINIUM    102      1987.0   90618   
2623            GRANT CITY  R4-CONDOMINIUM    100      1986.0   78903   
2624            GRANT CITY  R4-CONDOMINIUM    159      1961.0  166712   
2625           GREAT KILLS  R4-CONDOMINIUM     67      1965.0  108864   

        income  income_per_sq_ft  expense  expense_per_sq_ft  net_income  \
0      1332615             36.51   342005      

In [9]:
kf = KFold(n_splits=5)
y, X = dmatrices('value_per_sq_ft ~ units + sq_ft + boro', housing)

In [10]:
coefs = []
scores = []
for train, test in kf.split(X):
  X_train, X_test = X[train], X[test]
  y_train, y_test = y[train], y[test]
  lr = LinearRegression().fit(X_train, y_train)
  coefs.append(pd.DataFrame(lr.coef_))
  scores.append(lr.score(X_test, y_test))

In [11]:
coefs_df = pd.concat(coefs)
coefs_df.columns = X.design_info.column_names
print(coefs_df)

   Intercept  boro[T.Brooklyn]  boro[T.Manhattan]  boro[T.Queens]  \
0        0.0         33.369037         129.904011       32.103100   
0        0.0         32.889925         116.957385       31.295956   
0        0.0         30.975560         141.859327       32.043449   
0        0.0         41.449196         130.779013       33.050968   
0        0.0        -38.511915          56.069855      -17.557939   

   boro[T.Staten Island]     units     sq_ft  
0              -4.381085 -0.205890  0.000220  
0              -4.919232 -0.146180  0.000155  
0              -4.379916 -0.179671  0.000194  
0              -3.430209 -0.207904  0.000232  
0               0.000000 -0.145829  0.000202  


In [12]:
print(scores)

[0.027314162909394923, -0.553836221218611, -0.1563637168803238, -0.32342020618600453, -1.6929655586236945]


In [13]:
model = LinearRegression()
scores = cross_val_score(model, X, y, cv=5)
print(scores)

[ 0.02731416 -0.55383622 -0.15636372 -0.32342021 -1.69296556]


In [14]:
print(scores.mean())

-0.5398543079998477


In [15]:
y1, X1 = dmatrices(
   "value_per_sq_ft ~ units + sq_ft + boro", housing)

y2, X2 = dmatrices("value_per_sq_ft ~ units*sq_ft + boro", housing)

y3, X3 = dmatrices(
  "value_per_sq_ft ~ units + sq_ft*boro + type", housing
)

y4, X4 = dmatrices(
    "value_per_sq_ft ~ units + sq_ft*boro + sq_ft*type", housing
)

y5, X5 = dmatrices("value_per_sq_ft ~ boro + type", housing)

In [16]:
model = LinearRegression()

scores1 = cross_val_score(model, X1, y1, cv=5)
scores2 = cross_val_score(model, X2, y2, cv=5)
scores3 = cross_val_score(model, X3, y3, cv=5)
scores4 = cross_val_score(model, X4, y4, cv=5)
scores5 = cross_val_score(model, X5, y5, cv=5)

In [17]:
scores_df = pd.DataFrame([scores1, scores2, scores3, scores4, scores5])
print(scores_df.apply(np.mean, axis=1))

0   -5.398543e-01
1   -1.088184e+00
2   -4.719338e+25
3   -6.826894e+25
4   -2.355763e+24
dtype: float64
