In [None]:
# import all libraries
import re
from statistics import mean

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn import linear_model
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer,StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
# NOTE(review): private module path; newer scikit-learn releases no longer
# expose it (normalize is also importable from sklearn.preprocessing) - confirm
# against the pinned scikit-learn version before changing.
from sklearn.preprocessing.data import normalize

# Load the raw 2015 sales table and print a summary of its columns.
df = pd.read_csv("sales_data_2015.csv")
df.info()

In [None]:
# Display the column labels of the raw sales frame.
df.columns

In [None]:
# cleaning option 1 (disabled: every line below is commented out; option 2 in the next cell is the active path)
# df2 = df[['borough', 'bldg_cls_p', 'tax_cls_p', 'tot_sqft', 'land_sqft', 'tot_unit', 'res_unit', 'com_unit', 'price']]
# df2 = df2[(df2.tot_sqft != 0) & (df2.tot_sqft != 1) & (df2.price >= 100) & (df2.land_sqft != 0)]

# df2.dropna(subset = ["bldg_cls_p"], inplace=True)
# df2.dropna(subset = ["tax_cls_p"], inplace=True)

# print("Length of dataset is {}".format(len(df2)))

# def bldg_cls_simple(building):
#     return building[0]

# def tax_cls_simple(building):
#     return building[0]

# df2['cat'] = df2.bldg_cls_p.apply(lambda building: bldg_cls_simple(str(building)))
# df2['tax_cat'] = df2.tax_cls_p.apply(lambda building: tax_cls_simple(str(building)))

# # df2 = pd.concat([df2, pd.get_dummies(df2.tax_cls_p)], axis=1)
# df2 = pd.concat([df2, pd.get_dummies(df2.cat)], axis=1)
# df2 = pd.concat([df2, pd.get_dummies(df2.tax_cat)], axis=1)
# df2 = pd.concat([df2, pd.get_dummies(df2.borough, prefix="b")], axis=1)

# df2 = df2.drop(columns=['bldg_cls_p', 'borough', 'tax_cls_p', 'cat', 'tax_cat'])

In [None]:
# cleaning option 2 (all possible features)
df2 = df[['borough', 'bldg_ctgy','bldg_cls_p', 'tax_cls_p','bldg_cls_s', 'tax_cls_s', 'tot_sqft', 'yr_built', 'land_sqft', 'tot_unit', 'res_unit', 'com_unit', 'price']]

# Row filter: discard records whose square footage is 0 or 1, whose build year
# is 0, or whose price is below 1000.
valid_rows = (
    (df2.tot_sqft != 0) & (df2.tot_sqft != 1)
    & (df2.land_sqft != 0) & (df2.land_sqft != 1)
    & (df2.price >= 1000)
    & (df2.yr_built != 0)
)

# Chain the filter and the null-drop so a fresh frame is produced; calling
# dropna(inplace=True) on a filtered slice can trigger SettingWithCopyWarning.
df2 = df2[valid_rows].dropna(subset=["bldg_cls_p", "tax_cls_p"])

print("Length of dataset is {}".format(len(df2)))

# One-hot encode the categorical columns in a single concat.
# tax_cls_s is neither encoded nor dropped, so it stays in the frame as-is.
df2 = pd.concat(
    [
        df2,
        pd.get_dummies(df2.bldg_cls_s, prefix="bdgp_cls_s"),
        pd.get_dummies(df2.borough, prefix="b"),
        pd.get_dummies(df2.bldg_ctgy, prefix="bldg_ctgy"),
    ],
    axis=1,
)

# Remove the raw categorical columns now that their dummies exist.
df2 = df2.drop(columns=['borough','bldg_ctgy', 'bldg_cls_p', 'tax_cls_p','bldg_cls_s'])

In [None]:
# Display the cleaned, one-hot encoded frame.
df2

In [None]:
# Number of columns after encoding (features plus the price target).
df2.shape[1]

In [None]:
# Separate the sale-price target (as a flat array) from the predictor columns.
Y = np.array(df2["price"]).ravel()
X = df2.drop(columns="price")
print(X.shape, Y.shape)

In [None]:
# cross validation parameters

def nested_cv(MODEL, PARAMS, X, Y, NUM_TRIALS):
    """Run repeated nested cross-validation and summarise R^2 and MAE per trial.

    For every trial a 10-fold outer loop estimates generalisation error while a
    10-fold inner ``GridSearchCV`` tunes ``MODEL`` over ``PARAMS``.

    Feature standardisation is part of the cross-validated estimator, so the
    scaler is fit only on the training portion of each outer fold. Fitting it
    on the full data set beforehand would leak test-fold statistics into the
    scores.

    Parameters
    ----------
    MODEL : estimator
        Unfitted scikit-learn estimator to tune and evaluate.
    PARAMS : dict or list of dict
        Parameter grid forwarded to ``GridSearchCV``.
    X : array-like
        Feature matrix (left unscaled; scaling happens inside the folds).
    Y : array-like
        Target values.
    NUM_TRIALS : int
        Number of repetitions; the trial index seeds the fold shuffling.

    Returns
    -------
    (r2_scores, mae_scores) : tuple of lists
        Each list holds one ``[mean, std]`` pair per trial, computed over the
        outer folds. MAE values come from the ``neg_mean_absolute_error``
        scorer and are therefore negated.
    """
    mae_scores = []
    r2_scores = []

    for i in range(NUM_TRIALS):
        print("At {}th trail".format(i))
        inner_cv = KFold(n_splits=10, random_state=i, shuffle=True)
        outer_cv = KFold(n_splits=10, random_state=i, shuffle=True)

        # Scaler + grid search form one estimator that cross_validate clones
        # and refits on every outer training fold.
        search = GridSearchCV(estimator=MODEL, param_grid=PARAMS, cv=inner_cv)
        scaled_search = make_pipeline(StandardScaler(), search)

        # Score both metrics from a single pass instead of running the whole
        # nested procedure once per metric.
        fold_scores = model_selection.cross_validate(
            scaled_search,
            X=X,
            y=Y,
            cv=outer_cv,
            scoring=('neg_mean_absolute_error', 'r2'),
        )
        mae_folds = fold_scores['test_neg_mean_absolute_error']
        r2_folds = fold_scores['test_r2']

        mae_scores.append([mae_folds.mean(), mae_folds.std()])
        r2_scores.append([r2_folds.mean(), r2_folds.std()])

    return r2_scores, mae_scores

In [None]:
# Keep only features whose absolute Pearson correlation with price reaches the
# threshold; everything else is dropped from X.
CORR_THRESHOLD = 0.2

for column in X.columns:
    corry = np.corrcoef(Y, X[column])[0][1]
    # Test for "strong enough" rather than "too weak": a NaN correlation
    # (e.g. from a constant column) fails every comparison, so it now falls
    # into the drop branch instead of being kept by accident.
    if abs(corry) >= CORR_THRESHOLD:
        print("keep column ", column, corry)
    else:
        X = X.drop(columns=[column])

In [None]:
# Number of features that survived the correlation filter.
X.shape[1]

In [None]:
# Correlation heat-map of price against the currently selected features,
# drawn on an explicit axes and saved to disk.
f, ax = plt.subplots(figsize=(15, 9))

corr = df2[["price", *X.columns]].corr()
heatmap = sns.heatmap(corr, vmin=-1, vmax=1, annot=True, ax=ax)
plt.savefig('corr_heatmap.png', dpi=300, transparent=True)

In [None]:
# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = .30, random_state = 20)

# Standardise with statistics learned from the training split only.
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

# Sale price is a continuous target, so fit a regressor; a classifier would
# treat every distinct price as its own class. The seed makes the forest
# reproducible across reruns.
rf = RandomForestRegressor(max_depth=5, random_state=20)
rf.fit(X_train, y_train)

In [None]:
# Report the fitted forest's default score on each split.
for split_label, split_X, split_y in (('Train:', X_train, y_train), ('Test:', X_test, y_test)):
    print(split_label, rf.score(split_X, split_y))

# Mean absolute error of the forest's predictions on the held-out split.
mae_model = mean_absolute_error(y_true=y_test, y_pred=rf.predict(X_test))

In [None]:
# Show the most recently computed model MAE.
print(mae_model)

In [None]:
# Nested cross-validation of a Lasso model over a wide range of alpha values.
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
model = Lasso(normalize=True, tol=1.0)
nested_lm = nested_cv(MODEL=model, PARAMS=parameters, X=X, Y=Y, NUM_TRIALS=10)

In [None]:
# nested_cv returns (r2_scores, mae_scores); average each metric's per-trial
# [mean, std] pairs and print R^2 first, then MAE.
temp = np.array(nested_lm)
for metric_rows in temp[:2]:
    print(metric_rows.mean(axis=0))

In [None]:
# Nested cross-validation of a Ridge model over the same alpha grid.
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
ridge = Ridge()
nested_rlm = nested_cv(MODEL=ridge, PARAMS=parameters, X=X, Y=Y, NUM_TRIALS=10)

In [None]:
# Average the per-trial [mean, std] pairs: R^2 first, then MAE.
temp = np.array(nested_rlm)
for metric_rows in temp[:2]:
    print(metric_rows.mean(axis=0))

In [None]:
# Candidate forest sizes: two integer points spanning 2..6.
n_estimators = [int(x) for x in np.linspace(start = 2, stop = 6, num = 2, dtype=int)]

parameters = {'n_estimators': n_estimators}

# Sale price is continuous, so evaluate a regressor; a classifier would treat
# every distinct price as a separate class. The seed keeps the nested-CV
# scores reproducible.
rf = RandomForestRegressor(max_depth=1, random_state=0)

rf_nested = nested_cv(rf, parameters, X, Y, 10)

In [None]:
# Average the per-trial [mean, std] pairs: R^2 first, then MAE.
temp = np.array(rf_nested)
for metric_rows in temp[:2]:
    print(metric_rows.mean(axis=0))

In [None]:
# Grid of single-hidden-layer widths (three integer points spanning 1..10),
# written as explicit one-element tuples of plain ints.
parameters = {
              'hidden_layer_sizes': [(int(width),) for width in np.linspace(1, 10, 3, dtype=int)]
             }

# Sale price is continuous, so evaluate a regressor rather than a classifier.
# The seed keeps weight initialisation reproducible.
mlp = MLPRegressor(random_state=0)

mlp_nested = nested_cv(mlp, parameters, X, Y, 10)

In [None]:
# Average the per-trial [mean, std] pairs: R^2 first, then MAE.
temp = np.array(mlp_nested)
for metric_rows in temp[:2]:
    print(metric_rows.mean(axis=0))

In [None]:
# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=.30, random_state=10
)

# Learn the scaling from the training rows, then apply it to both splits.
feature_scaler = StandardScaler().fit(X_train)
X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)

In [None]:
# Fit a Lasso with its default alpha and report its score on both splits.
lm = linear_model.Lasso(tol=1.0, normalize=True)
lm.fit(X_train, y_train)
print('Train:', lm.score(X_train, y_train))
print('Test:', lm.score(X_test, y_test))

mae_model = mean_absolute_error(y_test, lm.predict(X_test))
# Baseline predicts the *training* mean for every test row. Using the test
# mean would leak held-out information and disagree with the other
# evaluation cells in this notebook.
mae_baseline = abs(y_test - y_train.mean()).mean()

print('MAE:', mae_model)
print('MAE (baseline):', mae_baseline)
# Relative change in MAE versus the baseline; a negative value means the
# model's error is lower than the baseline's.
print('Improvement: {:.2f} %'.format(((mae_model-mae_baseline)/ mae_baseline )* 100) )

In [None]:
# Display the names of the features currently in X.
X.columns

In [None]:
# Alpha grid used by the grid searches in the following cells.
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}

In [None]:
# Tune Ridge's alpha with 5-fold cross-validation, selecting on R^2.
ridge = Ridge()
ridge_regressor = GridSearchCV(estimator=ridge, param_grid=parameters, cv=5, scoring='r2')
ridge_regressor.fit(X_train, y_train)

In [None]:
# Best alpha found by the search and its mean cross-validated score.
print(ridge_regressor.best_params_, ridge_regressor.best_score_)

In [None]:
# Constant baseline: predict the training-set mean price for every test row.
mean_model = [y_train.mean()] * len(y_test)

In [None]:
# R^2 of the constant baseline on the test split.
r2_score(y_test, mean_model)

In [None]:
# Score of the tuned ridge model on the held-out split.
print('Test:', ridge_regressor.score(X_test, y_test))

# Baseline: always predict the mean price seen in training.
mae_baseline = np.abs(y_test - y_train.mean()).mean()
mae_model = mean_absolute_error(y_test, ridge_regressor.predict(X_test))

# Relative change in MAE versus the baseline; negative means the model's
# error is lower than the baseline's.
relative_change = (mae_model - mae_baseline) / mae_baseline * 100

print('MAE:', mae_model)
print('MAE (baseline):', mae_baseline)
print('Improvement: {:.2f} %'.format(relative_change))

In [None]:
# Tune Lasso's alpha with 5-fold cross-validation, selecting on R^2.
lasso = Lasso(normalize=True)
lasso_regressor = GridSearchCV(estimator=lasso, param_grid=parameters, cv=5, scoring='r2')
lasso_regressor.fit(X_train, y_train)

In [None]:
# Best alpha found by the search and its mean cross-validated score.
print(lasso_regressor.best_params_, lasso_regressor.best_score_)

In [None]:
# Score of the tuned lasso model on the held-out split.
print('Test:', lasso_regressor.score(X_test, y_test))

mae_model = mean_absolute_error(y_test, lasso_regressor.predict(X_test))
# Baseline predicts the *training* mean for every test row. Using the test
# mean would leak held-out information and disagree with the other
# evaluation cells in this notebook.
mae_baseline = abs(y_test - y_train.mean()).mean()

print('MAE:', mae_model)
print('MAE (baseline):', mae_baseline)
# Relative change in MAE versus the baseline; a negative value means the
# model's error is lower than the baseline's.
print('Improvement: {:.2f} %'.format(((mae_model-mae_baseline)/ mae_baseline )* 100) )

In [None]:
# Ordinary least squares as an unregularised reference model.
lm = LinearRegression()
lm.fit(X_train, y_train)

for split_label, split_X, split_y in (('Train:', X_train, y_train), ('Test:', X_test, y_test)):
    print(split_label, lm.score(split_X, split_y))

# Baseline: always predict the mean price seen in training.
mae_baseline = np.abs(y_test - y_train.mean()).mean()
mae_model = mean_absolute_error(y_test, lm.predict(X_test))

# Relative change in MAE versus the baseline; negative means the model's
# error is lower than the baseline's.
relative_change = (mae_model - mae_baseline) / mae_baseline * 100

print('MAE:', mae_model)
print('MAE (baseline):', mae_baseline)
print('Improvement: {:.2f} %'.format(relative_change))

In [None]:
# lm = LinearRegression(normalize=True)
# lm.fit(X_train, y_train)

# Lasso with a fixed alpha of 20, scored on both splits.
lm = Lasso(alpha=20, normalize=True, tol=1.0)
lm.fit(X_train, y_train)

for split_label, split_X, split_y in (('Train:', X_train, y_train), ('Test:', X_test, y_test)):
    print(split_label, lm.score(split_X, split_y))

# Baseline: always predict the mean price seen in training.
mae_baseline = np.abs(y_test - y_train.mean()).mean()
mae_model = mean_absolute_error(y_test, lm.predict(X_test))

# Relative change in MAE versus the baseline; negative means the model's
# error is lower than the baseline's.
relative_change = (mae_model - mae_baseline) / mae_baseline * 100

print('MAE:', mae_model)
print('MAE (baseline):', mae_baseline)
print('Improvement: {:.2f} %'.format(relative_change))

In [None]:
# Load the second table, using its first column as the index.
# NOTE(review): the file name and the later `vis_feats` list suggest these are
# image-derived features - confirm against the data source.
df3 = pd.read_csv("sales_data_2015_DF-inception-conv.csv", index_col=0)

In [None]:
# Preview the first rows of the second table.
df3.head()

In [None]:
# Row count of the raw sales table, for comparison with the merged frame.
df.shape[0]

In [None]:
# Inner-join the sales rows with the second table on the shared sale id.
new_df = pd.merge(df, df3, on='Sale_id')

In [None]:
# Preview the first rows of the merged frame.
new_df.head()

In [None]:
# Display the column labels of the merged frame.
new_df.columns

In [None]:
# Drop the columns listed below before modelling.
df4 = new_df.drop(columns=['Unnamed: 0', 'Sale_id', 'bbl_id_x', 'year', 'block', 'lot','easmnt', 'bldg_cls_p', 'address', 'apt','zip','usable'])

# Row filter: discard records whose square footage is 0 or 1, whose build year
# is 0, or whose price is below 10000.
valid_rows = (
    (df4.tot_sqft != 0) & (df4.tot_sqft != 1)
    & (df4.land_sqft != 0) & (df4.land_sqft != 1)
    & (df4.price >= 10000)
    & (df4.yr_built != 0)
)

# Chain the filter and the null-drop so a fresh frame is produced; calling
# dropna(inplace=True) on a filtered slice can trigger SettingWithCopyWarning.
df4 = df4[valid_rows].dropna(subset=["bldg_cls_s", "tax_cls_s"])

# Report the size of the frame built in this cell (df4, not df2).
print("Length of dataset is {}".format(len(df4)))

# One-hot encode the categorical columns in a single concat.
# tax_cls_s is neither encoded nor dropped, so it stays in the frame as-is.
df4 = pd.concat(
    [
        df4,
        pd.get_dummies(df4.bldg_cls_s, prefix="bdgp_cls_s"),
        pd.get_dummies(df4.borough, prefix="b"),
        pd.get_dummies(df4.bldg_ctgy, prefix="bldg_ctgy"),
    ],
    axis=1,
)

# Remove the encoded source columns along with the other columns listed here.
df4 = df4.drop(columns=['borough', 'bldg_ctgy', 'tax_cls_p','bldg_cls_s', 'sale_date', 'long', 'lat'])

In [None]:
# Number of columns in the cleaned merged frame.
df4.shape[1]

In [None]:
# Display the column labels of the cleaned merged frame.
df4.columns

In [None]:
# Separate the sale-price target (as a flat array) from the predictor columns.
Y = np.array(df4["price"]).ravel()
X = df4.drop(columns="price")
print(X.shape, Y.shape)

In [None]:
# Column labels "0".."31", which the correlation filter below always keeps.
vis_feats = list(map(str, range(32)))

In [None]:
# Keep every column listed in vis_feats unconditionally; keep any other
# feature only if its absolute Pearson correlation with price reaches the
# threshold.
CORR_THRESHOLD = 0.2

for column in X.columns:
    corry = np.corrcoef(Y, X[column])[0][1]
    # Test for "strong enough" rather than "too weak": a NaN correlation
    # (e.g. from a constant column) fails every comparison, so it now falls
    # into the drop branch instead of being kept by accident.
    if column in vis_feats or abs(corry) >= CORR_THRESHOLD:
        print("keep column ", column, corry)
    else:
        X = X.drop(columns=[column])

In [None]:
# Number of features that survived the correlation filter.
X.shape[1]

In [None]:
# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=.30, random_state=10
)

# Learn the scaling from the training rows, then apply it to both splits.
feature_scaler = StandardScaler().fit(X_train)
X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)

In [None]:
# Tune Ridge's alpha on the merged feature set (5-fold CV, selecting on R^2).
ridge = Ridge()
ridge_regressor = GridSearchCV(estimator=ridge, param_grid=parameters, cv=5, scoring='r2')
ridge_regressor.fit(X_train, y_train)

# Best alpha and its mean cross-validated score.
print(ridge_regressor.best_params_, ridge_regressor.best_score_)

In [None]:
# Score of the tuned ridge model on the held-out split.
print('Test:', ridge_regressor.score(X_test, y_test))

# Baseline: always predict the mean price seen in training.
mae_baseline = np.abs(y_test - y_train.mean()).mean()
mae_model = mean_absolute_error(y_test, ridge_regressor.predict(X_test))

# Relative change in MAE versus the baseline; negative means the model's
# error is lower than the baseline's.
relative_change = (mae_model - mae_baseline) / mae_baseline * 100

print('MAE:', mae_model)
print('MAE (baseline):', mae_baseline)
print('Improvement: {:.2f} %'.format(relative_change))