# Project Demo

This demo is designed to provide some general tips and tricks for the ITDS Fall 2019 project. For full details on the project, please refer to [the project requirements](https://grantmlong.com/teaching/fall2019/project/Project-ITDS-Fall-2019.pdf).

***
This demo uses `scikit-learn`, but by no means are you required to use this or any other particular package.



In [0]:
import pandas as pd
import numpy as np
import re
from google.colab import files
import matplotlib.pyplot as plt 
from sklearn.linear_model import LinearRegression
import seaborn as sns
import io
%matplotlib inline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor


In [0]:
train_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_train.csv', index_col=0)
test_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_test1.csv', index_col=0)
submit1_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_test2.csv', index_col=0)
submit2_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_test3.csv', index_col=0)
schools_df = pd.read_csv('https://raw.githubusercontent.com/12NaN/data-science-project/master/Data/school_data.csv',index_col=0)


train_df = train_df.reset_index().merge(schools_df, on='addr_zip', how='left').set_index(train_df.index.names)
test_df = test_df.reset_index().merge(schools_df, on='addr_zip', how='left').set_index(test_df.index.names)

submit1_df = submit1_df.reset_index().merge(schools_df, on='addr_zip', how='left').set_index(submit1_df.index.names)
submit2_df = submit2_df.reset_index().merge(schools_df, on='addr_zip', how='left').set_index(submit2_df.index.names)


train_df = train_df.drop(train_df[(train_df['size_sqft']>8000) & (train_df['rent']<10000)].index)
train_df = train_df.drop(train_df[(train_df['bathrooms']>=12) & (train_df['rent']<10000)].index)

train_df = train_df.drop(train_df[(train_df['bathrooms']==6) & (train_df['rent']<20000)].index)

train_df = train_df.drop(train_df[(train_df['bedrooms']>=8) & (train_df['rent']<20000)].index)


cf = submit1_df #retaining the dataframe in cf instead incase I want to look at the original without edits
cf.min_to_subway = cf.min_to_subway.fillna(cf.min_to_subway.mean(), axis=0) #Replacing empty values with the mean of the entire column
cf.year_built = cf.year_built.fillna(cf.year_built.median(), axis=0) #Replacing empty values with the median of the entire column
cf.loc[cf['size_sqft']==0,'size_sqft'] = np.nan #Changing data with 0 as sqrt feet to nans
cf.size_sqft = cf.size_sqft.fillna(cf.size_sqft.mean(), axis=0) #Nan values of sqrt feet to mean of column


cf2 = submit2_df #retaining the dataframe in cf instead incase I want to look at the original without edits
cf2.min_to_subway = cf2.min_to_subway.fillna(cf2.min_to_subway.mean(), axis=0) #Replacing empty values with the mean of the entire column
cf2.year_built = cf2.year_built.fillna(cf2.year_built.median(), axis=0) #Replacing empty values with the median of the entire column
cf2.loc[cf2['size_sqft']==0,'size_sqft'] = np.nan #Changing data with 0 as sqrt feet to nans
cf2.size_sqft = cf2.size_sqft.fillna(cf2.size_sqft.mean(), axis=0) #Nan values of sqrt feet to mean of column


In [21]:
print(len(test_df))
print(len(train_df))
print(len(submit1_df))
print(len(submit2_df))

2000
11990
2000
2000


#### Build training data

In [0]:
# extract usable features
feature_cols = [
    'bedrooms', 'year_built', 'bathrooms', 'min_to_subway', 
    'size_sqft', 'no_fee', 'has_doorman', 'Occurrences'
]
#Year build and minute values are the only features with missing values
train_features = train_df[feature_cols] 

# impute missing values with medians
train_features = train_features.fillna(train_features.median(), axis=0)

# construct target vector
train_target = train_df['rent']

#### Fit model

In [23]:
lreg = LinearRegression()
lreg.fit(train_features, train_target)

rf = RandomForestRegressor()
rf.fit(train_features, train_target)

gbr = GradientBoostingRegressor()
gbr.fit(train_features,train_target)



GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

#### Predict and Measure Using Test 1

In [24]:
test_features = test_df[feature_cols] 

# impute missing values with medians
test_features = test_features.fillna(train_features.median(), axis=0)

# construct predictions 
test_df['predicted'] = gbr.predict(test_features)

mean_squared_error(test_df['rent'], test_df['predicted'])

2412238.3627237626

#### Combine Data, Predict Values for Test 2

In [25]:
master_df = train_df.append(test_df, sort=False)

master_df.loc[master_df['size_sqft']==0,'size_sqft'] = np.nan
master_df.size_sqft = master_df.size_sqft.fillna(master_df.size_sqft.mean(), axis=0)

master_features = master_df[feature_cols].fillna(master_df[feature_cols].median(), axis=0)
master_target = master_df['rent']

gbr.fit(master_features, master_target)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [8]:
print(cf['rent'].value_counts) #Confirming missing rent values have proper index due to previous issue

<bound method IndexOpsMixin.value_counts of rental_id
7428577   NaN
7454944   NaN
7473595   NaN
7490488   NaN
7460896   NaN
           ..
7451410   NaN
7493779   NaN
7474525   NaN
7507738   NaN
7443274   NaN
Name: rent, Length: 2000, dtype: float64>


#### Create Submission File for `test2`

In [26]:
submit1_features = cf[feature_cols].fillna(master_df[feature_cols].median(), axis=0)
cf['predictions'] = gbr.predict(submit1_features)
cf['predictions'].to_csv('sample_submission1.csv', header=True)

cf['fake_rent'] = np.ones(cf['predictions'].shape) * master_target.median()
mean_squared_error(cf['predictions'], test_df['rent'])

13845229.534566218

In [27]:
submit1_df[feature_cols+['predictions']] #Checking predictions

Unnamed: 0_level_0,bedrooms,year_built,bathrooms,min_to_subway,size_sqft,no_fee,has_doorman,Occurrences,predictions
rental_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7428577,0,1931.0,1.0,0.6000,450.000000,0,0,11.0,2333.203714
7454944,2,1960.0,1.0,27.1500,950.000000,0,0,,2266.050224
7473595,1,1934.0,1.0,2.8000,914.956665,1,0,,2684.503140
7490488,2,2017.0,1.0,2.2833,500.000000,1,0,5.0,2570.065127
7460896,1,2010.0,1.0,10.4833,610.000000,1,0,1.0,2344.601708
...,...,...,...,...,...,...,...,...,...
7451410,0,1920.0,1.0,4.0333,600.000000,0,0,,2097.788498
7493779,1,1900.0,1.0,1.6000,914.956665,0,0,11.0,3473.997823
7474525,1,1920.0,1.0,4.0333,914.956665,0,0,4.0,2581.326733
7507738,1,2005.0,1.0,1.0500,900.000000,1,0,4.0,2939.343264


In [0]:
files.download("sample_submission1.csv") #Downloads file from colab

#### Create Submission File for `test3`

In [28]:
submit2_features = cf2[feature_cols].fillna(master_df[feature_cols].median(), axis=0)
cf2['predictions'] = gbr.predict(submit2_features)
cf2['predictions'].to_csv('sample_submission2.csv', header=True)

cf2['fake_rent'] = np.ones(cf2['predictions'].shape) * master_target.median()
mean_squared_error(cf2['predictions'], test_df['rent'])

12588133.49452935

submit2_features = submit2_df[feature_cols].fillna(master_df[feature_cols].median(), axis=0)
submit2_df['predictions'] = lreg.predict(submit2_features)
submit2_df['predictions'].to_csv('sample_submission2.csv', header=True)

submit2_df['fake_rent'] = np.ones(submit2_df['predictions'].shape) * master_target.median()
mean_squared_error(submit2_df['predictions'], submit2_df['rent'])

In [32]:
submit2_df[feature_cols+['predictions']] #checking predictions of submit2

Unnamed: 0_level_0,bedrooms,year_built,bathrooms,min_to_subway,size_sqft,no_fee,has_doorman,Occurrences,predictions
rental_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7455565,4,1931.0,2.0,1.3167,1200.0,1,0,5.0,3696.142802
7473295,1,1899.0,1.0,1.3167,800.0,1,0,2.0,2697.658193
7459804,0,2002.0,1.0,7.3500,525.0,1,1,11.0,2926.544422
7456330,2,1920.0,1.0,1.2333,750.0,0,0,12.0,2802.539904
7443595,0,1931.0,1.0,0.5833,487.0,1,1,3.0,2928.795385
...,...,...,...,...,...,...,...,...,...
7455124,2,1945.0,1.0,8.5667,1000.0,0,0,,2325.146788
7443016,1,2010.0,1.0,10.6667,675.0,1,0,,2498.096458
7480876,2,1905.0,1.0,2.4833,560.0,1,0,8.0,3053.797014
7443379,3,2009.0,1.0,3.0833,700.0,1,0,5.0,2614.985207
