# Project Demo

This demo is designed to provide some general tips and tricks for the ITDS Fall 2019 project. For full details on the project, please refer to [the project requirements](https://grantmlong.com/teaching/fall2019/project/Project-ITDS-Fall-2019.pdf).

***
This demo uses `scikit-learn`, but by no means are you required to use this or any other particular package.



In [0]:
!git add .

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor

%matplotlib inline

In [0]:
train_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_train.csv', index_col=0)
test_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_test1.csv', index_col=0)
submit1_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_test2.csv', index_col=0)
submit2_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_test3.csv', index_col=0)

cf = submit1_df 

cf.min_to_subway = cf.min_to_subway.fillna(cf.min_to_subway.mean(), axis=0)
cf.year_built = cf.year_built.fillna(cf.year_built.median(), axis=0)



In [0]:
train_df.sample(5).transpose()

rental_id,7229284,7215520,7311652,7388557,7322155
addr_unit,#5AA,#A,#214,#2,#2F
building_id,386185,91474,133315,700879,123712
bedrooms,1,1,0,1,2
bathrooms,1,1,1,1,1
size_sqft,700,900,900,550,900
created_at,2018-06-01 16:03:21,2018-05-29 13:25:07,2018-06-27 12:11:25,2018-07-20 17:35:03,2018-06-30 14:14:50
addr_street,24 COOK STREET,1721 LEXINGTON AVENUE,50 BRIDGE STREET,8629 16 AVENUE,676 RIVERSIDE DRIVE
addr_city,Brooklyn,New York,Brooklyn,Brooklyn,New York
addr_zip,11206,10029,11201,11214,10031
addr_lat,40.7021,40.7931,40.7033,40.6093,40.8267


#### Build training data

In [0]:
# extract usable features
feature_cols = [
    'bedrooms', 'year_built', 'bathrooms', 'min_to_subway', 
    'size_sqft', 'no_fee', 'has_doorman'
]
train_features = train_df[feature_cols] 

# impute missing values with medians
train_features = train_features.fillna(train_features.median(), axis=0)

# construct target vector
train_target = train_df['rent']

#### Fit model

In [50]:
lreg = LinearRegression()
lreg.fit(train_features, train_target)

rf = RandomForestRegressor()
rf.fit(train_features, train_target)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

#### Predict and Measure Using Test 1

In [51]:
test_features = test_df[feature_cols] 

# impute missing values with medians
test_features = test_features.fillna(train_features.median(), axis=0)

# construct predictions 
test_df['predicted'] = rf.predict(test_features)

mean_squared_error(test_df['rent'], test_df['predicted'])

2650731.7484114342

#### Combine Data, Predict Values for Test 2

In [52]:
master_df = train_df.append(test_df, sort=False)

master_features = master_df[feature_cols].fillna(master_df[feature_cols].median(), axis=0)
master_target = master_df['rent']

rf.fit(master_features, master_target)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

#### Create Submission File for `test2`

In [54]:
submit1_features = cf[feature_cols].fillna(master_df[feature_cols].median(), axis=0)
cf['predictions'] = rf.predict(submit1_features)
cf['predictions'].to_csv('sample_submission2.csv', header=True)

cf['fake_rent'] = np.ones(cf['predictions'].shape) * master_target.median()
mean_squared_error(cf['predictions'], cf['fake_rent'])

6612413.287062759

#### Create Submission File for `test3`

In [35]:
submit2_features = submit2_df[feature_cols].fillna(master_df[feature_cols].median(), axis=0)
submit2_df['predictions'] = lreg.predict(submit2_features)
submit2_df['predictions'].to_csv('sample_submission2.csv', header=True)

submit2_df['fake_rent'] = np.ones(submit2_df['predictions'].shape) * master_target.median()
mean_squared_error(submit2_df['predictions'], submit2_df['fake_rent'])

4017817.158625244