# Project Demo

This demo is designed to provide some general tips and tricks for the ITDS Fall 2019 project. For full details on the project, please refer to [the project requirements](https://grantmlong.com/teaching/fall2019/project/Project-ITDS-Fall-2019.pdf).

***
This demo uses `scikit-learn`, but by no means are you required to use this or any other particular package.



In [119]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor


%matplotlib inline

In [120]:
# Load the four project CSVs in one pass; each uses its first column as the row index.
# Order matters: training set, scored test set (test1), then the two submission sets.
train_df, test_df, submit1_df, submit2_df = (
    pd.read_csv(csv_url, index_col=0)
    for csv_url in (
        'https://grantmlong.com/data/SE_rents2018_train.csv',
        'https://grantmlong.com/data/SE_rents2018_test1.csv',
        'https://grantmlong.com/data/SE_rents2018_test2.csv',
        'https://grantmlong.com/data/SE_rents2018_test3.csv',
    )
)

In [121]:
# Show ten randomly drawn listings, flipped so each column is one listing.
train_df.sample(n=10).T

rental_id,7325284,7341163,7261414,7408405,7208263,7214455,7415689,7141258,7324879,7230550
addr_unit,#4A,#5B,#9C,#1025,#9M,#7,#GRDN,#3F,#B5,#4B
building_id,359569,18490,662614,1225675,59809,196804,203530,231190,1297285,99757
bedrooms,2,2,1,2,3,2,2,1,1,0
bathrooms,1,2,1,2,3,1,1,1,1,1
size_sqft,1299,900,650,1250,1772,800,1750,700,700,500
created_at,2018-07-02 13:12:25,2018-07-06 19:09:12,2018-06-11 14:48:40,2018-07-25 17:10:29,2018-05-25 11:04:56,2018-05-29 11:06:28,2018-07-27 12:50:51,2018-05-03 09:12:32,2018-07-02 12:03:13,2018-06-01 19:53:40
addr_street,305 MC GUINNESS BLVD,229 EAST 13 STREET,8709 5 AVENUE,2600 NETHERLAND AVENUE,101 WEST END AVENUE,276 PROSPECT PARK WEST,140 PARK PLACE,257 FLATBUSH AVENUE,23-15 ASTORIA BOULEVARD,2194 3 AVENUE
addr_city,Brooklyn,New York,Brooklyn,Bronx,New York,Brooklyn,Brooklyn,Brooklyn,Astoria,New York
addr_zip,11222,10003,11209,10463,10069,11215,11217,11217,11102,10035
addr_lat,40.7329,40.7324,40.6205,40.8787,40.7751,40.6583,40.6777,40.6802,40.7719,40.8004


#### Build training data

In [163]:
# Columns used as model inputs.
# A wider candidate list, left disabled for experimentation:
#   'bedrooms', 'year_built', 'bathrooms', 'min_to_subway',
#   'size_sqft', 'no_fee', 'has_doorman', 'floornumber', 'has_pool',
#   'has_childrens_playroom', 'allows_pets'
feature_cols = ['bedrooms', 'year_built', 'bathrooms', 'min_to_subway']

# Pull the feature columns, then replace missing entries with each column's median.
train_features = train_df.loc[:, feature_cols]
train_features = train_features.fillna(train_features.median(), axis=0)

# The regression target is the listed rent.
train_target = train_df['rent']

#### Fit model

In [171]:
# Fit a linear regression on the imputed training features.
# `fit` returns the estimator itself, so construction and fitting can be chained.
lreg = LinearRegression().fit(train_features, train_target)

# Disabled alternative model, kept for experimentation:
# rf = RandomForestRegressor(n_estimators = 1000, random_state= 42)
# rf.fit(train_features, train_target)

# Display the fitted estimator as the cell output.
lreg




LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

#### Predict and Measure Using Test 1

In [172]:

# Select the model inputs from test1 and fill gaps with the training-set medians,
# so the held-out data is imputed with the same values the model was trained on.
test_features = test_df[feature_cols].fillna(train_features.median(), axis=0)

# Store predictions alongside the true rents.
test_df['predicted'] = lreg.predict(test_features)

# Mean squared error on test1 (displayed as the cell output).
mean_squared_error(test_df['rent'], test_df['predicted'])

5144072.597655631

#### Combine Data, Predict Values for Test 2

In [173]:

# Stack the training rows and the test1 rows into one labelled data set.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the resulting frame is the same.
master_df = pd.concat([train_df, test_df], sort=False)

# Impute missing feature values with the medians of the combined data.
master_features = master_df[feature_cols].fillna(master_df[feature_cols].median(), axis=0)
master_target = master_df['rent']

# Refit the same estimator on all labelled rows; the fitted model is the cell output.
lreg.fit(master_features, master_target)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

#### Create Submission File for `test2`

In [174]:
# Impute the test2 features with the medians of the combined labelled data.
fill_values = master_df[feature_cols].median()
submit1_features = submit1_df[feature_cols].fillna(fill_values, axis=0)

# Predict rents and write them (with a header row) to the submission file.
submit1_df['predictions'] = lreg.predict(submit1_features)
submit1_df['predictions'].to_csv('sample_submission1.csv', header=True)

# Sanity check: MSE of the predictions against a constant column set to the
# median rent of the combined data (a stand-in target, hence "fake_rent").
submit1_df['fake_rent'] = np.full(submit1_df['predictions'].shape, master_target.median())
mean_squared_error(submit1_df['predictions'], submit1_df['fake_rent'])


3098524.2259505177

#### Create Submission File for `test3`

In [144]:

# Impute the test3 features with the medians of the combined labelled data.
fill_values = master_df[feature_cols].median()
submit2_features = submit2_df[feature_cols].fillna(fill_values, axis=0)

# Predict rents and write them (with a header row) to the submission file.
submit2_df['predictions'] = lreg.predict(submit2_features)
submit2_df['predictions'].to_csv('sample_submission2.csv', header=True)

# Sanity check: MSE of the predictions against a constant column set to the
# median rent of the combined data (a stand-in target, hence "fake_rent").
submit2_df['fake_rent'] = np.full(submit2_df['predictions'].shape, master_target.median())
mean_squared_error(submit2_df['predictions'], submit2_df['fake_rent'])


4069309.7717325995