In [None]:
import os
import sys
import pickle
import warnings
import numpy as np 
import pandas as pd 
import seaborn as sns 
from scipy.stats import uniform
import matplotlib.pyplot as plt 
from sklearn.utils import shuffle
from sklearn.externals import joblib 
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor


%load_ext autoreload
%autoreload 2
%matplotlib inline 
warnings.filterwarnings('ignore')


# Setting up directories
ROOT_DIR = os.path.join(os.getcwd(), '..')
SRC_DIR = os.path.join(os.getcwd(), '..', 'src')
DATA_DIR = os.path.join(ROOT_DIR, "data")

sys.path.append(ROOT_DIR)
sys.path.append(SRC_DIR)
os.chdir(ROOT_DIR)

In [3]:
# Custom Modules
from d00_utils import read_write
from d04_modeling import models
from d04_modeling import features_transformations

PREP_DATA_DIR = os.path.join(DATA_DIR,"03_preprocessed")
MODELs_DIR = os.path.join(DATA_DIR,"04_models")
LISTINGS_DATA_PATH = os.path.join(PREP_DATA_DIR, 'listings_preprocessed_2.csv')

### Get Prepared Data

In [4]:
listings = pd.read_csv(LISTINGS_DATA_PATH)
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6885 entries, 0 to 6884
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   url           6885 non-null   object 
 1   address       6885 non-null   object 
 2   neighborhood  6885 non-null   object 
 3   rent          6885 non-null   int64  
 4   beds          6885 non-null   int64  
 5   baths         6885 non-null   float64
 6   flexs         6885 non-null   float64
 7   zip           6885 non-null   int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 430.4+ KB


### Train Test Split

In [5]:
# Sinals and labels 
listings_signals = listings[['beds', 'baths', 'flexs', 'zip']]
listings_labels = listings['rent']

# Shuffle
listings_signals, listings_labels = shuffle(listings_signals, listings_labels)

# Split
listings_train_signals, listings_test_signals, listings_train_labels, listings_test_labels = train_test_split(listings_signals, listings_labels, test_size=0.33, random_state=42)

### Transforming all Features

In [85]:
listings_train_signals_prep, listings_test_signals_prep = features_transformations.\
fit_transform(listings_signals,listings_train_signals,listings_test_signals,'beds_bath_flex_ordi')

Training set Shape:(4612, 133)
Testing set Shape:(2273, 133)


### Base Model 
#### Lasso Regression - with alpha = 0.1

In [7]:
lasso_reg = models.build_lasso(0.1, listings_train_signals_prep, listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'lasso_reg1.pkl')
saved_model = joblib.dump(lasso_reg,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2344.1458116373688


Model perfroms is extremly low

#### Lasso Regression - RandomSearchCV

In [8]:
models.lasso_random_search(n_iter_=100,cv_=3, X_train=listings_train_signals_prep, y_train=listings_train_labels)

{'alpha': 0.16627929785782958}

#### Building Lasso Regression with Best Estimators

In [9]:
lasso_reg = models.build_lasso(0.166, listings_train_signals_prep, listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'lasso_reg2.pkl')
saved_model = joblib.dump(lasso_reg,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2343.9955616415587


No imporovment

### Lasso model with only beds and zip

In [10]:
listings_train_signals_prep2, listings_test_signals_prep2 = features_transformations.\
fit_transform(listings_signals,listings_train_signals,listings_test_signals,'beds_bath_flex_numerical')

Training set Shape:(4612, 133)
Testing set Shape:(2273, 133)


In [11]:
lasso_reg3 = models.build_lasso(0.16, listings_train_signals_prep2, listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'lasso_reg3.pkl')
saved_model = joblib.dump(lasso_reg3,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2281.286595256233


# Plynomial Features 

##### Drgree2= 2

In [12]:
# Getting polynomical features
poly_features = PolynomialFeatures(degree=2, include_bias=False)
listings_train_poly = poly_features.fit_transform(listings_train_signals)

# RandomSearch on polynomial features
models.lasso_random_search(n_iter_=20,cv_=3, X_train=listings_train_poly, y_train=listings_train_labels)

{'alpha': 0.736695059880016}

In [13]:
lasso_reg4_poly2 = models.build_lasso(0.73, listings_train_poly, listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'lasso_reg4_poly2.pkl')
saved_model = joblib.dump(lasso_reg4_poly2,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2238.9460408102136


##### Drgree = 5

In [14]:
# Getting polynomical features
poly_features = PolynomialFeatures(degree=5, include_bias=False)
listings_train_poly = poly_features.fit_transform(listings_train_signals)

# RandomSearch on polynomial features
models.lasso_random_search(n_iter_=20,cv_=3, X_train=listings_train_poly, y_train=listings_train_labels)

{'alpha': 0.38007689568792413}

In [15]:
lasso_reg5_poly5 = models.build_lasso(0.38, listings_train_poly, listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'lasso_reg5_poly5.pkl')
saved_model = joblib.dump(lasso_reg5_poly5,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2769.3592937712447


##### Drgree = 3

In [16]:
# Getting polynomical features
poly_features = PolynomialFeatures(degree=3, include_bias=False)
listings_train_poly = poly_features.fit_transform(listings_train_signals)

# RandomSearch on polynomial features
models.lasso_random_search(n_iter_=20,cv_=3, X_train=listings_train_poly, y_train=listings_train_labels)

{'alpha': 0.004178809327484712}

In [17]:
lasso_reg6_poly3 = models.build_lasso(0.004, listings_train_poly, listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'lasso_reg6_poly3.pkl')
saved_model = joblib.dump(lasso_reg6_poly3,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2222.8896064592923


Does not make much of the difference

### Decsion Tree

Numerical Features

In [18]:
dtree1= models.build_decision_tree(max_depth_=40,max_features_=4,X_train=listings_train_signals_prep2,y_train=listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'dtree1.pkl')
saved_model = joblib.dump(dtree1,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2629.322162108982


In [19]:
dtree2= models.build_decision_tree(max_depth_=50,max_features_=15,X_train=listings_train_signals_prep2,y_train=listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'dtree2.pkl')
saved_model = joblib.dump(dtree2,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2273.6615477780215


Ordinal Features

In [20]:
dtree3= models.build_decision_tree(max_depth_=40,max_features_=4,X_train=listings_train_signals_prep,y_train=listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'dtree3.pkl')
saved_model = joblib.dump(dtree3,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2510.071241067437


In [21]:
dtree4= models.build_decision_tree(max_depth_=100,max_features_=11,X_train=listings_train_signals_prep,y_train=listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'dtree4.pkl')
saved_model = joblib.dump(dtree4,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2347.893429538848


No imporvemnts

### SVM 

In [36]:
svm_reg1 = models.build_svm(1.5,1,listings_train_signals_prep, listings_train_labels)

Cross Valdiation RMSE Evaluation Mean Score: 3037.67458939483


In [37]:
# RandomSearch on polynomial features
models.svm_random_search(n_iter_=20,cv_=3, X_train=listings_train_poly, y_train=listings_train_labels)

{'C': 31.114025723769867, 'epsilon': 1.4900168650682453}

In [38]:
svm_reg2 = models.build_svm(1.5,31,listings_train_signals_prep, listings_train_labels)

Cross Valdiation RMSE Evaluation Mean Score: 2609.3599152667384


### Ensemble Model

In [49]:
voting_reg1 = models.build_ensemble(listings_train_signals_prep, listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'voting_reg1.pkl')
saved_model = joblib.dump(voting_reg1,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2189.0082415043003


In [50]:
# Only Beds and zips
voting_reg2 = models.build_ensemble(listings_train_signals_prep, listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'voting_reg2.pkl')
saved_model = joblib.dump(voting_reg2,model_path)

Cross Valdiation RMSE Evaluation Mean Score: 2242.7222178624193


### Adaptive Boosting

In [51]:
# Adpative Boosting on Voting Regressor 
ada_reg1_votng = AdaBoostRegressor(voting_reg,n_estimators=200, learning_rate=0.5)

ada_reg1_votng.fit(listings_train_signals_prep, listings_train_labels)

models.calculate_rmse(listings_train_signals_prep, listings_train_labels,ada_reg1_votng)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'ada_reg1_votng.pkl')
saved_model = joblib.dump(ada_reg1_votng,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2482.181469644943


In [53]:
# Adpative Boosting on lasso 
ada_reg2_lasso = AdaBoostRegressor(lasso_reg,n_estimators=200, learning_rate=0.5)

ada_reg2_lasso.fit(listings_train_signals_prep, listings_train_labels)

models.calculate_rmse(listings_train_signals_prep, listings_train_labels,ada_reg2_lasso)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'ada_reg2_lasso.pkl')
saved_model = joblib.dump(ada_reg2_lasso,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 3658.9700982991903


### Dimentionality Reduction - Principal Component Analysis

To Check if we can reduce dimension. We have 133 dimension after converting categorical features to encoders

In [54]:
# PCA with 99% variance captured
pca  = PCA(n_components=0.99)
listings_train_reduced = pca.fit_transform(listings_train_signals_prep)
listings_train_reduced.shape

(4612, 73)

In [57]:
# Fitting votting Regression 
voting_reg1_reduced =models.build_ensemble(listings_train_reduced, listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'voting_reg1_reduced.pkl')
saved_model = joblib.dump(voting_reg1_reduced,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2236.5311726809214
Cross Valdiation RMSE Evaluation Mean Score: 2266.0899817060317


In [59]:
# Fitting on adaptive boost model with votting regression
ada_reg1_votng_reduced =models.build_ensemble(listings_train_reduced, listings_train_labels)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'ada_reg1_votng_reduced.pkl')
saved_model = joblib.dump(ada_reg1_votng_reduced,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2230.177631941019


#### Building Model without zip

In [60]:
listings_train_signals_prep3 = listings_train_signals[['beds', 'baths', 'flexs']]

In [63]:
## Fitting votting Regression ##
# Lasso Regression
lasso_reg = Lasso(alpha=0.35) # alpha from RandomSearch
#  Random Foresst
rnd_reg = RandomForestRegressor(n_estimators=100)
# Decision Tree
# SVM
svm_reg = LinearSVR(1.5,31)
tree_reg = DecisionTreeRegressor(max_depth=40,max_features=2)
# Voting Regressor 
voting_reg2 = VotingRegressor(
    estimators=[('lr', lasso_reg), ('rf', rnd_reg), ('tree', tree_reg),('svm', svm_reg)])

voting_reg2.fit(listings_train_signals_prep3, listings_train_labels)
models.calculate_rmse(listings_train_signals_prep3, listings_train_labels,voting_reg2)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'voting_reg2.pkl')
saved_model = joblib.dump(voting_reg2,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2444.948078470481


In [64]:
# Adpative Boosting on Voting Regressor 
ada_reg3 = AdaBoostRegressor(voting_reg2,n_estimators=200, learning_rate=0.5)

ada_reg3.fit(listings_train_signals_prep3, listings_train_labels)

models.calculate_rmse(listings_train_signals_prep3, listings_train_labels,ada_reg3)

# Save the trained model as a pickle string. 
model_path = os.path.join(MODELs_DIR , 'ada_reg3.pkl')
saved_model = joblib.dump(ada_reg3,model_path) 

Cross Valdiation RMSE Evaluation Mean Score: 2527.7066590973745


### Testing




Final Model: Ensemble Model of Lasso, DecisionTree, and Random forsest perfroms better. 

In [90]:
model_path = os.path.join(MODELs_DIR , 'voting_reg1.pkl')
final_model = joblib.load(model_path) 

In [95]:
models.calculate_rmse( listings_test_signals_prep, listings_test_labels,final_model)

Cross Valdiation RMSE Evaluation Mean Score: 2288.2917932032888
