In [1]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import uniform
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor


%load_ext autoreload
%autoreload 2
%matplotlib inline 

# Setting up directories
# Resolve the project root only once: this cell ends with os.chdir(ROOT_DIR), so
# recomputing ROOT_DIR from os.getcwd() on a re-run would climb one more level up.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
SRC_DIR = os.path.join(ROOT_DIR, 'src')
DATA_DIR = os.path.join(ROOT_DIR, "data")

# Make the project packages importable without stacking duplicate entries on re-run
for _path in (ROOT_DIR, SRC_DIR):
    if _path not in sys.path:
        sys.path.append(_path)
os.chdir(ROOT_DIR)

In [2]:
# Custom Modules
from d00_utils import read_write
from d04_modeling import models
from d04_modeling import features_transformations

# Preprocessed listings live under <DATA_DIR>/03_preprocessed
PREP_DATA_DIR = os.path.join(DATA_DIR, "03_preprocessed")
LISTINGS_DATA_PATH = os.path.join(
    DATA_DIR, "03_preprocessed", 'listings_preprocessed_2.csv'
)

### Get Prepared Data

In [3]:
# Load the preprocessed listings and print the column / dtype / non-null summary
listings = pd.read_csv(filepath_or_buffer=LISTINGS_DATA_PATH)
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6885 entries, 0 to 6884
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   url           6885 non-null   object 
 1   address       6885 non-null   object 
 2   neighborhood  6885 non-null   object 
 3   rent          6885 non-null   int64  
 4   beds          6885 non-null   int64  
 5   baths         6885 non-null   float64
 6   flexs         6885 non-null   float64
 7   zip           6885 non-null   int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 430.4+ KB


### Train Test Split

In [4]:
# Separate the model inputs ("signals") from the regression target ("labels")
listings_labels = listings['rent']
listings_signals = listings[['beds', 'baths', 'flexs', 'zip']]

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible
(
    listings_train_signals,
    listings_test_signals,
    listings_train_labels,
    listings_test_labels,
) = train_test_split(
    listings_signals,
    listings_labels,
    test_size=0.33,
    random_state=42,
)

### Transforming all Features

In [5]:
# Transform train and test signals with the project's 'beds_bath_flex_ordi' scheme.
# NOTE(review): the full `listings_signals` frame is passed as the first argument --
# presumably so the encoders see every category; confirm nothing is fitted on test rows.
listings_train_signals_prep, listings_test_signals_prep = features_transformations.fit_transform(
    listings_signals,
    listings_train_signals,
    listings_test_signals,
    'beds_bath_flex_ordi',
)

Training set Shape:(4612, 133)
Testing set Shape:(2273, 133)


### Base Model 
#### Lasso Regression - with alpha = 0.1

In [7]:
# Baseline Lasso on the ordinal-encoded training features; the leading 0.1 is the
# alpha named in the heading above. The helper prints the RMSE scores shown below.
lasso_reg = models.build_lasso(0.1, listings_train_signals_prep, listings_train_labels)

Simple RMSE Evaluation Score: 2559.2717003863045
Cross Valdiation RMSE Evaluation Mean Score: 2613.9517066103417


Model performance is extremely poor.

#### Lasso Regression - RandomSearchCV

In [8]:
# Randomized hyper-parameter search for Lasso on the ordinal-encoded training set
# (n_iter_=100 and cv_=3 are forwarded to the project helper).
models.lasso_random_search(
    X_train=listings_train_signals_prep,
    y_train=listings_train_labels,
    n_iter_=100,
    cv_=3,
)

{'alpha': 0.3534697256351348}

#### Building Lasso Regression with Best Estimators

In [10]:
# Rebuild the Lasso with alpha=0.35, i.e. the search result recorded above (~0.3535) rounded
lasso_reg = models.build_lasso(0.35, listings_train_signals_prep, listings_train_labels)

Simple RMSE Evaluation Score: 2565.354446826757
Cross Valdiation RMSE Evaluation Mean Score: 2613.1781801149596


No improvement.

### Lasso model with numerical beds, baths and flexs features

In [11]:
# Same split, transformed with the 'beds_bath_flex_numerical' scheme instead of the ordinal one
listings_train_signals_prep2, listings_test_signals_prep2 = features_transformations.fit_transform(
    listings_signals,
    listings_train_signals,
    listings_test_signals,
    'beds_bath_flex_numerical',
)

Training set Shape:(4612, 133)
Testing set Shape:(2273, 133)


In [13]:
# Lasso with the tuned alpha (0.35) on the numerically-transformed training features
lasso_reg2 = models.build_lasso(0.35, listings_train_signals_prep2, listings_train_labels)

Simple RMSE Evaluation Score: 2490.274338596655
Cross Valdiation RMSE Evaluation Mean Score: 2547.0939559319695


# Polynomial Features

##### Degree = 2

In [14]:
# Expand the raw (un-encoded) training signals into polynomial terms up to degree 2,
# without the constant bias column
poly_features = PolynomialFeatures(include_bias=False, degree=2)
listings_train_poly = poly_features.fit_transform(listings_train_signals)

# Randomized search on the polynomial features.
# The filter below is global: warnings stay silenced for every cell executed afterwards.
warnings.filterwarnings(action='ignore')
models.lasso_random_search(
    X_train=listings_train_poly,
    y_train=listings_train_labels,
    n_iter_=20,
    cv_=3,
)

{'alpha': 0.579041708743506}

In [16]:
# Lasso on the degree-2 polynomial features.
# NOTE(review): the search output recorded above is alpha ~= 0.579, but 0.62 is used here --
# confirm which search run this value came from.
lasso_reg3_poly = models.build_lasso(0.62, listings_train_poly, listings_train_labels)

Simple RMSE Evaluation Score: 2467.1397088844024
Cross Valdiation RMSE Evaluation Mean Score: 2485.3241809339443


##### Degree = 5

In [17]:
# Rebuild the polynomial expansion at degree 5 (rebinds `poly_features` and
# `listings_train_poly` from the previous degree)
poly_features = PolynomialFeatures(include_bias=False, degree=5)
listings_train_poly = poly_features.fit_transform(listings_train_signals)

# Randomized search on the polynomial features
models.lasso_random_search(
    X_train=listings_train_poly,
    y_train=listings_train_labels,
    n_iter_=20,
    cv_=3,
)

{'alpha': 0.9615366218413242}

In [19]:
# Lasso on the degree-5 polynomial features.
# NOTE(review): the search output recorded above is alpha ~= 0.9615; 0.9 is used here.
lasso_reg4_poly = models.build_lasso(0.9, listings_train_poly, listings_train_labels)

Simple RMSE Evaluation Score: 2343.547016395309
Cross Valdiation RMSE Evaluation Mean Score: 2517.65587596868


##### Degree = 10

In [20]:
# Rebuild the polynomial expansion at degree 10 (rebinds `poly_features` and
# `listings_train_poly` from the previous degree)
poly_features = PolynomialFeatures(include_bias=False, degree=10)
listings_train_poly = poly_features.fit_transform(listings_train_signals)

# Randomized search on the polynomial features
models.lasso_random_search(
    X_train=listings_train_poly,
    y_train=listings_train_labels,
    n_iter_=20,
    cv_=3,
)

{'alpha': 0.03016559396939189}

In [None]:
# Lasso on the degree-10 polynomial features.
# NOTE(review): the search output recorded above is alpha ~= 0.0302, but 0.008 is used here --
# confirm which search run this value came from.
lasso_reg5_poly = models.build_lasso(0.008, listings_train_poly, listings_train_labels)

Does not make much of a difference.

### Decision Tree

Numerical Features

In [None]:
# Decision tree on the numerically-transformed features: depth 40, 4 features per split
dtree1 = models.build_decision_tree(
    X_train=listings_train_signals_prep2,
    y_train=listings_train_labels,
    max_depth_=40,
    max_features_=4,
)

In [None]:
# Same numerically-transformed features with a deeper tree (50) and 15 features per split;
# rebinds `dtree1` from the previous cell
dtree1 = models.build_decision_tree(
    X_train=listings_train_signals_prep2,
    y_train=listings_train_labels,
    max_depth_=50,
    max_features_=15,
)

Ordinal Features

In [None]:
# Decision tree on the ordinal-encoded features: depth 40, 4 features per split
dtree1 = models.build_decision_tree(
    X_train=listings_train_signals_prep,
    y_train=listings_train_labels,
    max_depth_=40,
    max_features_=4,
)

In [None]:
# Same ordinal-encoded features with depth 100 and 11 features per split;
# rebinds `dtree1` from the previous cell
dtree1 = models.build_decision_tree(
    X_train=listings_train_signals_prep,
    y_train=listings_train_labels,
    max_depth_=100,
    max_features_=11,
)

No improvements.

### Ensemble Model - Voting Regressor (Lasso, Random Forest, Decision Tree)

In [None]:
# Lasso Regression
lasso_reg = Lasso(alpha=0.35)  # alpha from RandomSearch
# Random Forest -- seeded so repeated runs build the same forest
rnd_reg = RandomForestRegressor(n_estimators=100, random_state=42)
# Decision Tree -- max_features=5 is below the 133 input columns, so the candidate
# features per split are drawn at random; seed that draw as well
tree_reg = DecisionTreeRegressor(max_depth=40, max_features=5, random_state=42)
# Voting Regressor over the three base estimators
voting_reg = VotingRegressor(
    estimators=[('lr', lasso_reg), ('rf', rnd_reg), ('tree', tree_reg)])

voting_reg.fit(listings_train_signals_prep, listings_train_labels)

# NOTE: scored on the same rows the ensemble was fitted on (training error)
models.calculate_rmse(listings_train_signals_prep, listings_train_labels, voting_reg)

### Adaptive Boosting

In [None]:
# Adaptive Boosting on the Voting Regressor.
# random_state pins the per-round resampling of the training set so the fitted
# booster (and its RMSE) is reproducible across runs.
ada_reg1 = AdaBoostRegressor(voting_reg, n_estimators=200, learning_rate=0.5, random_state=42)

ada_reg1.fit(listings_train_signals_prep, listings_train_labels)

# NOTE: scored on the same rows the booster was fitted on (training error)
models.calculate_rmse(listings_train_signals_prep, listings_train_labels, ada_reg1)

In [None]:
# Adaptive Boosting on Lasso.
# Bound to its own name so `ada_reg1` (the booster built on the voting regressor in the
# previous cell) is not silently replaced for the cells further down that reuse it.
# `lasso_reg2` is whatever models.build_lasso returned earlier -- presumably a Lasso
# estimator; AdaBoost fits its own copies of it.
ada_reg_lasso = AdaBoostRegressor(lasso_reg2, n_estimators=200, learning_rate=0.5, random_state=42)

ada_reg_lasso.fit(listings_train_signals_prep, listings_train_labels)

# NOTE: scored on the same rows the booster was fitted on (training error)
models.calculate_rmse(listings_train_signals_prep, listings_train_labels, ada_reg_lasso)

### Dimensionality Reduction - Principal Component Analysis

To check whether we can reduce the dimensionality: we have 133 dimensions after encoding the categorical features.

In [None]:
# Keep as many principal components as are needed to explain 99% of the variance
pca = PCA(n_components=0.99)
listings_train_reduced = pca.fit_transform(X=listings_train_signals_prep)

# Cell output: (rows, retained components)
listings_train_reduced.shape

In [None]:
# Fit the voting ensemble on the PCA-reduced features.
# An unfitted clone is used so `voting_reg` itself stays fitted on the full feature
# matrix instead of being overwritten with a model that expects the reduced width.
voting_reg_pca = clone(voting_reg)
voting_reg_pca.fit(listings_train_reduced, listings_train_labels)
models.calculate_rmse(listings_train_reduced, listings_train_labels, voting_reg_pca)

In [None]:
# Fit the current `ada_reg1` booster configuration on the PCA-reduced features.
# An unfitted clone is used so `ada_reg1` keeps the fit from its own cell and can still
# be applied to inputs with the original (un-reduced) feature width.
ada_reg_pca = clone(ada_reg1)
ada_reg_pca.fit(listings_train_reduced, listings_train_labels)
models.calculate_rmse(listings_train_reduced, listings_train_labels, ada_reg_pca)

#### Building Model without zip

In [None]:
# Drop `zip`: keep only the beds / baths / flexs columns of the raw training signals
listings_train_signals_prep3 = listings_train_signals.loc[:, ['beds', 'baths', 'flexs']]

In [None]:
## Fitting Voting Regression (features without zip) ##
# Lasso Regression
lasso_reg = Lasso(alpha=0.35)  # alpha from RandomSearch
# Random Forest -- seeded so repeated runs build the same forest
rnd_reg = RandomForestRegressor(n_estimators=100, random_state=42)
# Decision Tree -- max_features=2 of the 3 columns are drawn at random per split; seed it
tree_reg = DecisionTreeRegressor(max_depth=40, max_features=2, random_state=42)
# Voting Regressor over the three base estimators
voting_reg2 = VotingRegressor(
    estimators=[('lr', lasso_reg), ('rf', rnd_reg), ('tree', tree_reg)])

voting_reg2.fit(listings_train_signals_prep3, listings_train_labels)
# NOTE: scored on the same rows the ensemble was fitted on (training error)
models.calculate_rmse(listings_train_signals_prep3, listings_train_labels, voting_reg2)

In [None]:
# Adaptive Boosting on the zip-free Voting Regressor.
# random_state pins the per-round resampling of the training set so the fitted
# booster (and its RMSE) is reproducible across runs.
ada_reg2 = AdaBoostRegressor(voting_reg2, n_estimators=200, learning_rate=0.5, random_state=42)

ada_reg2.fit(listings_train_signals_prep3, listings_train_labels)

# NOTE: scored on the same rows the booster was fitted on (training error)
models.calculate_rmse(listings_train_signals_prep3, listings_train_labels, ada_reg2)

### Testing




In [None]:
# Evaluate `ada_reg2`, the last booster trained above, on the held-out rows.
# It was fitted on the beds/baths/flexs columns only, so the test signals are restricted
# to the same columns; to evaluate a different model, swap the model AND the feature
# matrix together so their column counts match.
pred_test = ada_reg2.predict(listings_test_signals[['beds', 'baths', 'flexs']])
mse = mean_squared_error(listings_test_labels, pred_test)
rmse = np.sqrt(mse)
print("Simple RMSE Evaluation Score: " + str(rmse))


In [None]:
# A regressor's .score() returns the coefficient of determination (R^2), not an accuracy:
# it is at most 1.0 and can be negative, so it is reported as-is rather than as a percentage.
# `ada_reg2` was fitted on the beds/baths/flexs columns, hence the matching column subset.
r2_test = ada_reg2.score(listings_test_signals[['beds', 'baths', 'flexs']], listings_test_labels)
print("Test R^2 Score: " + str(r2_test))