In [15]:
# Read the imputed dataset from simple_imputed_df.csv and preview its first rows.
import pandas as pd

simple_imputed_df = pd.read_csv('simple_imputed_df.csv')
print(simple_imputed_df.head(5))

        id   host_id  host_response_time  host_response_rate  \
0  1489424   5294164                 1.0               100.0   
1  2992450   4621559                 3.0               100.0   
2  3820211  19648678                 1.0               100.0   
3  5651579  29288920                 1.0               100.0   
4  6623339  19648678                 1.0               100.0   

   host_acceptance_rate  host_is_superhost  host_identity_verified  \
0                  87.0                  0                       1   
1                 100.0                  0                       0   
2                  66.0                  0                       1   
3                  99.0                  0                       1   
4                  66.0                  0                       1   

  neighbourhood_cleansed  latitude  longitude  ... number_of_reviews  \
0        FOURTEENTH WARD  42.66719  -73.81580  ...               248   
1             THIRD WARD  42.65789  -73.75370  ...

## Candidate Models
1. Linear Regression: A good starting point for regression tasks. If the relationship between the features and the target variable is approximately linear, linear regression can provide a decent model with the benefit of interpretability.
2. Decision Tree Regressor: A non-linear model that can capture complex relationships between features and the target variable. It is easy to interpret but can be prone to overfitting.
3. Random Forest Regressor: An ensemble of decision trees that can model more complex relationships and is less likely to overfit compared to a single decision tree.
4. Gradient Boosting Machines (GBM): Models like XGBoost, LightGBM, or CatBoost are powerful for regression tasks and can handle a mix of categorical and numerical data well. They are often among the top performers in machine learning competitions.
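5. Support Vector Regression (SVR): Effective in high-dimensional spaces and, with a non-linear kernel, when the relationship between the features and the target variable is non-linear. Training time grows faster than linearly with the number of samples, so it can be slow on large datasets.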

### Linear Regression

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

# Features used by every model in this notebook: four categorical columns and
# the two coordinates. The regression target is the `price` column.
X = simple_imputed_df[['room_type', 'City', 'State',
                       'latitude',
                       'longitude', 'instant_bookable']]
y = simple_imputed_df['price']

# Preprocessing numerical and categorical data
numeric_features = ['latitude', 'longitude']
numeric_transformer = StandardScaler()

categorical_features = ['room_type',
                        'instant_bookable', 'City', 'State']
# handle_unknown='ignore': a category that appears only in the test split is
# encoded as an all-zero row instead of raising at predict time (the default
# is 'error'). When every test category was seen in training, the encoding is
# identical to the default.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data.
# This `preprocessor` object is reused by the other model pipelines below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Baseline: ordinary least squares on the preprocessed features.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# 80/20 hold-out split with a fixed seed so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report root-mean-squared error on the held-out rows (same units as `price`).
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")

RMSE: 175.48697070499924


### Decision Tree

In [20]:
from sklearn.tree import DecisionTreeRegressor, plot_tree
import matplotlib.pyplot as plt

# Same seeded 80/20 hold-out split as the other model cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Regression tree with capped depth and minimum split/leaf sizes, placed
# behind the `preprocessor` defined in the linear-regression cell.
tree_model = Pipeline([
    ('preprocessor', preprocessor),
    (
        'regressor',
        DecisionTreeRegressor(
            random_state=42,
            max_depth=10,          # limit the depth of the tree
            min_samples_split=10,  # at least 10 samples to split a node
            min_samples_leaf=5,    # at least 5 samples at each leaf
        ),
    ),
])

# Fit on the training rows, then score the held-out rows.
y_pred = tree_model.fit(X_train, y_train).predict(X_test)

# Root-mean-squared error on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")



RMSE: 161.13818341876885


### Random Forest

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Same seeded 80/20 hold-out split as the other model cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Forest of 100 trees with a fixed seed, placed behind the `preprocessor`
# defined in the linear-regression cell.
forest_regressor = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', forest_regressor),
])

# Fit on the training rows, then score the held-out rows.
y_pred = rf_model.fit(X_train, y_train).predict(X_test)

# Root-mean-squared error on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")


RMSE: 152.7064245501371


### Support Vector Regression (SVR)

In [23]:
from sklearn.svm import SVR

# Same seeded 80/20 hold-out split as the other model cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Support-vector regressor with a linear kernel; the kernel and the other
# hyper-parameters can be changed here.
# NOTE: this rebinds `model`, which the linear-regression cell also assigns.
svr_regressor = SVR(kernel='linear')
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', svr_regressor),
])

# Fit on the training rows, then score the held-out rows.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Root-mean-squared error on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")

RMSE: 186.09158236868342


### GBM

In [24]:
from sklearn.ensemble import GradientBoostingRegressor

# Same seeded 80/20 hold-out split as the other model cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Gradient-boosted trees: 100 boosting stages of depth-3 trees with a 0.1
# learning rate, placed behind the `preprocessor` defined in the
# linear-regression cell.
gbm_regressor = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gbm_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', gbm_regressor),
])

# Fit on the training rows, then score the held-out rows.
y_pred = gbm_model.fit(X_train, y_train).predict(X_test)

# Root-mean-squared error on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")

RMSE: 163.28012927610666


### Ensemble Learning (Bagging)

In [17]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

# Re-create the seeded 80/20 hold-out split so this cell does not depend on
# X_train / X_test left over from whichever cell happened to run before it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Bagging model: 100 bootstrap-aggregated decision trees.
# The base learner is passed positionally on purpose. Its keyword was
# `base_estimator` before scikit-learn 1.2 and is `estimator` from 1.2 on
# (the old spelling was removed in 1.4); the first positional parameter is the
# base learner in every version, so this form works on both.
bagging_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('bagging', BaggingRegressor(DecisionTreeRegressor(), n_estimators=100, random_state=42))
])

# Train the model
bagging_model.fit(X_train, y_train)
y_pred = bagging_model.predict(X_test)

# Calculate RMSE on the held-out rows
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"Bagging RMSE: {rmse}")



Bagging RMSE: 152.72974844401946


In [3]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
simple_imputed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253798 entries, 0 to 253797
Data columns (total 24 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              253798 non-null  int64  
 1   host_id                         253798 non-null  int64  
 2   host_response_time              253798 non-null  float64
 3   host_response_rate              253798 non-null  float64
 4   host_acceptance_rate            253798 non-null  float64
 5   host_is_superhost               253798 non-null  int64  
 6   host_identity_verified          253798 non-null  int64  
 7   neighbourhood_cleansed          253798 non-null  object 
 8   latitude                        253798 non-null  float64
 9   longitude                       253798 non-null  float64
 10  room_type                       253798 non-null  object 
 11  accommodates                    253798 non-null  int64  
 12  beds            