In [1]:
import matplotlib.pyplot as plt
import matplotlib.style
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# CSV path for the 'df_knn' dataset.
# NOTE(review): no visible cell reads this path - confirm it is still needed.
file_path_knn = 'df_knn.csv'

In [5]:
# CSV path that is loaded into df_knn in the next cell.
# The name suggests outliers were already treated upstream - TODO confirm.
file_path_knn_outlier_treated = 'df_knn_o.csv'

In [7]:
# Load the modelling dataset; every transformed frame below is derived from this one.
df_knn = pd.read_csv(file_path_knn_outlier_treated)

In [9]:
# Summarize column dtypes and non-null counts of the loaded frame.
df_knn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54117 entries, 0 to 54116
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   accommodates                  54117 non-null  float64
 1   bathrooms                     54117 non-null  float64
 2   review_scores_rating          54117 non-null  float64
 3   bedrooms                      54117 non-null  float64
 4   beds                          54117 non-null  float64
 5   log_price                     54117 non-null  float64
 6   room_type_Entire home/apt     54117 non-null  float64
 7   room_type_Private room        54117 non-null  float64
 8   room_type_Shared room         54117 non-null  float64
 9   cancellation_policy_flexible  54117 non-null  float64
 10  cancellation_policy_moderate  54117 non-null  float64
 11  cancellation_policy_strict    54117 non-null  float64
 12  cleaning_fee_False            54117 non-null  float64
 13  c

In [11]:
# Show the column order; the transform cells below select the first five columns by position.
df_knn.columns

Index(['accommodates', 'bathrooms', 'review_scores_rating', 'bedrooms', 'beds',
       'log_price', 'room_type_Entire home/apt', 'room_type_Private room',
       'room_type_Shared room', 'cancellation_policy_flexible',
       'cancellation_policy_moderate', 'cancellation_policy_strict',
       'cleaning_fee_False', 'cleaning_fee_True', 'instant_bookable_f',
       'instant_bookable_t'],
      dtype='object')

### Creating Four Normalized Data Frames

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [15]:
from scipy.stats import boxcox

In [17]:
# Untransformed baseline: an independent copy of the loaded frame.
df_original = df_knn.copy()

In [19]:
# Z-standardize the first five columns (the non-dummy numeric features).
#
# The scaler's mean/std are learned from the training rows only, so held-out
# rows cannot influence the transform (avoids train/test leakage). Splitting
# row positions with the same test_size / random_state as the later
# train_test_split calls selects the same training rows used for modelling.
train_rows, _ = train_test_split(
    np.arange(len(df_knn)), test_size=0.2, random_state=42
)

scaler = StandardScaler()
scaler.fit(df_knn.iloc[train_rows, :5])

# Apply the training-set statistics to every row (train and test alike).
df_z_standardized = df_knn.copy()
df_z_standardized.iloc[:, :5] = scaler.transform(df_knn.iloc[:, :5])

In [21]:
# Min-max scale the first five columns (the non-dummy numeric features).
#
# The per-column min/max are learned from the training rows only, so held-out
# rows cannot influence the transform (avoids train/test leakage). Splitting
# row positions with the same test_size / random_state as the later
# train_test_split calls selects the same training rows used for modelling.
# Consequence: held-out values may fall slightly outside [0, 1].
train_rows, _ = train_test_split(
    np.arange(len(df_knn)), test_size=0.2, random_state=42
)

scaler = MinMaxScaler()
scaler.fit(df_knn.iloc[train_rows, :5])

# Apply the training-set range to every row (train and test alike).
df_min_max_scaled = df_knn.copy()
df_min_max_scaled.iloc[:, :5] = scaler.transform(df_knn.iloc[:, :5])

In [23]:
# Box-Cox transform the first five columns (the non-dummy numeric features).
#
# The Box-Cox lambda of each column is estimated from the training rows only
# and then applied to all rows, so held-out rows cannot influence the
# transform (avoids train/test leakage). Splitting row positions with the same
# test_size / random_state as the later train_test_split calls selects the
# same training rows used for modelling.
train_rows, _ = train_test_split(
    np.arange(len(df_knn)), test_size=0.2, random_state=42
)

df_box_cox = df_knn.copy()
for col in df_knn.columns[:5]:
    # Box-Cox requires strictly positive input; the +1 shift assumes the
    # column has no values <= -1 - TODO confirm for new data.
    shifted = df_knn[col].to_numpy() + 1
    # With lmbda=None boxcox returns (transformed, fitted_lambda).
    _, fitted_lambda = boxcox(shifted[train_rows])
    # With an explicit lmbda boxcox returns only the transformed array.
    df_box_cox[col] = boxcox(shifted, lmbda=fitted_lambda)

In [25]:
# log(1 + x) on the first five columns; the remaining columns are copied unchanged.
# NOTE(review): this frame is not split or evaluated in the visible cells.
log_of_numeric = np.log1p(df_knn.iloc[:, :5])
df_log_transformed = df_knn.copy()
df_log_transformed.iloc[:, :5] = log_of_numeric

### Applying Train Test Split

In [27]:
from sklearn.model_selection import train_test_split

In [29]:
# log_price is the value to predict; every other column is a model input.
target = 'log_price'
features = df_knn.drop(columns=[target]).columns

In [31]:
# Hold out 20% of the untransformed rows; the fixed seed keeps the split reproducible.
(
    X_train_original,
    X_test_original,
    y_train_original,
    y_test_original,
) = train_test_split(
    df_original[features],
    df_original[target],
    test_size=0.2,
    random_state=42,
)

In [33]:
# Same 80/20 split (same seed) applied to the z-standardized frame.
(
    X_train_z,
    X_test_z,
    y_train_z,
    y_test_z,
) = train_test_split(
    df_z_standardized[features],
    df_z_standardized[target],
    test_size=0.2,
    random_state=42,
)

In [35]:
# Same 80/20 split (same seed) applied to the min-max scaled frame.
# NOTE(review): no visible cell evaluates a model on this split.
(
    X_train_minmax,
    X_test_minmax,
    y_train_minmax,
    y_test_minmax,
) = train_test_split(
    df_min_max_scaled[features],
    df_min_max_scaled[target],
    test_size=0.2,
    random_state=42,
)

In [37]:
# Same 80/20 split (same seed) applied to the Box-Cox transformed frame.
(
    X_train_boxcox,
    X_test_boxcox,
    y_train_boxcox,
    y_test_boxcox,
) = train_test_split(
    df_box_cox[features],
    df_box_cox[target],
    test_size=0.2,
    random_state=42,
)

### KNN Regressor

In [39]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [41]:
def apply_knn_regression(X_train, X_test, y_train, y_test, n_neighbors=5):
    """Fit a k-NN regressor on the training split and score it on the test split.

    Parameters
    ----------
    X_train, X_test : feature matrices for fitting and for evaluation.
    y_train, y_test : target values matching the two feature matrices.
    n_neighbors : int, default 5
        Number of neighbours used for each prediction. The default keeps the
        behaviour of existing four-argument calls unchanged.

    Returns
    -------
    (mse, r2) : mean squared error and R² of the test-set predictions.
    """
    knn_regressor = KNeighborsRegressor(n_neighbors=n_neighbors)
    knn_regressor.fit(X_train, y_train)

    y_pred = knn_regressor.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2

In [43]:
# k-NN scores on the untransformed features.
# NOTE: mse_original / r2_original are reassigned by the later model sections.
mse_original, r2_original = apply_knn_regression(
    X_train_original,
    X_test_original,
    y_train_original,
    y_test_original,
)
print(f"Original Data - MSE: {mse_original:.2f}, R²: {r2_original:.2f}")

Original Data - MSE: 0.27, R²: 0.49


In [45]:
# k-NN scores on the z-standardized features.
# NOTE: mse_z / r2_z are reassigned by the later model sections.
mse_z, r2_z = apply_knn_regression(
    X_train_z,
    X_test_z,
    y_train_z,
    y_test_z,
)
print(f"Z-Standardized Data - MSE: {mse_z:.2f}, R²: {r2_z:.2f}")

Z-Standardized Data - MSE: 0.27, R²: 0.49


In [47]:
# k-NN scores on the Box-Cox transformed features.
mse_knn_boxcox, r2_knn_boxcox = apply_knn_regression(
    X_train_boxcox,
    X_test_boxcox,
    y_train_boxcox,
    y_test_boxcox,
)
print(f"Box-Cox Data (k-NN) - MSE: {mse_knn_boxcox:.2f}, R²: {r2_knn_boxcox:.2f}")

Box-Cox Data (k-NN) - MSE: 0.28, R²: 0.47


### Decision Tree

In [49]:
from sklearn.tree import DecisionTreeRegressor

In [51]:
def apply_decision_tree_regression(X_train, X_test, y_train, y_test,
                                   max_depth=None, random_state=42):
    """Fit a decision-tree regressor on the training split and score it on the test split.

    Parameters
    ----------
    X_train, X_test : feature matrices for fitting and for evaluation.
    y_train, y_test : target values matching the two feature matrices.
    max_depth : int or None, default None
        Maximum tree depth. None leaves the depth unrestricted, which is what
        the estimator did before this parameter was exposed.
    random_state : int, default 42
        Seed passed to the estimator so repeated runs give the same tree.

    Returns
    -------
    (mse, r2) : mean squared error and R² of the test-set predictions.
    """
    tree_regressor = DecisionTreeRegressor(
        max_depth=max_depth, random_state=random_state
    )
    tree_regressor.fit(X_train, y_train)

    y_pred = tree_regressor.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2

In [53]:
# Decision-tree scores on the untransformed features.
# NOTE: this overwrites the mse_original / r2_original values set in the k-NN section.
mse_original, r2_original = apply_decision_tree_regression(
    X_train_original,
    X_test_original,
    y_train_original,
    y_test_original,
)
print(f"Original Data - MSE: {mse_original:.2f}, R²: {r2_original:.2f}")

Original Data - MSE: 0.32, R²: 0.40


In [55]:
# Decision-tree scores on the z-standardized features.
# NOTE: this overwrites the mse_z / r2_z values set in the k-NN section.
mse_z, r2_z = apply_decision_tree_regression(
    X_train_z,
    X_test_z,
    y_train_z,
    y_test_z,
)
print(f"Z-Standardized Data - MSE: {mse_z:.2f}, R²: {r2_z:.2f}")

Z-Standardized Data - MSE: 0.32, R²: 0.40


In [57]:
# Decision-tree scores on the Box-Cox transformed features.
mse_tree_boxcox, r2_tree_boxcox = apply_decision_tree_regression(
    X_train_boxcox,
    X_test_boxcox,
    y_train_boxcox,
    y_test_boxcox,
)
print(f"Box-Cox Data (Decision Tree) - MSE: {mse_tree_boxcox:.2f}, R²: {r2_tree_boxcox:.2f}")

Box-Cox Data (Decision Tree) - MSE: 0.32, R²: 0.40


### Linear Regression

In [59]:
from sklearn.linear_model import LinearRegression

In [61]:
def apply_linear_regression(X_train, X_test, y_train, y_test):
    """Fit a default LinearRegression on the training split and score it on the test split.

    Returns a ``(mse, r2)`` tuple: the mean squared error and the R² of the
    predictions made for ``X_test``, measured against ``y_test``.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return mean_squared_error(y_test, predictions), r2_score(y_test, predictions)

In [63]:
# Linear-regression scores on the untransformed features.
# NOTE: this overwrites the mse_original / r2_original values set in earlier sections.
mse_original, r2_original = apply_linear_regression(
    X_train_original,
    X_test_original,
    y_train_original,
    y_test_original,
)
print(f"Original Data - MSE: {mse_original:.2f}, R²: {r2_original:.2f}")

Original Data - MSE: 0.26, R²: 0.51


In [65]:
# Linear-regression scores on the z-standardized features.
# NOTE: this overwrites the mse_z / r2_z values set in earlier sections.
mse_z, r2_z = apply_linear_regression(
    X_train_z,
    X_test_z,
    y_train_z,
    y_test_z,
)
print(f"Z-Standardized Data - MSE: {mse_z:.2f}, R²: {r2_z:.2f}")

Z-Standardized Data - MSE: 0.26, R²: 0.51


In [67]:
# Linear-regression scores on the Box-Cox transformed features.
# NOTE(review): the recorded R² for this run is far below the other
# linear-regression runs; check the value range of the Box-Cox columns.
# NOTE: mse_boxcox / r2_boxcox are reassigned in the Random Forest section.
mse_boxcox, r2_boxcox = apply_linear_regression(
    X_train_boxcox,
    X_test_boxcox,
    y_train_boxcox,
    y_test_boxcox,
)
print(f"Box-Cox Data - MSE: {mse_boxcox:.2f}, R²: {r2_boxcox:.2f}")

Box-Cox Data - MSE: 0.52, R²: 0.02


### Random Forest

In [69]:
from sklearn.ensemble import RandomForestRegressor

In [71]:
def apply_random_forest_regression(X_train, X_test, y_train, y_test,
                                   n_estimators=100, random_state=42, n_jobs=None):
    """Fit a random-forest regressor on the training split and score it on the test split.

    Parameters
    ----------
    X_train, X_test : feature matrices for fitting and for evaluation.
    y_train, y_test : target values matching the two feature matrices.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 42
        Seed passed to the estimator so repeated runs give the same forest.
    n_jobs : int or None, default None
        Forwarded to the estimator; None keeps the estimator's own default,
        matching calls made before this parameter existed.

    Returns
    -------
    (mse, r2) : mean squared error and R² of the test-set predictions.
    """
    rf_regressor = RandomForestRegressor(
        n_estimators=n_estimators, random_state=random_state, n_jobs=n_jobs
    )
    rf_regressor.fit(X_train, y_train)

    y_pred = rf_regressor.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2

In [73]:
# Random-forest scores on the untransformed features.
# NOTE: this overwrites the mse_original / r2_original values set in earlier sections.
mse_original, r2_original = apply_random_forest_regression(
    X_train_original,
    X_test_original,
    y_train_original,
    y_test_original,
)
print(f"Original Data - MSE: {mse_original:.2f}, R²: {r2_original:.2f}")

Original Data - MSE: 0.25, R²: 0.52


In [74]:
# Random-forest scores on the z-standardized features.
# NOTE: this overwrites the mse_z / r2_z values set in earlier sections.
mse_z, r2_z = apply_random_forest_regression(
    X_train_z,
    X_test_z,
    y_train_z,
    y_test_z,
)
print(f"Z-Standardized Data - MSE: {mse_z:.2f}, R²: {r2_z:.2f}")

Z-Standardized Data - MSE: 0.25, R²: 0.52


In [75]:
# Random-forest scores on the Box-Cox transformed features.
# NOTE: this overwrites the mse_boxcox / r2_boxcox values set in the Linear Regression section.
mse_boxcox, r2_boxcox = apply_random_forest_regression(
    X_train_boxcox,
    X_test_boxcox,
    y_train_boxcox,
    y_test_boxcox,
)
print(f"Box-Cox Data - MSE: {mse_boxcox:.2f}, R²: {r2_boxcox:.2f}")

Box-Cox Data - MSE: 0.25, R²: 0.52
