In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Load the dataset
airbnb_data = pd.read_csv(r"/content/new_york_listings_2024.csv")

In [4]:
# Display full list of columns
print("Columns:")
print(airbnb_data.columns)

Columns:
Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'license', 'rating',
       'bedrooms', 'beds', 'baths'],
      dtype='object')


In [5]:
# Count the number of missing values in each column variable
missing_values_per_column = airbnb_data.isnull().sum()

# Print the count of missing values in each column variable
print("Missing Values per Column Variable:")
print(missing_values_per_column)


Missing Values per Column Variable:
id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
number_of_reviews_ltm             0
license                           0
rating                            0
bedrooms                          0
beds                              0
baths                             0
dtype: int64


In [6]:
airbnb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   name                            20758 non-null  object 
 2   host_id                         20758 non-null  int64  
 3   host_name                       20758 non-null  object 
 4   neighbourhood_group             20758 non-null  object 
 5   neighbourhood                   20758 non-null  object 
 6   latitude                        20758 non-null  float64
 7   longitude                       20758 non-null  float64
 8   room_type                       20758 non-null  object 
 9   price                           20758 non-null  float64
 10  minimum_nights                  20758 non-null  int64  
 11  number_of_reviews               20758 non-null  int64  
 12  last_review                     

In [7]:
airbnb_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,20758.0,3.034044e+17,3.901216e+17,2595.0,27088080.0,49930030.0,7.216019e+17,1.054376e+18
host_id,20758.0,174931600.0,172554100.0,1678.0,20417380.0,108727100.0,314410200.0,550403500.0
latitude,20758.0,40.7268,0.06029351,40.500314,40.68415,40.72282,40.7631,40.91115
longitude,20758.0,-73.93916,0.06140306,-74.24984,-73.98071,-73.94959,-73.91746,-73.71365
price,20758.0,187.7766,1022.797,10.0,80.0,125.0,199.0,100000.0
minimum_nights,20758.0,28.55844,33.53652,1.0,30.0,30.0,30.0,1250.0
number_of_reviews,20758.0,42.6426,73.56165,1.0,4.0,14.0,49.0,1865.0
reviews_per_month,20758.0,1.25791,1.904661,0.01,0.21,0.65,1.8,75.49
calculated_host_listings_count,20758.0,18.84411,70.91083,1.0,1.0,2.0,5.0,713.0
availability_365,20758.0,205.9903,135.0878,0.0,87.0,215.0,353.0,365.0


In [104]:
print(airbnb_data.columns)

Index(['id', 'neighbourhood', 'room_type', 'price', 'rating', 'bedrooms',
       'beds', 'baths', 'log10_minimum_nights', 'log10_number_of_reviews',
       'log10_calculated_host_listings_count', 'log10_availability_365'],
      dtype='object')


In [2]:
import numpy as np
!pip install feature-engine
import ast
import re
import numpy as np  # Import NumPy for handling numerical operations
import pandas as pd  # Import Pandas for data manipulation and analysis
import warnings  # Import Warnings to suppress unnecessary warnings
# Import RareLabelEncoder from feature_engine.encoding for encoding categorical features
from feature_engine.encoding import RareLabelEncoder

# Set Pandas options to display a maximum of 1000 rows
pd.set_option('display.max_rows', 1000)
# Load the dataset
airbnb_data = pd.read_csv(r"/content/new_york_listings_2024.csv")

main_label = 'price'
# Exclude 1% of smallest and 1% of highest prices
P = np.percentile(airbnb_data[main_label], [1, 99])
airbnb_data = airbnb_data[(airbnb_data[main_label] > P[0]) & (airbnb_data[main_label] < P[1])]
# log10-transform columns and group for larger bins
for col in ['minimum_nights', 'number_of_reviews', 'calculated_host_listings_count', 'availability_365']:
    airbnb_data[f'log10_{col}'] = airbnb_data[col].apply(lambda x: 1/5*round(5*np.log10(1+x)))
    airbnb_data = airbnb_data.drop([col], axis=1)
# set up the rare label encoder limiting number of categories to max_n_categories
for col in ['neighbourhood', 'room_type']:
    encoder = RareLabelEncoder(n_categories=1, max_n_categories=120, replace_with='Other', tol=20/airbnb_data.shape[0])
    airbnb_data[col] = encoder.fit_transform(airbnb_data[[col]])
# drop unused columns
cols2drop = ['name', 'host_id', 'host_name', 'latitude', 'longitude', 'license', 'neighbourhood_group',
             'last_review', 'reviews_per_month', 'number_of_reviews_ltm']
airbnb_data= airbnb_data.drop(cols2drop, axis=1)
print(airbnb_data.shape)
airbnb_data.sample(5).T

(20300, 12)


Unnamed: 0,10349,9406,12437,9417,15417
id,30279712,702914813859077943,7711899,749089594249468672,613747202944317309
neighbourhood,Brownsville,Bedford-Stuyvesant,Washington Heights,Bedford-Stuyvesant,Flatiron District
room_type,Entire home/apt,Entire home/apt,Shared room,Entire home/apt,Private room
price,123.0,250.0,43.0,145.0,156.0
rating,4.89,No rating,4.53,4.67,4.54
bedrooms,1,1,1,2,Studio
beds,1,1,1,2,1
baths,1,2,1,1,1
log10_minimum_nights,1.4,1.4,1.4,1.4,0.4
log10_number_of_reviews,2.0,0.4,1.2,0.8,1.4


In [3]:
# Display the data types of each column in the dataset
print(airbnb_data.dtypes)
airbnb_data.columns = airbnb_data.columns.str.strip()



# Define features (X) and target variable (y)
features = ['airbnb_data.columns ']

target = 'price'

# Split dataset into features (X) and target variable (y)
X = airbnb_data[airbnb_data.columns]
y = airbnb_data[target]

# Split data into training and validation sets
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and validation sets
print("Training set (X_train) shape:", X_train.shape)
print("Validation set (X_val) shape:", X_val.shape)
print("Training set (y_train) shape:", y_train.shape)
print("Validation set (y_val) shape:", y_val.shape)

id                                        int64
neighbourhood                            object
room_type                                object
price                                   float64
rating                                   object
bedrooms                                 object
beds                                      int64
baths                                    object
log10_minimum_nights                    float64
log10_number_of_reviews                 float64
log10_calculated_host_listings_count    float64
log10_availability_365                  float64
dtype: object
Training set (X_train) shape: (16240, 12)
Validation set (X_val) shape: (4060, 12)
Training set (y_train) shape: (16240,)
Validation set (y_val) shape: (4060,)


In [4]:
# Replace 'No rating' with NaN
X_train['rating'] = pd.to_numeric(X_train['rating'], errors='coerce')
X_val['rating'] = pd.to_numeric(X_val['rating'], errors='coerce')


X['rating'] = pd.to_numeric(X['rating'], errors='coerce')
X['rating'] = X['rating'].astype(float)

X['baths'] = X['baths'].replace('Not specified', np.nan)
X['baths'] = pd.to_numeric(X['baths'], errors='coerce')

X['bedrooms'] = X['bedrooms'].replace('studio', 0)
X['bedrooms'] = pd.to_numeric(X['bedrooms'], errors='coerce')


# Fill NaN values with a specific numerical value or strategy (e.g., mean, median)
X_train['rating'].fillna(-1, inplace=True)  # Replace NaN with -1
X_val['rating'].fillna(-1, inplace=True)

# Convert 'rating' column to float type
X_train['rating'] = X_train['rating'].astype(float)
X_val['rating'] = X_val['rating'].astype(float)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train['rating'].fillna(-1, inplace=True)  # Replace NaN with -1
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['rating'].fillna(-1, inplace=True)


In [5]:
# import numpy as np
# !pip install feature-engine
# Replace 'Not specified' with a consistent numerical representation (e.g., NaN)
X_train['baths'] = X_train['baths'].replace('Not specified', np.nan)
X_val['baths'] = X_val['baths'].replace('Not specified', np.nan)

# Convert the 'baths' column to numeric
X_train['baths'] = pd.to_numeric(X_train['baths'], errors='coerce')
X_val['baths'] = pd.to_numeric(X_val['baths'], errors='coerce')

# Handle any missing values, if present
# For example, filling missing values with the median
median_baths = X_train['baths'].median()
X_train['baths'].fillna(median_baths, inplace=True)
X_val['baths'].fillna(median_baths,inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train['baths'].fillna(median_baths, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['baths'].fillna(median_baths,inplace=True)


In [6]:
# Replace 'studio' with a consistent numerical representation
X_train['bedrooms'] = X_train['bedrooms'].replace('studio', 0)
X_val['bedrooms'] = X_val['bedrooms'].replace('studio', 0)

# Replace 'Studio' with a consistent numerical representation (e.g., 0 for studio)
X_train['bedrooms'] = X_train['bedrooms'].replace('Studio', 0)
X_val['bedrooms'] = X_val['bedrooms'].replace('Studio', 0)

# Convert the 'bedrooms' column to numeric
X_train['bedrooms'] = pd.to_numeric(X_train['bedrooms'], errors='coerce')
X_val['bedrooms'] = pd.to_numeric(X_val['bedrooms'], errors='coerce')

# Handle any missing values, if present
# For example, filling missing values with the median
median_bedrooms = X_train['bedrooms'].median()
X_train['bedrooms'].fillna(median_bedrooms, inplace=True)
X_val['bedrooms'].fillna(median_bedrooms,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train['bedrooms'].fillna(median_bedrooms, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['bedrooms'].fillna(median_bedrooms,inplace=True)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Assuming 'X_train' and 'X_val' are your feature datasets containing categorical variables
# Let's say 'X_train' and 'X_val' have multiple columns containing categorical values

# Columns to be one-hot encoded
categorical_cols = ['neighbourhood', 'room_type']

# One-hot encode the categorical columns in both training and validation datasets
encoder = OneHotEncoder(handle_unknown='ignore')

X_train_encoded = pd.DataFrame(encoder.fit_transform(X_train[categorical_cols]).toarray(), columns=encoder.get_feature_names_out(categorical_cols))
X_val_encoded = pd.DataFrame(encoder.transform(X_val[categorical_cols]).toarray(), columns=encoder.get_feature_names_out(categorical_cols))

# Drop the original categorical columns from both datasets
X_train.drop(categorical_cols, axis=1, inplace=True)
X_val.drop(categorical_cols, axis=1, inplace=True)

# Concatenate the encoded columns with the original datasets
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_val = pd.concat([X_val, X_val_encoded], axis=1)


In [8]:
# Check column names in X_train
print("Column Names in X_train:", X_train.columns)

# Inspect a sample of X_train
print("Sample of X_train:")
print(X_train.head())

# Check column names in y_train
print("Name of y_train:", y_train.name)

# Inspect a sample of y_train
print("Sample of y_train:")
print(y_train.head())

Column Names in X_train: Index(['id', 'price', 'rating', 'bedrooms', 'beds', 'baths',
       'log10_minimum_nights', 'log10_number_of_reviews',
       'log10_calculated_host_listings_count', 'log10_availability_365',
       ...
       'neighbourhood_West Village', 'neighbourhood_Williamsbridge',
       'neighbourhood_Williamsburg', 'neighbourhood_Windsor Terrace',
       'neighbourhood_Woodhaven', 'neighbourhood_Woodside',
       'room_type_Entire home/apt', 'room_type_Hotel room',
       'room_type_Private room', 'room_type_Shared room'],
      dtype='object', length=135)
Sample of X_train:
                id  price  rating  bedrooms  beds  baths  \
925   4.062402e+07   85.0   -1.00       2.0   8.0    2.0   
7659  9.169225e+17  175.0    5.00       2.0   2.0    1.0   
7772  4.717912e+07  120.0    4.74       2.0   2.0    2.0   
188   8.789687e+17   51.0   -1.00       1.0   1.0    1.5   
1092  5.255347e+07  198.0    4.52       1.0   1.0    1.0   

      log10_minimum_nights  log10_number

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identify numeric and non-numeric columns in X_train
numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
non_numeric_cols = X_train.select_dtypes(include=['object']).columns

# Define preprocessing steps for numeric and non-numeric columns
numeric_transformer = SimpleImputer(strategy='median')
non_numeric_transformer = SimpleImputer(strategy='most_frequent')

# Apply preprocessing steps to different types of columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('non_num', non_numeric_transformer, non_numeric_cols)
    ])

# Fit and transform X_train
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_train_numeric = X_train[numeric_cols]

# Transform X_val
X_val_preprocessed = preprocessor.transform(X_val)

# Impute missing values separately for the target variable y_train
target_imputer = SimpleImputer(strategy='median')
y_train_imputed = target_imputer.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_val_imputed = target_imputer.transform(y_val.values.reshape(-1,1)).ravel()


In [12]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode categorical variables
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train_preprocessed)

# Print the shape of the encoded data to verify
print("Shape of X_train_encoded:", X_train_encoded.shape)

Shape of X_train_encoded: (19794, 17372)


In [13]:
print("Shape of X_train_encoded:", X_train_encoded.shape)
print("Shape of y_train_imputed:", y_train_imputed.shape)

Shape of X_train_encoded: (19794, 17372)
Shape of y_train_imputed: (16240,)


In [18]:
from sklearn.preprocessing import OneHotEncoder

# Define preprocessing steps for non-numeric columns
non_numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))    # One-hot encode categorical variables
])

# Apply preprocessing steps to different types of columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('non_num', non_numeric_transformer, non_numeric_cols)
    ])
# Fit the ColumnTransformer on the training data
preprocessor.fit(X_train)

# Transform X_train
X_train_preprocessed = preprocessor.transform(X_train)

# Transform X_val
X_val_preprocessed = preprocessor.transform(X_val)
# Fit and transform X_train
X_train_numeric = X_train[numeric_cols]

# Transform X_val
X_val_preprocessed = preprocessor.transform(X_val)

In [20]:
print("Shape of X_train_preprocessed:", X_train_preprocessed.shape)
print("Shape of y_train_imputed:", y_train_imputed.shape)


Shape of X_train_preprocessed: (19794, 135)
Shape of y_train_imputed: (16240,)


In [29]:
# Subset X_train_preprocessed to match the number of samples in y_train_imputed
X_train_preprocessed_subset = X_train_preprocessed[:len(y_train_imputed)]

# Verify the shape of the subsetted input features
print("Shape of X_train_preprocessed_subset:", X_train_preprocessed_subset.shape)


Shape of X_train_preprocessed_subset: (16240, 135)


In [33]:
print("Shape of y_val_imputed:", y_val_imputed.shape)
print("Shape of y_pred_linear_reg:", y_pred_linear_reg.shape)


Shape of y_val_imputed: (4060,)
Shape of y_pred_linear_reg: (16240,)


In [34]:
# Subset y_pred_linear_reg to match the number of samples in y_val_imputed
y_pred_linear_reg_subset = y_pred_linear_reg[:len(y_val_imputed)]
# Verify the shape of the subsetted input features
print("Shape of y_pred_linear_reg_subset:", y_pred_linear_reg_subset.shape)

Shape of y_pred_linear_reg_subset: (4060,)


In [35]:
from sklearn.linear_model import LinearRegression

# Instantiate the Linear Regression model
linear_reg_model = LinearRegression()

# Fit the Linear Regression model on the imputed data
linear_reg_model.fit(X_train_preprocessed_subset , y_train_imputed)

# Make predictions on the validation data
y_pred_linear_reg = linear_reg_model.predict(X_train_preprocessed_subset)

# Now you can evaluate the performance of the Linear Regression model, e.g., by calculating the mean squared error
from sklearn.metrics import mean_squared_error

mse_linear_reg = mean_squared_error(y_val_imputed, y_pred_linear_reg_subset)
print("Mean Squared Error (Linear Regression):", mse_linear_reg)



Mean Squared Error (Linear Regression): 32206.95696660115


In [38]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Calculate R-squared (R2) score
r2 = r2_score(y_val_imputed, y_pred_linear_reg_subset)
print("R-squared (R2) score:", r2)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_val_imputed, y_pred_linear_reg_subset)
print("Mean Absolute Error (MAE):", mae)

# Calculate Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_val_imputed, y_pred_linear_reg_subset, squared=False)
print("Root Mean Squared Error (RMSE):", rmse)


R-squared (R2) score: -1.119978944965462
Mean Absolute Error (MAE): 119.49307468209585
Root Mean Squared Error (RMSE): 179.4629682318922




In [40]:
print("Shape of y_val_imputed:", y_val_imputed.shape)
print("Shape of y_pred_naive_bayes:", y_pred_naive_bayes.shape)


Shape of y_val_imputed: (4060,)
Shape of y_pred_naive_bayes: (7359,)


In [46]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error

# Instantiate the Naive Bayes model
naive_bayes_model = GaussianNB()

# Fit the Naive Bayes model on the training data
naive_bayes_model.fit(X_train_preprocessed_subset, y_train_imputed)

# Make predictions on the validation data
y_pred_naive_bayes = naive_bayes_model.predict(X_train_preprocessed_subset)

# Now you can evaluate the performance of the Naive Bayes model by calculating the mean squared error
mse_naive_bayes = mean_squared_error(y_val_imputed, y_pred_naive_bayes[:len(y_val_imputed)])
print("Mean Squared Error (Naive Bayes):", mse_naive_bayes)



Mean Squared Error (Naive Bayes): 77551.01551724138


In [47]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Calculate R-squared (R2) score
r2_nb = r2_score(y_val_imputed, y_pred_naive_bayes_subset)
print("R-squared (R2) score (Naive Bayes):", r2_nb)

# Calculate Mean Absolute Error (MAE)
mae_nb = mean_absolute_error(y_val_imputed, y_pred_naive_bayes_subset)
print("Mean Absolute Error (MAE) (Naive Bayes):", mae_nb)

# Calculate Root Mean Squared Error (RMSE)
rmse_nb = mean_squared_error(y_val_imputed, y_pred_naive_bayes_subset, squared=False)
print("Root Mean Squared Error (RMSE) (Naive Bayes):", rmse_nb)


R-squared (R2) score (Naive Bayes): -4.174492635964833
Mean Absolute Error (MAE) (Naive Bayes): 236.7716748768473
Root Mean Squared Error (RMSE) (Naive Bayes): 280.3773710502407




In [52]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Instantiate the Decision Tree regressor
decision_tree_model = DecisionTreeRegressor()

# Fit the model on the training data
decision_tree_model.fit(X_train_preprocessed_subset, y_train_imputed)

# Make predictions on the validation data
y_pred_decision_tree = decision_tree_model.predict(X_val_preprocessed)

# Trim y_pred_decision_tree to match the shape of y_val_imputed
y_pred_decision_tree_subset = y_pred_decision_tree[:len(y_val_imputed)]

# Calculate R-squared (R2) score
r2_decision_tree = r2_score(y_val_imputed, y_pred_decision_tree_subset)
print("R-squared (R2) score (Decision Tree):", r2_decision_tree)

# Calculate Mean Absolute Error (MAE)
mae_decision_tree = mean_absolute_error(y_val_imputed, y_pred_decision_tree_subset)
print("Mean Absolute Error (MAE) (Decision Tree):", mae_decision_tree)

# Calculate Root Mean Squared Error (RMSE)
rmse_decision_tree = mean_squared_error(y_val_imputed, y_pred_decision_tree_subset, squared=False)
print("Root Mean Squared Error (RMSE) (Decision Tree):", rmse_decision_tree)


R-squared (R2) score (Decision Tree): 0.9999910343671043
Mean Absolute Error (MAE) (Decision Tree): 0.03423645320197044
Root Mean Squared Error (RMSE) (Decision Tree): 0.3690621852096529




In [53]:
# Now you can evaluate the performance of the Decision Tree model by calculating the mean squared error
mse_decision_tree = mean_squared_error(y_val_imputed, y_pred_decision_tree_subset)
print("Mean Squared Error (Decision Tree):", mse_decision_tree)

Mean Squared Error (Decision Tree): 0.13620689655172413


In [61]:
print("Shape of y_val_imputed:", y_val_imputed.shape)
print("Shape of y_pred_xgb_subset:", y_pred_xgb_subset.shape)




Shape of y_val_imputed: (4060,)
Shape of y_pred_xgb_subset: (4060,)


In [63]:
# Install XGBoost if you haven't already
# !pip install xgboost

from sklearn.metrics import mean_squared_error

# Import necessary modules
import xgboost as xgb_regressor
from sklearn.metrics import mean_squared_error


X_train_preprocessed_subset = X_train_preprocessed[:len(y_train_imputed)]

# Fit the model on the training data
xgb_regressor.fit(X_train_preprocessed_subset, y_train_imputed)

# Make predictions on the validation data
y_pred_xgb = xgb_regressor.predict(X_train_preprocessed_subset)

# Evaluate the model's performance
mse_xgb = mean_squared_error(y_val_imputed, y_pred_xgb_subset)
print("Mean Squared Error (XGBoost):", mse_xgb)


Mean Squared Error (XGBoost): 32008.33136893449


In [64]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Calculate R-squared (R2) score
r2_xgb = r2_score(y_val_imputed, y_pred_xgb_subset)
print("R-squared (R2) score (XGBoost):", r2_xgb)

# Calculate Mean Absolute Error (MAE)
mae_xgb = mean_absolute_error(y_val_imputed, y_pred_xgb_subset)
print("Mean Absolute Error (MAE) (XGBoost):", mae_xgb)

# Calculate Root Mean Squared Error (RMSE)
rmse_xgb = mean_squared_error(y_val_imputed, y_pred_xgb_subset, squared=False)
print("Root Mean Squared Error (RMSE) (XGBoost):", rmse_xgb)


R-squared (R2) score (XGBoost): -1.1069046863380119
Mean Absolute Error (MAE) (XGBoost): 118.97399168061506
Root Mean Squared Error (RMSE) (XGBoost): 178.9087235685686




In [None]:
linear_reg_model.fit(X_train_preprocessed_subset , y_train_imputed)

naive_bayes_model.fit(X_train_preprocessed_subset, y_train_imputed)

decision_tree_model.fit(X_train_preprocessed_subset, y_train_imputed)

xgb_regressor.fit(X_train_preprocessed_subset, y_train_imputed)
