### Feature Engineering & Normalization

In [72]:
import pandas as pd
import numpy as np



Import the cleaned CSV file and load it into a DataFrame.

In [73]:
# Load the cleaned listings table (path is relative to the notebook's
# working directory) and preview the first rows.
LISTINGS_CSV = "cleaned_listings.csv"
df = pd.read_csv(LISTINGS_CSV)
df.head()


Unnamed: 0,price,room_type,neighbourhood,minimum_nights,availability_365
0,60,3,34,2,353
1,105,1,65,2,155
2,60,1,76,2,321
3,65,1,39,4,300
4,21,3,34,1,168


In [74]:
# Summarize column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6297 entries, 0 to 6296
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   price             6297 non-null   int64
 1   room_type         6297 non-null   int64
 2   neighbourhood     6297 non-null   int64
 3   minimum_nights    6297 non-null   int64
 4   availability_365  6297 non-null   int64
dtypes: int64(5)
memory usage: 246.1 KB


Creating new features


A low price combined with high availability might reflect several factors, such as listing quality or competitiveness.

In [75]:
# Disabled experiment (kept for reference): price relative to yearly availability.
# The "+ 1" in the denominator avoids a division by zero if availability_365 is 0.
# NOTE(review): this feature is computed from `price`, which is the prediction
# target further down (y = df['price']); if re-enabled it would stay in X and
# leak the target into the features.
# df['price_per_available_day'] = df['price'] / (df['availability_365'] + 1)

In [76]:
# Disabled experiment (kept for reference): z-score standardization of the
# numeric columns. StandardScaler's public home is sklearn.preprocessing; the
# import originally pointed at sklearn.discriminant_analysis, which only
# exposes it incidentally.
# NOTE(review): this fits the scaler on the full frame (including the target
# `price`) before the train/validation/test split; if re-enabled, fit on the
# training split only.
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# df[['price', 'minimum_nights', 'availability_365']] = scaler.fit_transform(df[['price', 'minimum_nights', 'availability_365']])

In [77]:
# Preview the frame again; the feature/scaling cells above are commented out,
# so the values are still the raw ones loaded from the CSV.
df.head()

Unnamed: 0,price,room_type,neighbourhood,minimum_nights,availability_365
0,60,3,34,2,353
1,105,1,65,2,155
2,60,1,76,2,321
3,65,1,39,4,300
4,21,3,34,1,168


Since the price becomes negative after standardization, we use the min-max scaler instead, which maps values to the [0, 1] range.

In [78]:
# Disabled experiment (kept for reference): min-max scaling of the numeric
# columns. Because this cell is commented out, the models below are trained
# on unscaled values.
# NOTE(review): this fits the scaler on the full frame (including the target
# `price`) before the train/validation/test split; if re-enabled, fit on the
# training split only.
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
# df[['price', 'minimum_nights', 'availability_365']] = scaler.fit_transform(df[['price', 'minimum_nights', 'availability_365']])
# df.head()


Performing a 3-way split: 70% training, 15% test and 15% validation

In [79]:
from sklearn.model_selection import train_test_split


# The listing price is the regression target; every other column is a feature.
y = df['price']
X = df.drop(columns='price')

# Stage 1: keep 70% of the rows for training and hold out the remaining 30%.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Stage 2: halve the hold-out into a test set and a validation set
# (about 15% of the data each). Both stages use the same fixed seed.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

Training all models and evaluating them on the validation set in a single for loop

In [80]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error

# Candidate regressors, keyed by the label shown in the results table.
# The tree-based estimators are seeded with random_state=42.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# Validation metrics, keyed by the column name used in the results table.
scorers = {
    "MAE": mean_absolute_error,
    "RMSE": root_mean_squared_error,
    "R2 Score": r2_score,
}

# Fit every candidate on the training split and score it on the validation split.
results = {}
for name, model in models.items():
    # fit() returns the fitted estimator, so prediction can be chained.
    y_val_pred = model.fit(X_train, y_train).predict(X_val)
    results[name] = {
        metric: score(y_val, y_val_pred) for metric, score in scorers.items()
    }

# The dict of dicts has models as columns; transpose so that each row is a
# model and each column is a metric.
results_df = pd.DataFrame(results).transpose()
print(results_df)


                         MAE        RMSE  R2 Score
Linear Regression  93.409013  317.405377  0.029165
Decision Tree      90.466748  325.262897 -0.019497
Random Forest      85.146870  316.673850  0.033635
Gradient Boosting  90.255036  319.857665  0.014106


Selecting the model with the highest validation R² and evaluating it on the held-out test set

In [None]:
# results_df is indexed by model name, so idxmax() on the "R2 Score" column
# returns the name of the model with the highest validation R².
best_model_name = results_df['R2 Score'].idxmax()
best_model = models[best_model_name]  # already fitted in the training loop

# Score the selected model once on the held-out test split.
y_test_pred = best_model.predict(X_test)
test_mae, test_rmse, test_r2 = (
    metric(y_test, y_test_pred)
    for metric in (mean_absolute_error, root_mean_squared_error, r2_score)
)

# Report the final test metrics.
print(f"Best Model: {best_model_name}")
print(f"Test MAE: {test_mae}")
print(f"Test RMSE: {test_rmse}")
print(f"Test R2 Score: {test_r2}")


Best Model: Random Forest
Test MAE: 90.31071765221753
Test RMSE: 372.35197735670204
Test R2 Score: -0.04479212817453471


In [82]:
import joblib
# Persist the selected, already-fitted model to disk in the working directory.
# joblib.dump returns the list of written file names, which the notebook
# displays as the cell output.
joblib.dump(best_model, "best_airbnb_model.pkl")


['best_airbnb_model.pkl']