In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import minimize

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, RandomizedSearchCV
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, make_scorer, mean_squared_error, root_mean_squared_error

In [3]:
from sklearn.ensemble import RandomForestRegressor

In [14]:
def rounded_rmse(y_true, y_pred):
    """Root-mean-squared error computed on integer-rounded predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Raw model predictions; rounded with ``np.round`` before scoring.

    Returns:
        The square root of the mean squared error between ``y_true`` and the
        rounded predictions.
    """
    mse_of_rounded = mean_squared_error(y_true, np.round(y_pred))
    return np.sqrt(mse_of_rounded)


# Scorer wrapper for use in hyperparameter searches. greater_is_better=False
# marks the metric as a loss, so the scorer reports its negated value.
rmse_scorer = make_scorer(rounded_rmse, greater_is_better=False)

In [43]:
# Load the training table and the table of rows to score for the submission.
df, df_submission = (
    pd.read_csv(csv_path)
    for csv_path in ("datas/df_1124v4_train.csv", "datas/df_1124v4_test.csv")
)

In [44]:
# Target is the "price" column; every other column is used as a feature.
y = df["price"]
X = df.drop(columns=["price"])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
)

In [32]:
# Hyperparameter search space sampled by the randomized search below.
rf_param_grid = {
    'n_estimators': [500, 700, 900],              # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],              # Maximum depth of each tree
    'min_samples_split': [2, 5, 10, 15],          # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4, 8],             # Minimum samples required at each leaf node
    'max_features': ['sqrt', 'log2', None],       # Number of features to consider at each split
    'bootstrap': [True, False]                    # Whether to use bootstrap samples
}


# Base estimator: a random forest regressor with a fixed seed
rf = RandomForestRegressor(random_state=23)

# Randomized search (not an exhaustive grid search): only n_iter candidates are
# sampled from rf_param_grid. The variable keeps its historical name
# `grid_search` in case other cells refer to it.
grid_search = RandomizedSearchCV(
    estimator=rf,
    n_iter=20,
    param_distributions=rf_param_grid,
    cv=5,  # 5-fold cross-validation
    scoring="neg_root_mean_squared_error",  # Negated RMSE (higher is better)
    n_jobs=-1,  # Use all available cores
    verbose=2,
    random_state=23,  # Seed the candidate sampling so re-runs evaluate the same 20 settings
)

# Run the search on the training split only; X_test / y_test stay held out
grid_search.fit(X_train, y_train)

# Best candidate refit on the whole training split, plus its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(f"Best RF parameters: {best_params}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=900; total time=  13.4s
[CV] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=900; total time=  13.2s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=700; total time=  25.4s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=900; total time=  40.2s




Best RF parameters: {'n_estimators': 700, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 20, 'bootstrap': True}
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=900; total time=  39.8s
[CV] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=500; total time=   6.3s
[CV] END bootstrap=False, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=15, n_estimators=900; total time= 7.0min
[CV] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=700; total time= 3.6min
[CV] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time= 3.6min
[CV] END bootstrap=False, max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=2, n_estimators=900; total time= 6.8min
[CV] END bootstrap=True, max_

In [33]:
# Score the tuned forest on the held-out split. Predictions are rounded to the
# nearest integer and then limited to the range [0, 5].
y_pred = best_model.predict(X_test)
y_pred_rounded = np.round(y_pred).clip(0, 5)

# RMSE of the rounded, clipped predictions against the held-out targets
final_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_rounded))
print(f"Final RMSE: {final_rmse}")

Final RMSE: 0.3756099074142889


In [42]:
# Importances reported by the fitted forest, paired with the training columns
feature_importances = best_model.feature_importances_
feature_names = X_train.columns

# One row per feature, ordered from the largest importance to the smallest
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Show the 20 highest-ranked features
print(importance_df.head(20))

                                          Feature  Importance
270                        room_type_Private room    0.244749
23                                 minimum_nights    0.129754
1                                       longitude    0.062244
6                             host_listings_count    0.044434
14                                   accommodates    0.042020
268                     room_type_Entire home/apt    0.039385
0                                        latitude    0.035906
15                                      bathrooms    0.023918
41                               shared_bathrooms    0.023563
12   calculated_host_listings_count_private_rooms    0.023071
16                                       bedrooms    0.020575
7                       host_total_listings_count    0.020121
603                   property_type_Room in hotel    0.014816
20                                availability_90    0.014809
38                                amenities_count    0.014049
19      

In [34]:
# Keep the row identifiers aside; they are not passed to the model
ids = df_submission['id'].values

# Feature matrix for the submission rows: every column except the ID
X_submission = df_submission.drop(columns='id')

# Predict, then apply the same round-and-clip post-processing to [0, 5]
predictions = best_model.predict(X_submission)
rounded_predictions = np.round(predictions).clip(0, 5)

# Two-column table pairing each ID with its post-processed prediction
submission_df = pd.DataFrame({'id': ids, 'price': rounded_predictions})

In [35]:
# Write the submission table to disk, leaving out the DataFrame index
submission_df.to_csv(path_or_buf='submissions/rf_v2.csv', index=False)

In [None]:
# Train on all data before submitting

In [None]:
# Hyperparameter search space sampled by the randomized search below.
rf_param_grid = {
    'n_estimators': [500, 700, 900],              # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],              # Maximum depth of each tree
    'min_samples_split': [2, 5, 10, 15],          # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4, 8],             # Minimum samples required at each leaf node
    'max_features': ['sqrt', 'log2', None],       # Number of features to consider at each split
    'bootstrap': [True, False]                    # Whether to use bootstrap samples
}


# Base estimator: a random forest regressor with a fixed seed
rf = RandomForestRegressor(random_state=23)

# Randomized search (not an exhaustive grid search): only n_iter candidates are
# sampled from rf_param_grid. The variable keeps its historical name
# `grid_search` in case other cells refer to it.
grid_search = RandomizedSearchCV(
    estimator=rf,
    n_iter=20,
    param_distributions=rf_param_grid,
    cv=5,  # 5-fold cross-validation
    scoring=rmse_scorer,  # Negated RMSE of integer-rounded predictions
    n_jobs=-1,  # Use all available cores
    verbose=2,
    random_state=23,  # Seed the candidate sampling so re-runs evaluate the same 20 settings
)

# Run the search on the full dataset (no hold-out split is kept in this cell)
grid_search.fit(X, y)

# Best candidate refit on all of X, y, plus its hyperparameters.
# Note: this overwrites best_model / best_params from any earlier search.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(f"Best RF parameters: {best_params}")

In [36]:
# Fresh, unfitted forest configured with the hyperparameters in best_params
timed_model = RandomForestRegressor(**best_params, random_state=23)

In [39]:
# Time one fit of timed_model on the full dataset and print the elapsed seconds.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals, so the result is not affected by system clock adjustments.
t1 = time.perf_counter()
timed_model.fit(X, y)
t2 = time.perf_counter()
print(f"{t2- t1}")

197.7847821712494


In [40]:
# Show the hyperparameters currently held in best_params (set by the most
# recently executed search cell)
print(best_params)

{'n_estimators': 700, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 20, 'bootstrap': True}
