In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import warnings
warnings.filterwarnings('ignore')

# Load data
df = pd.read_csv('Airbnb_Open_Data.csv')

# Clean price and service fee columns
df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)
df['service fee'] = df['service fee'].replace('[\$,]', '', regex=True).astype(float)

# Select features and target
features = ['neighbourhood group', 'neighbourhood', 'room type', 'Construction year', 
            'minimum nights', 'number of reviews', 'reviews per month', 
            'review rate number', 'calculated host listings count', 'availability 365',
            'host_identity_verified', 'instant_bookable', 'cancellation_policy']
target = 'price'

# Drop rows with missing target
df = df.dropna(subset=[target])

# Split data
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numeric_features = ['Construction year', 'minimum nights', 'number of reviews', 
                    'reviews per month', 'review rate number', 
                    'calculated host listings count', 'availability 365']
categorical_features = ['neighbourhood group', 'neighbourhood', 'room type', 
                        'host_identity_verified', 'instant_bookable', 'cancellation_policy']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create Random Forest pipeline
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
pipeline_rf.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline_rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f'RMSE: {rmse:.2f}')
print(f'R2 Score: {r2:.2f}')
# Feature importance
feature_names = preprocessor.get_feature_names_out()
importances = pipeline_rf.named_steps['model'].feature_importances_
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(8, 5))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
plt.title('Feature Importance (Random Forest)')
plt.show()

# Save the model and preprocessor
joblib.dump(pipeline_rf, 'random_forest_model.pkl')
joblib.dump(preprocessor, 'preprocessor.pkl')

  df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)
  df['service fee'] = df['service fee'].replace('[\$,]', '', regex=True).astype(float)


TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['bool', 'str']