In [None]:
import sys
from pathlib import Path

import joblib
import pandas as pd
from icecream import ic
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder

sys.path.append("./../projectname")
from projectname.config import raw_data_dir
from projectname.config import clean_data_dir


In [None]:
# Read the cleaned hotel-bookings CSV from the configured clean-data directory.
# The frame is left as the last expression so the notebook renders a preview.
file_path = clean_data_dir / 'hotel_bookings_clean.csv'
bookings_df = pd.read_csv(filepath_or_buffer=file_path)
bookings_df

# Model Selection Analysis

### Columns to Discard:
 - hotel: The hotel type (City Hotel or Resort Hotel). While interesting, it might not be as predictive and can be indirectly inferred from other features.
 - arrival_date_month: The month of arrival. Less precise compared to week number and day of the month.
 - reservation_status: Indicates if the booking is canceled, checked-out, or no-show. This is directly related to the target variable and should be discarded to avoid data leakage.
 - reservation_status_date: The date when the reservation status was last updated. This can also lead to data leakage.
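 - is_repeated_guest: Indicates if the guest has stayed before. While it can be relevant, it might be less impactful compared to other features. This column was not part of the initial discard list; it is a candidate for removal if present. Note that the feature lists in the code below still include it as a categorical feature, so it is not actually discarded yet.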

In [None]:

# Drop the "Unnamed: 0" column (presumably a row index that was written out
# together with the CSV — TODO confirm against the cleaning step).
# Rebinding the name instead of using `inplace=True`, combined with
# errors="ignore", keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
bookings_df = bookings_df.drop(columns=["Unnamed: 0"], errors="ignore")

# Separate features and target variable
X = bookings_df.drop(columns=["is_canceled"])
y = bookings_df["is_canceled"]

# Identify categorical and numerical features.
# Only the columns named in these two lists are handed to the model pipelines;
# any other column still present in X is not selected by the ColumnTransformer
# built later in the notebook (its `remainder` is left at the default).
categorical_cols = ['meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type',
                    'assigned_room_type', 'deposit_type', 'customer_type', 'agent', 'company', 'is_repeated_guest']

numerical_cols = ['lead_time', 'arrival_date_year', 'arrival_date_week_number', 
                  'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 
                  'adults', 'children', 'babies', 'adr', 'days_in_waiting_list', 
                  'required_car_parking_spaces', 'total_of_special_requests']




In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numerical branch: fill gaps with the column median, then standardise
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' lets categories unseen during fit pass through at predict time.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols),
])

# Candidate classifiers to compare, keyed by display name
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# Fit every candidate on the training split and collect its classification
# report (as a nested dict) for the held-out split
results = {}

for model_name, model in models.items():
    # Same preprocessing in front of each candidate estimator
    clf = Pipeline([('preprocessor', preprocessor), ('classifier', model)])

    # Pipeline.fit returns the pipeline itself, so fitting and predicting can be chained
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    results[model_name] = report = classification_report(y_test, y_pred, output_dict=True)

results

In [None]:
# One row per model, one column per entry of its classification report
results_df = pd.DataFrame(results).T
results_df

In [None]:
# Persist the model-comparison table.
# The parent directory is created first so the write does not fail on a fresh
# checkout where ../data/results does not exist yet.
results_path = '../data/results/results_df.csv'
Path(results_path).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(results_path)

## Random Forest Training

In [None]:
# Define the numerical and categorical columns
categorical_cols = ['meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type',
                    'assigned_room_type', 'deposit_type', 'customer_type', 'agent', 'company', 'is_repeated_guest']

numerical_cols = ['lead_time', 'arrival_date_year', 'arrival_date_week_number', 
                  'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 
                  'adults', 'children', 'babies', 'adr', 'days_in_waiting_list', 
                  'required_car_parking_spaces', 'total_of_special_requests']

# Preprocessor.
# Uses the same impute-then-transform steps as the model-comparison pipeline
# (median imputation + scaling for numerical columns, most-frequent imputation
# + one-hot encoding for categorical columns). Without the imputers the saved
# pipeline would preprocess data differently from the configuration that was
# evaluated, and would treat missing values differently at prediction time.
random_forest_preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ])

# Create a pipeline with preprocessing and the model
random_forest_model_pipeline = Pipeline(steps=[
    ('preprocessor', random_forest_preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train the pipeline on the training split created earlier in the notebook.
# Left as the last expression so the fitted pipeline is rendered as cell output.
random_forest_model_pipeline.fit(X_train, y_train)

In [None]:
# Score the held-out rows with the fitted Random Forest pipeline
y_pred = random_forest_model_pipeline.predict(X_test)

# Build the classification report as a nested dict and show it as the cell output
report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
report

### Save the Random Forest Pipeline and train and test datasets

In [None]:
# Save the fitted Random Forest pipeline to disk with joblib.
# The parent directory is created first so the dump does not fail when
# ../data/models does not exist yet.
model_path = '../data/models/random_forest_model_pipeline.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(random_forest_model_pipeline, model_path)

In [None]:
# Destination files for the held-out features and their target values
test_set_filename = '../data/clean/X_test.csv'
test_target_filename = '../data/clean/y_test.csv'

# Write features first, then targets; the index is omitted from both files,
# so rows in the two CSVs line up by position only
for frame, destination in ((X_test, test_set_filename), (y_test, test_target_filename)):
    frame.to_csv(destination, index=False)