In [None]:
!pip install flask-cors

In [None]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import joblib
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
import threading


In [None]:
# Load data
data = pd.read_csv('/content/first inten project.csv', encoding='latin1')

# Check for null values and data types
print("Null values in each column:\n", data.isnull().sum())
print("\nData types of each column:\n", data.dtypes)

# Strip whitespace from column names and data
data.columns = data.columns.str.strip()
data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)

  data = pd.read_csv('/content/first inten project.csv', encoding='latin1')
  data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Null values in each column:
 Booking_ID                    0
number of adults            129
number of children          129
number of weekend nights    256
number of week nights       256
type of meal                256
car parking space           256
room type                   256
lead time                   256
market segment type         256
repeated                    256
P-C                         257
P-not-C                     257
average price               257
special requests            257
date of reservation         257
booking status              257
dtype: int64

Data types of each column:
 Booking_ID                   object
number of adults             object
number of children           object
number of weekend nights     object
number of week nights       float64
type of meal                 object
car parking space           float64
room type                    object
lead time                   float64
market segment type          object
repeated                 

In [None]:
# Handle outliers using IQR
Q1 = data['average price'].quantile(0.25)
Q3 = data['average price'].quantile(0.75)
IQR = Q3 - Q1
outlier_condition = (data['average price'] < (Q1 - 1.5 * IQR)) | (data['average price'] > (Q3 + 1.5 * IQR))
data = data[~outlier_condition]

data['date of reservation'] = pd.to_datetime(data['date of reservation'], errors='coerce')

# Feature engineering: Total Guests, Total Nights, Special Request Count
data['Total Guests'] = data['number of adults'] + data['number of children']
data['Total Nights'] = pd.to_numeric(data['number of weekend nights'], errors='coerce') + pd.to_numeric(data['number of week nights'], errors='coerce')
data['Special Request Count'] = data['special requests'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0)

# Encode Meal Type
meal_type_mapping = {meal: idx for idx, meal in enumerate(data['type of meal'].unique())}
data['Ordered Meal Type'] = data['type of meal'].map(meal_type_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['date of reservation'] = pd.to_datetime(data['date of reservation'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Total Guests'] = data['number of adults'] + data['number of children']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Total Nights'] = pd.to_numeric(

In [None]:
# Select features and target
features = data.drop(columns=['Booking_ID', 'booking status', 'type of meal', 'room type', 'special requests'])
target = data['booking status']

# Convert 'date of reservation' to datetime and handle errors
features['date of reservation'] = pd.to_datetime(features['date of reservation'], errors='coerce')

# Extract year, month, and day of the week from date of reservation
features['reservation_year'] = features['date of reservation'].dt.year
features['reservation_month'] = features['date of reservation'].dt.month
features['reservation_day'] = features['date of reservation'].dt.dayofweek

# Drop the original reservation date column
features = features.drop(columns=['date of reservation'])

# Update cat_features to include the new date-related features
cat_features = ['reservation_year', 'reservation_month', 'reservation_day',
                'market segment type', 'repeated guest', 'reserved room type']


# Define numeric and categorical features for the ColumnTransformer
num_features = ['lead time', 'average price', 'number of adults', 'number of children',
                'number of weekend nights', 'number of week nights', 'car parking space',
                'Total Guests', 'Total Nights', 'Special Request Count', 'Ordered Meal Type']

# Ensure all categorical features are strings
for col in cat_features:
    if col in features.columns:
        features[col] = features[col].astype(str)

# Ensure all numeric features are numeric and handle errors
for feature in num_features:
    if feature in features.columns:
        features[feature] = pd.to_numeric(features[feature], errors='coerce')

# Impute missing values in numerical features with the mean
for feature in num_features:
    if feature in features.columns:
        features[feature] = features[feature].fillna(features[feature].mean())

# Impute missing values in categorical features with the most frequent value
for feature in cat_features:
    if feature in features.columns:
        features[feature] = features[feature].fillna(features[feature].mode()[0])

# Check and handle NaN values in the target variable
# This line was added to address the potential NaN values in the target
target = target.fillna(target.mode()[0])

# Print the columns of the features DataFrame to check for inconsistencies
print(features.columns)

# Create preprocessing pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing numerical values with the mean
    ('scaler', StandardScaler())
])

# Ensure all features are present in the DataFrame before processing
num_features = [col for col in num_features if col in features.columns]
cat_features = [col for col in cat_features if col in features.columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),  # Use the pipeline for numerical features
        ('cat', OneHotEncoder(), cat_features)
    ])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42, stratify=target) # Stratify based on the target variable instead of 'reservation_year'

# Convert target variables to strings after train-test split
y_train = y_train.astype(str)
y_test = y_test.astype(str)

# Impute missing values and preprocess training and testing data using the pipeline
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

Index(['number of adults', 'number of children', 'number of weekend nights',
       'number of week nights', 'car parking space', 'lead time',
       'market segment type', 'repeated', 'P-C', 'P-not-C', 'average price',
       'Total Guests', 'Total Nights', 'Special Request Count',
       'Ordered Meal Type', 'reservation_year', 'reservation_month',
       'reservation_day'],
      dtype='object')


In [None]:
# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5,10]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_transformed, y_train)

best_rf = grid_search.best_estimator_
print(f"Best parameters for Random Forest: {grid_search.best_params_}")

# Evaluate best model
y_pred_best_rf = best_rf.predict(X_test_transformed)
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
print(f"Best Random Forest Accuracy: {accuracy_best_rf:.4f}")


Best parameters for Random Forest: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Best Random Forest Accuracy: 0.8404


In [None]:
import pickle

# Save the trained model
with open('best_rf_model.pkl', 'wb') as f:
    pickle.dump(best_rf, f)

# To load the model later:
# with open('best_rf_model.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)


In [None]:
app = Flask(__name__)
CORS(app)

# Load the pre-trained model
model = joblib.load('best_rf_model.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    try:
        input_data = request.json

        # Log received data for debugging
        print("Received input data:", input_data)

        # Updated required features based on model training
        required_features = [
            'lead time', 'average price', 'number of adults', 'number of children',
            'number of weekend nights', 'number of week nights', 'car parking space',
            'Total Guests', 'Total Nights', 'Special Request Count',
            'Ordered Meal Type', 'reservation_year', 'reservation_month', 'reservation_day_of_week',
            'market segment type', 'repeated guest', 'reserved room type', 'customer type'
        ]

        # Extract features from the request and create an input array for the model
        feature_array = np.array([input_data.get(feature) for feature in required_features]).reshape(1, -1)

        # Check if any feature is None and log it
        if any(value is None for value in feature_array.flatten()):
            return jsonify({'error': 'One or more required features are missing or invalid'}), 400

        # Make a prediction using the loaded model
        prediction = model.predict(feature_array)[0]

        # Return the prediction result
        return jsonify({'status': prediction})

    except KeyError as e:
        # Catch missing keys in the input data
        return jsonify({'error': f'Missing key: {str(e)}'}), 400

    except Exception as e:
        # Log any other exceptions that occur
        return jsonify({'error': str(e)}), 500


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5006
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Ngrok tunnel URL: NgrokTunnel: "https://3f47-34-168-104-136.ngrok-free.app" -> "http://localhost:5006"


INFO:werkzeug:127.0.0.1 - - [04/Oct/2024 16:30:45] "[35m[1mPOST /predict_booking HTTP/1.1[0m" 500 -


Received input data: {'lead time': 10, 'average price': 200.5, 'number of adults': 2, 'number of children': 1, 'number of weekend nights': 1, 'number of week nights': 2, 'car parking space': 1, 'Total Guests': 3, 'Total Nights': 3, 'Special Request Count': 2, 'Ordered Meal Type': 1, 'reservation_year': 2024, 'reservation_month': 10, 'reservation_day': 15, 'market segment type': 0, 'repeated guest': 1, 'reserved room type': 2}
Input data keys: dict_keys(['lead time', 'average price', 'number of adults', 'number of children', 'number of weekend nights', 'number of week nights', 'car parking space', 'Total Guests', 'Total Nights', 'Special Request Count', 'Ordered Meal Type', 'reservation_year', 'reservation_month', 'reservation_day', 'market segment type', 'repeated guest', 'reserved room type'])
Transformed feature array: [[1.000e+01 2.005e+02 2.000e+00 1.000e+00 1.000e+00 2.000e+00 1.000e+00
  3.000e+00 3.000e+00 2.000e+00 1.000e+00 2.024e+03 1.000e+01 1.500e+01
  0.000e+00 1.000e+00 2

In [None]:
def start_ngrok():
    public_url = ngrok.connect(5006)  # ngrok will listen on port 5000
    print(" * Ngrok tunnel URL:", public_url)

if __name__ == '__main__':
    threading.Thread(target=start_ngrok).start()
    app.run(port=5006)  # Make sure app.run uses the same port as ngrok


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5012
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Ngrok tunnel URL: NgrokTunnel: "https://8ffa-35-231-73-128.ngrok-free.app" -> "http://localhost:5012"


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

# Load dataset, specifying the encoding
data = pd.read_csv('/content/first inten project.csv', encoding='latin1')

# Check for null values and data types
print("Null values in each column:\n", data.isnull().sum())
print("\nData types of each column:\n", data.dtypes)

# Strip whitespace from column names and data
data.columns = data.columns.str.strip()
data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Display first few rows
print("\nFirst few rows of the dataset:\n", data.head())

  data = pd.read_csv('/content/first inten project.csv', encoding='latin1')
  data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Null values in each column:
 Booking_ID                    0
number of adults            129
number of children          129
number of weekend nights    256
number of week nights       256
type of meal                256
car parking space           256
room type                   256
lead time                   256
market segment type         256
repeated                    256
P-C                         257
P-not-C                     257
average price               257
special requests            257
date of reservation         257
booking status              257
dtype: int64

Data types of each column:
 Booking_ID                   object
number of adults             object
number of children           object
number of weekend nights     object
number of week nights       float64
type of meal                 object
car parking space           float64
room type                    object
lead time                   float64
market segment type          object
repeated                 

In [None]:
# Handle outliers using IQR
Q1 = data['average price'].quantile(0.25)
Q3 = data['average price'].quantile(0.75)
IQR = Q3 - Q1
outlier_condition = (data['average price'] < (Q1 - 1.5 * IQR)) | (data['average price'] > (Q3 + 1.5 * IQR))
data = data[~outlier_condition]

# Feature engineering: Total Guests, Total Nights, Special Request Count
data['Total Guests'] = data['number of adults'] + data['number of children']
# Convert columns to numeric before adding
data['Total Nights'] = pd.to_numeric(data['number of weekend nights'], errors='coerce') + pd.to_numeric(data['number of week nights'], errors='coerce')
data['Special Request Count'] = data['special requests'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0)

# Encode Meal Type
meal_type_mapping = {meal: idx for idx, meal in enumerate(data['type of meal'].unique())}
data['Ordered Meal Type '] = data['type of meal'].map(meal_type_mapping)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Total Guests'] = data['number of adults'] + data['number of children']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Total Nights'] = pd.to_numeric(data['number of weekend nights'], errors='coerce') + pd.to_numeric(data['number of week nights'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

In [None]:
from sklearn.pipeline import Pipeline
# Select features and target
features = data.drop(columns=['Booking_ID', 'booking status', 'type of meal', 'room type', 'special requests'])
target = data['booking status']

# Define numeric and categorical features for the ColumnTransformer
num_features = ['lead time', 'average price', 'number of adults', 'number of children',
                'number of weekend nights', 'number of week nights', 'car parking space',
                'total of special requests', 'Total Guests', 'Total Nights', 'Special Request Count',
                'Ordered Meal Type']
cat_features = ['arrival date month', 'market segment type', 'repeated guest', 'is repeated guest',
                'reserved room type', 'assigned room type', 'deposit type', 'customer type', 'hotel']

# Ensure all num_features are numeric and handle errors
for feature in num_features:
    if feature in features.columns:
        # Convert the column to numeric, replacing errors with NaN
        features[feature] = pd.to_numeric(features[feature], errors='coerce')

        # Check if the column still contains non-numeric values after conversion
        if features[feature].dtype == object:
            print(f"Warning: '{feature}' still contains non-numeric values after conversion. Further investigation needed.")

            # Print unique non-numeric values for debugging
            # print(f"Unique non-numeric values in '{feature}': {features[feature][features[feature].apply(lambda x: isinstance(x, str))].unique()}")

num_features = [col for col in num_features if col in features.columns]
cat_features = [col for col in cat_features if col in features.columns]

# Create ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(), cat_features)
    ])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
y_train = y_train.astype(str)
y_test = y_test.astype(str)

# Impute NaN values using SimpleImputer before fitting/transforming
from sklearn.impute import SimpleImputer # Import SimpleImputer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')), # Impute missing numerical values with the mean
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features), # Use the pipeline for numerical features
        ('cat', OneHotEncoder(), cat_features)
    ])

# Preprocess training and testing data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100,150, 200],
    'max_depth': [10,15, 20, None],
    'min_samples_split': [2, 5,8, 10]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_transformed, y_train)

best_rf = grid_search.best_estimator_
print(f"Best parameters for Random Forest: {grid_search.best_params_}")

# Evaluate best model
y_pred_best_rf = best_rf.predict(X_test_transformed)
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
print(f"Best Random Forest Accuracy: {accuracy_best_rf:.4f}")




Best parameters for Random Forest: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
Best Random Forest Accuracy: 0.8297


In [None]:
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Use StratifiedKFold to maintain class balance during cross-validation
cv = StratifiedKFold(n_splits=8)

# Initialize RFECV to select the optimal number of features, using accuracy as the scoring metric
rfecv = RFECV(estimator=rf_model, step=1, cv=cv, scoring='accuracy')

# Fit RFECV to the training data
rfecv.fit(X_train_transformed, y_train)

# Transform both train and test data based on selected features
X_train_selected = rfecv.transform(X_train_transformed)
X_test_selected = rfecv.transform(X_test_transformed)

# Retrain the model using selected features
rf_model.fit(X_train_selected, y_train)

# Make predictions and evaluate accuracy
y_pred_selected = rf_model.predict(X_test_selected)
accuracy_selected = accuracy_score(y_test, y_pred_selected)

# Print the accuracy after RFECV feature selection
print(f"Accuracy after RFECV feature selection: {accuracy_selected:.4f}")

# Print the number of selected features and their indices
print(f"Optimal number of features: {rfecv.n_features_}")
print(f"Selected feature indices: {rfecv.support_}")

# Print all features and indicate which ones were selected
feature_names = list(range(X_train_transformed.shape[1]))
selected_features = [feature_names[i] for i, is_selected in enumerate(rfecv.support_) if is_selected]
discarded_features = [feature_names[i] for i, is_selected in enumerate(rfecv.support_) if not is_selected]

print("Selected features:")
print(selected_features)

print("\nDiscarded features:")
print(discarded_features)



Accuracy after RFECV feature selection: 0.8223
Optimal number of features: 9
Selected feature indices: [ True  True  True False  True  True  True  True False False False False
 False False  True  True]
Selected features:
[0, 1, 2, 4, 5, 6, 7, 14, 15]

Discarded features:
[3, 8, 9, 10, 11, 12, 13]


In [None]:
!pip install Flask pyngrok joblib

!ngrok config add-authtoken 2mlBokJkBJ3PSki24C2eu7VNVOl_2udn662gXcUGz7v6DS6TC

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
!pip install --upgrade flask-ngrok



In [None]:
!pip install flask-cors



In [None]:
from flask_cors import CORS
from flask import Flask, request, jsonify
from pyngrok import ngrok
import numpy as np
import joblib
import threading

app = Flask(__name__)
CORS(app)

# Load the pre-trained model
model = joblib.load('/content/best_rf_model.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    try:
        input_data = request.json

        # Log received data for debugging
        print("Received input data:", input_data)
        print("Input data keys:", input_data.keys())  # Debugging statement

        # Required features based on model training
        required_features = [
            'lead time', 'average price', 'number of adults', 'number of children',
            'number of weekend nights', 'number of week nights', 'required car parking spaces',
            'total of special requests', 'Total Guests', 'Total Nights', 'Special Request Count',
            'Ordered Meal Type', 'arrival date month',
            'market segment type', 'repeated guest', 'reserved room type', 'customer type'
        ]

        # Extract features from the request and create an input array for the model
        feature_array = np.array([input_data.get(feature) for feature in required_features]).reshape(1, -1)

        # Check if any feature is None and log it
        if any(value is None for value in feature_array.flatten()):
            return jsonify({'error': 'One or more required features are missing or invalid'}), 400

        # Log transformed features to ensure correct input
        print("Transformed feature array:", feature_array)

        # Make a prediction using the loaded model
        prediction = model.predict(feature_array)[0]

        # Log the prediction result
        print("Prediction result:", prediction)

        # Return the prediction result
        return jsonify({'status': prediction})

    except KeyError as e:
        # Catch missing keys in the input data
        print(f"Missing key error: {str(e)}")
        return jsonify({'error': f'Missing key: {str(e)}'}), 400

    except Exception as e:
        # Log any other exceptions that occur
        print(f"Exception occurred: {str(e)}")
        return jsonify({'error': str(e)}), 500

def start_ngrok():
    public_url = ngrok.connect(5006)  # ngrok will listen on port 5000
    print(" * Ngrok tunnel URL:", public_url)

if __name__ == '__main__':
    threading.Thread(target=start_ngrok).start()
    # Use a different port or kill the process using that port
    app.run(port=5006)  # Make sure app.run uses the same port as ngrok


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5006
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Ngrok tunnel URL: NgrokTunnel: "https://3fbf-34-133-169-175.ngrok-free.app" -> "http://localhost:5006"


INFO:werkzeug:127.0.0.1 - - [03/Oct/2024 00:12:26] "[33mGET / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [03/Oct/2024 00:12:26] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [03/Oct/2024 00:14:19] "OPTIONS /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [03/Oct/2024 00:14:19] "[31m[1mPOST /predict HTTP/1.1[0m" 400 -


Received input data: {'lead time': '10', 'average price': '150', 'number of adults': '2', 'number of children': '1', 'number of weekend nights': '2', 'number of week nights': '3', 'required car parking spaces': '0', 'total of special requests': '1', 'Total Guests': 3, 'Total Nights': 5, 'Special Request Count': '1', 'arrival date month': 8, 'market segment type': 'Online', 'repeated guest': 'Yes', 'reserved room type': 'Type 1', 'customer type': 'Hotel 1'}
Input data keys: dict_keys(['lead time', 'average price', 'number of adults', 'number of children', 'number of weekend nights', 'number of week nights', 'required car parking spaces', 'total of special requests', 'Total Guests', 'Total Nights', 'Special Request Count', 'arrival date month', 'market segment type', 'repeated guest', 'reserved room type', 'customer type'])


INFO:werkzeug:127.0.0.1 - - [03/Oct/2024 00:14:35] "OPTIONS /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [03/Oct/2024 00:14:35] "[31m[1mPOST /predict HTTP/1.1[0m" 400 -


Received input data: {'lead time': '10', 'average price': '150', 'number of adults': '2', 'number of children': '1', 'number of weekend nights': '2', 'number of week nights': '3', 'required car parking spaces': '0', 'total of special requests': '1', 'Total Guests': 3, 'Total Nights': 5, 'Special Request Count': '1', 'arrival date month': 8, 'market segment type': 'Online', 'repeated guest': 'Yes', 'reserved room type': 'Type 1', 'customer type': 'Hotel 1'}
Input data keys: dict_keys(['lead time', 'average price', 'number of adults', 'number of children', 'number of weekend nights', 'number of week nights', 'required car parking spaces', 'total of special requests', 'Total Guests', 'Total Nights', 'Special Request Count', 'arrival date month', 'market segment type', 'repeated guest', 'reserved room type', 'customer type'])


INFO:werkzeug:127.0.0.1 - - [03/Oct/2024 00:35:18] "OPTIONS /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [03/Oct/2024 00:35:18] "[35m[1mPOST /predict HTTP/1.1[0m" 500 -


Received input data: {'lead time': '10', 'average price': '150', 'number of adults': '2', 'number of children': '2', 'number of weekend nights': '1', 'number of week nights': '1', 'required car parking spaces': '1', 'total of special requests': '1', 'Total Guests': 4, 'Total Nights': 2, 'Special Request Count': '1', 'Ordered Meal Type': 'Meal Plan 1', 'arrival date month': '2018-02-05', 'market segment type': 'Online', 'repeated guest': 'Yes', 'reserved room type': 'Type 1', 'customer type': 'Hotel 1'}
Input data keys: dict_keys(['lead time', 'average price', 'number of adults', 'number of children', 'number of weekend nights', 'number of week nights', 'required car parking spaces', 'total of special requests', 'Total Guests', 'Total Nights', 'Special Request Count', 'Ordered Meal Type', 'arrival date month', 'market segment type', 'repeated guest', 'reserved room type', 'customer type'])
Transformed feature array: [['10' '150' '2' '2' '1' '1' '1' '1' '4' '2' '1' 'Meal Plan 1'
  '2018-

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
import pickle
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
import numpy as np
import joblib
import threading

# Load dataset and clean up data
data = pd.read_csv('/content/first inten project.csv', encoding='latin1')

# Strip whitespace from column names and data
data.columns = data.columns.str.strip()
data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Handle outliers using IQR
Q1 = data['average price'].quantile(0.25)
Q3 = data['average price'].quantile(0.75)
IQR = Q3 - Q1
outlier_condition = (data['average price'] < (Q1 - 1.5 * IQR)) | (data['average price'] > (Q3 + 1.5 * IQR))
data = data[~outlier_condition]

# Feature engineering: Total Guests, Total Nights, Special Request Count
data['Total Guests'] = data['number of adults'] + data['number of children']
data['Total Nights'] = pd.to_numeric(data['number of weekend nights'], errors='coerce').fillna(0) + \
                       pd.to_numeric(data['number of week nights'], errors='coerce').fillna(0)
data['Special Request Count'] = data['special requests'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0)

# Encode Meal Type
meal_type_mapping = {meal: idx for idx, meal in enumerate(data['type of meal'].unique())}
data['Ordered Meal Type'] = data['type of meal'].map(meal_type_mapping)

# Display first few rows
print("\nFirst few rows of the dataset:\n", data.head())

# Select features and target
features = data.drop(columns=['Booking_ID', 'booking status', 'type of meal', 'room type', 'special requests'])
target = data['booking status']

# Define numeric and categorical features for the ColumnTransformer
num_features = [
    'lead time',
    'average price',
    'number of adults',
    'number of children',
    'number of weekend nights',
    'number of week nights',
    'car parking space',
    'Total Guests',
    'Total Nights',
    'Special Request Count',
    'Ordered Meal Type'
]
cat_features = [
    'arrival date month',
    'market segment type',
    'repeated',
    'reserved room type',
    'customer type'
]

# Ensure all num_features are numeric and handle errors
for feature in num_features:
    if feature in features.columns:
        features[feature] = pd.to_numeric(features[feature], errors='coerce')
        if features[feature].dtype == object:
            print(f"Warning: '{feature}' still contains non-numeric values after conversion.")

num_features = [col for col in num_features if col in features.columns]
cat_features = [col for col in cat_features if col in features.columns]

# Create ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(), cat_features)
    ]
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
y_train = y_train.astype(str)
y_test = y_test.astype(str)

# Impute NaN values using SimpleImputer
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features) # handle_unknown='ignore' added to OneHotEncoder
    ]
)

# Preprocess training and testing data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [10, 15, 20, None],
    'min_samples_split': [2, 5, 8, 10]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_transformed, y_train)

best_rf = grid_search.best_estimator_
print(f"Best parameters for Random Forest: {grid_search.best_params_}")

# Evaluate best model
y_pred_best_rf = best_rf.predict(X_test_transformed)
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
print(f"Best Random Forest Accuracy: {accuracy_best_rf:.4f}")

# Feature selection with RFECV
rf_model = RandomForestClassifier(random_state=42)
cv = StratifiedKFold(n_splits=8)
rfecv = RFECV(estimator=rf_model, step=1, cv=cv, scoring='accuracy')
rfecv.fit(X_train_transformed, y_train)

# Transform both train and test data based on selected features
X_train_selected = rfecv.transform(X_train_transformed)
X_test_selected = rfecv.transform(X_test_transformed)

# Retrain the model using selected features
rf_model.fit(X_train_selected, y_train)

# Make predictions and evaluate accuracy
y_pred_selected = rf_model.predict(X_test_selected)
accuracy_selected = accuracy_score(y_test, y_pred_selected)

# Print the accuracy after RFECV feature selection
print(f"Accuracy after RFECV feature selection: {accuracy_selected:.4f}")

# Print the number of selected features and their indices
print(f"Optimal number of features: {rfecv.n_features_}")
print(f"Selected feature indices: {rfecv.support_}")

# Print all features and indicate which ones were selected
feature_names = list(range(X_train_transformed.shape[1]))
selected_features = [feature_names[i] for i, is_selected in enumerate(rfecv.support_) if is_selected]
discarded_features = [feature_names[i] for i, is_selected in enumerate(rfecv.support_) if not is_selected]

print("Selected features:")
print(selected_features)

print("\nDiscarded features:")
print(discarded_features)

# Save the trained model
with open('best_rf_model.pkl', 'wb') as f:
    pickle.dump(best_rf, f)

# Flask application setup
app = Flask(__name__)
CORS(app)

# Load the pre-trained model
model = joblib.load('best_rf_model.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    try:
        input_data = request.json

        # Log received data for debugging
        print("Received input data:", input_data)

        # Required features based on model training
        required_features = [
            'lead time',
            'average price',
            'number of adults',
            'number of children',
            'number of weekend nights',
            'number of week nights',
            'car parking space',
            'Total Guests',
            'Total Nights',
            'Special Request Count',
            'Ordered Meal Type',
            'arrival date month',
            'market segment type',
            'repeated guest',
            'reserved room type',
            'customer type'
        ]

        # Extract features from the request and create an input array for the model
        feature_array = np.array([input_data.get(feature) for feature in required_features]).reshape(1, -1)

        # Check if any feature is None and log it
        if any(value is None for value in feature_array.flatten()):
            return jsonify({'error': 'One or more required features are missing or invalid'}), 400

        # Log transformed features to ensure correct input
        print("Transformed feature array:", feature_array)

        # Make a prediction using the loaded model
        prediction = model.predict(feature_array)[0]

        # Log the prediction result
        print("Prediction result:", prediction)

        # Return the prediction result
        return jsonify({'status': prediction})

    except KeyError as e:
        print(f"Missing key error: {str(e)}")
        return jsonify({'error': f'Missing key: {str(e)}'}), 400

    except Exception as e:
        print(f"Exception occurred: {str(e)}")
        return jsonify({'error': str(e)}), 500

def start_ngrok():
    public_url = ngrok.connect(5006)
    print(" * Ngrok tunnel URL:", public_url)

if __name__ == '__main__':
    threading.Thread(target=start_ngrok).start()
    app.run(port=5006)


  data = pd.read_csv('/content/first inten project.csv', encoding='latin1')
  data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Total Guests'] = data['number of adults'] + data['number of children']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Total Nights'] = pd.to_numeric(data['number of weekend nights'], errors='coerce').fillna(0) + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in


First few rows of the dataset:
   Booking_ID number of adults number of children number of weekend nights  \
0   INN00001                1                  1                        2   
1   INN00002                1                  0                        1   
2   INN00003                2                  1                        1   
3   INN00004                1                  0                        0   
4   INN00005                1                  0                        1   

   number of week nights  type of meal  car parking space    room type  \
0                    5.0   Meal Plan 1                0.0  Room_Type 1   
1                    3.0  Not Selected                0.0  Room_Type 1   
2                    3.0   Meal Plan 1                0.0  Room_Type 1   
3                    2.0   Meal Plan 1                0.0  Room_Type 1   
4                    2.0  Not Selected                0.0  Room_Type 1   

   lead time market segment type  ...  P-C  P-not-C  averag

TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['float', 'int', 'str']