In [12]:
import pandas as pd
import joblib as jb

In [9]:
def save_model(model, name):
  with open(f'Saved Models/{name}.pkl', 'wb') as model_file:
      jb.dump(model, model_file)

def load_model(name):
  with open(f'Saved Models/{name}.pkl', 'rb') as model_file:
      model = jb.load(model_file)
  return model

In [3]:
# read the data
data = pd.read_csv('first inten project.csv')
data.head(5)

Unnamed: 0,Booking_ID,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,date of reservation,booking status,Unnamed: 17
0,INN00001,1,1,2,5,Meal Plan 1,0,Room_Type 1,224,Offline,0,0,0,88.0,0,10/2/2015,Not_Canceled,1.0
1,INN00002,1,0,1,3,Not Selected,0,Room_Type 1,5,Online,0,0,0,106.68,1,11/6/2018,Not_Canceled,
2,INN00003,2,1,1,3,Meal Plan 1,0,Room_Type 1,1,Online,0,0,0,50.0,0,2/28/2018,Canceled,
3,INN00004,1,0,0,2,Meal Plan 1,0,Room_Type 1,211,Online,0,0,0,100.0,1,5/20/2017,Canceled,
4,INN00005,1,0,1,2,Not Selected,0,Room_Type 1,48,Online,0,0,0,77.0,0,4/11/2018,Canceled,


In [4]:
data.drop(['Unnamed: 17', 'Booking_ID'], axis= 1, inplace= True)
data.head(5)

Unnamed: 0,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,date of reservation,booking status
0,1,1,2,5,Meal Plan 1,0,Room_Type 1,224,Offline,0,0,0,88.0,0,10/2/2015,Not_Canceled
1,1,0,1,3,Not Selected,0,Room_Type 1,5,Online,0,0,0,106.68,1,11/6/2018,Not_Canceled
2,2,1,1,3,Meal Plan 1,0,Room_Type 1,1,Online,0,0,0,50.0,0,2/28/2018,Canceled
3,1,0,0,2,Meal Plan 1,0,Room_Type 1,211,Online,0,0,0,100.0,1,5/20/2017,Canceled
4,1,0,1,2,Not Selected,0,Room_Type 1,48,Online,0,0,0,77.0,0,4/11/2018,Canceled


# Handling Lables

In [5]:
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder

In [6]:
# convert date of reservation into date type
data['date of reservation'] = pd.to_datetime(data['date of reservation'], format='mixed', errors='coerce')
data.dropna(subset=['date of reservation'], inplace=True)

# Extract day and month
data['month'] = data['date of reservation'].dt.month
data['day'] = data['date of reservation'].dt.day


data.drop('date of reservation', axis= 1, inplace= True)
data.head()

Unnamed: 0,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,booking status,month,day
0,1,1,2,5,Meal Plan 1,0,Room_Type 1,224,Offline,0,0,0,88.0,0,Not_Canceled,10,2
1,1,0,1,3,Not Selected,0,Room_Type 1,5,Online,0,0,0,106.68,1,Not_Canceled,11,6
2,2,1,1,3,Meal Plan 1,0,Room_Type 1,1,Online,0,0,0,50.0,0,Canceled,2,28
3,1,0,0,2,Meal Plan 1,0,Room_Type 1,211,Online,0,0,0,100.0,1,Canceled,5,20
4,1,0,1,2,Not Selected,0,Room_Type 1,48,Online,0,0,0,77.0,0,Canceled,4,11


In [7]:
# encode market types 
market_biencoder = ce.BinaryEncoder(cols=['market segment type'])
market_encoded = market_biencoder.fit_transform(data['market segment type'])
save_model(market_biencoder, "market_Endcoder")

# encode type of meal
meal_biencoder = ce.BinaryEncoder(cols=['type of meal'])
meal_encoded = meal_biencoder.fit_transform(data['type of meal'])
save_model(meal_biencoder, "meal_Endcoder")

# encode room type
room_biencoder = ce.BinaryEncoder(cols=['room type'])
room_encoded = room_biencoder.fit_transform(data['room type'])
save_model(room_biencoder, "room_Endcoder")


data.drop(['room type', 'type of meal', 'market segment type'], axis=1, inplace=True)
data = pd.concat([data, room_encoded, meal_encoded, market_encoded], axis=1)
data.head(2)

Unnamed: 0,number of adults,number of children,number of weekend nights,number of week nights,car parking space,lead time,repeated,P-C,P-not-C,average price,...,day,room type_0,room type_1,room type_2,type of meal_0,type of meal_1,type of meal_2,market segment type_0,market segment type_1,market segment type_2
0,1,1,2,5,0,224,0,0,0,88.0,...,2,0,0,1,0,0,1,0,0,1
1,1,0,1,3,0,5,0,0,0,106.68,...,6,0,0,1,0,1,0,0,1,0


In [8]:
# label encoding for the target
le = LabelEncoder()
data['booking status'] = le.fit_transform(data['booking status'])
save_model(le, "LabelEncoder")
le.classes_

array(['Canceled', 'Not_Canceled'], dtype=object)

# Handling Numericals

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
correlation_matrix = data.corr()
target_corr = correlation_matrix['booking status'].sort_values(ascending=False)
filtered_features = target_corr[(target_corr >= 0.07) | (target_corr <= -0.07)]
print(filtered_features.index)

Index(['booking status', 'special requests', 'repeated',
       'market segment type_2', 'car parking space', 'number of adults',
       'number of week nights', 'average price ', 'lead time'],
      dtype='object')


In [11]:
cat = ['type of meal_0', 'type of meal_1', 'type of meal_2', 'room type_0', 'room type_1', 'room type_2', 'market segment type_0',
   'market segment type_1', 'market segment type_2', 'booking status', 'repeated', 'car parking space']

# filter out categorics
numerical_col = [col for col in filtered_features.index if col not in cat]

# scale
scaler = StandardScaler()
data[numerical_col] = scaler.fit_transform(data[numerical_col])
save_model(scaler, 'StandardScaler')


# Train The Models

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [13]:
data = data[filtered_features.index]
X = data.drop('booking status', axis= 1)
y = data['booking status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Logistic Regression
# -- Train
logreg_param_grid = [
    {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2'], 'max_iter': [50, 150, 300, 500, 1000]},
]

logreg = LogisticRegression(solver= 'liblinear', random_state = 42)
logreg_grid = GridSearchCV(logreg, logreg_param_grid, cv=5, verbose=1, n_jobs=-1)
logreg_grid.fit(X_train, y_train)

print("Best parameters for Logistic Regression:", logreg_grid.best_params_)

# -- Test
y_pred_logreg = logreg_grid.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print("Logistic Regression Accuracy:", accuracy_logreg)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logreg))
print("Classification Report:\n", classification_report(y_test, y_pred_logreg))

# -- save model
save_model(logreg_grid, 'LogisticeRegression_model')

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for Logistic Regression: {'C': 100, 'max_iter': 50, 'penalty': 'l1'}
Logistic Regression Accuracy: 0.7997241379310345
Confusion Matrix:
 [[1411  958]
 [ 494 4387]]
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.60      0.66      2369
           1       0.82      0.90      0.86      4881

    accuracy                           0.80      7250
   macro avg       0.78      0.75      0.76      7250
weighted avg       0.79      0.80      0.79      7250



In [15]:
# KNN
# -- Train
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
}
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(knn, knn_param_grid, cv=5, verbose=1, n_jobs=-1)
knn_grid.fit(X_train, y_train)

print("Best parameters for KNN:", knn_grid.best_params_)

# -- Test
y_pred_knn = knn_grid.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("KNN Accuracy:", accuracy_knn)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
print("Classification Report:\n", classification_report(y_test, y_pred_knn))

# -- save model
save_model(knn_grid, 'KNN_model')

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best parameters for KNN: {'n_neighbors': 7}
KNN Accuracy: 0.8533793103448276
Confusion Matrix:
 [[1748  621]
 [ 442 4439]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.74      0.77      2369
           1       0.88      0.91      0.89      4881

    accuracy                           0.85      7250
   macro avg       0.84      0.82      0.83      7250
weighted avg       0.85      0.85      0.85      7250



In [16]:
# Decision Tree
# -- Train
dtree_param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
}
dtree = DecisionTreeClassifier(random_state = 42)
dtree_grid = GridSearchCV(dtree, dtree_param_grid, cv=5, verbose=1, n_jobs=-1)
dtree_grid.fit(X_train, y_train)

print("Best parameters for Decision Tree:", dtree_grid.best_params_)

# -- Test
y_pred_dtree = dtree_grid.predict(X_test)
accuracy_dtree = accuracy_score(y_test, y_pred_dtree)
print("Decision Tree Accuracy:", accuracy_dtree)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dtree))
print("Classification Report:\n", classification_report(y_test, y_pred_dtree))

# -- save model
save_model(dtree_grid, 'DecisionTree_model')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_split': 10}
Decision Tree Accuracy: 0.8533793103448276
Confusion Matrix:
 [[1776  593]
 [ 470 4411]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.75      0.77      2369
           1       0.88      0.90      0.89      4881

    accuracy                           0.85      7250
   macro avg       0.84      0.83      0.83      7250
weighted avg       0.85      0.85      0.85      7250



In [17]:
# Random Forest
# -- Train
rf_param_grid = {
    'n_estimators': [10, 50, 100, 150, 300],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    }
rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf, rf_param_grid, cv=5, verbose=1, n_jobs=-1)
rf_grid.fit(X_train, y_train)

print("Best parameters for Random Forest:", rf_grid.best_params_)

# -- Test
y_pred_rf = rf_grid.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


# -- save model
save_model(rf_grid, 'RandomForest_model')

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 300}
Random Forest Accuracy: 0.8823448275862069
Confusion Matrix:
 [[1859  510]
 [ 343 4538]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.78      0.81      2369
           1       0.90      0.93      0.91      4881

    accuracy                           0.88      7250
   macro avg       0.87      0.86      0.86      7250
weighted avg       0.88      0.88      0.88      7250



# Deployment

In [1]:
from flask import Flask, request, render_template

In [14]:
# Flask app setup
app = Flask(__name__)

@app.route('/')
def home():
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():

    model = load_model('RandomForest_model')

    # Read the data from the form
    form_data = {
        'number of adults': int(request.form['number of adults']),
        'number of week nights': int(request.form['number of week nights']),
        'car parking space': 1 if 'car parking space' in request.form else 0,
        'lead time': int(request.form['lead time']),
        'market segment type': request.form['market segment type'],
        'repeated': 1 if 'repeated' in request.form else 0,
        'average price ': float(request.form['average price ']),
        'special requests': int(request.form['special requests'])
    }
    df = pd.DataFrame([form_data])

    # preprocessing pipeline
    market_encoder = load_model('market_Endcoder')
    market_en = market_encoder.transform(df['market segment type'])
    df = pd.concat([df, market_en], axis=1)
    

    # data scaling
    numerical_col =['special requests', 'number of adults', 'number of week nights', 
                    'average price ', 'lead time']

    scaler = load_model('StandardScaler')
    df[numerical_col] = scaler.transform(df[numerical_col])


    # Predict
    expected_columns = ['special requests', 'repeated',
       'market segment type_2', 'car parking space', 'number of adults',
       'number of week nights', 'average price ', 'lead time']
    df_final = df[expected_columns]


    # Make prediction
    prediction = model.predict(df_final)

    print(prediction)  
    result = 'Booking will be canceled' if prediction[0] == 0 else 'Booking will not be canceled'

    return result

In [16]:
# Run the app
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.1.6:5000
Press CTRL+C to quit
127.0.0.1 - - [06/Oct/2024 16:42:22] "POST /predict HTTP/1.1" 200 -


[1]


127.0.0.1 - - [06/Oct/2024 16:42:22] "POST /predict HTTP/1.1" 200 -


[1]
