# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
import pickle

# Reading Data set

In [2]:
df = pd.read_csv("./first_project.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36285 entries, 0 to 36284
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Booking_ID                36285 non-null  object 
 1   number of adults          36285 non-null  int64  
 2   number of children        36285 non-null  int64  
 3   number of weekend nights  36285 non-null  int64  
 4   number of week nights     36285 non-null  int64  
 5   type of meal              36285 non-null  object 
 6   car parking space         36285 non-null  int64  
 7   room type                 36285 non-null  object 
 8   lead time                 36285 non-null  int64  
 9   market segment type       36285 non-null  object 
 10  repeated                  36285 non-null  int64  
 11  P-C                       36285 non-null  int64  
 12  P-not-C                   36285 non-null  int64  
 13  average price             36285 non-null  float64
 14  specia

In [4]:
df.head()

Unnamed: 0,Booking_ID,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,date of reservation,booking status
0,INN00001,1,1,2,5,Meal Plan 1,0,Room_Type 1,224,Offline,0,0,0,88.0,0,10/2/2015,Not_Canceled
1,INN00002,1,0,1,3,Not Selected,0,Room_Type 1,5,Online,0,0,0,106.68,1,11/6/2018,Not_Canceled
2,INN00003,2,1,1,3,Meal Plan 1,0,Room_Type 1,1,Online,0,0,0,50.0,0,2/28/2018,Canceled
3,INN00004,1,0,0,2,Meal Plan 1,0,Room_Type 1,211,Online,0,0,0,100.0,1,5/20/2017,Canceled
4,INN00005,1,0,1,2,Not Selected,0,Room_Type 1,48,Online,0,0,0,77.0,0,4/11/2018,Canceled


# Data pre-processing

## Checking if there is duplicate data

In [5]:
# checking duplicates, turns out there's none
print(df.duplicated().sum())

0


## Checking if there's Null data

In [6]:
# checking if there are null values in any columns, turns out there's no null values
print(df.isnull().sum())

Booking_ID                  0
number of adults            0
number of children          0
number of weekend nights    0
number of week nights       0
type of meal                0
car parking space           0
room type                   0
lead time                   0
market segment type         0
repeated                    0
P-C                         0
P-not-C                     0
average price               0
special requests            0
date of reservation         0
booking status              0
dtype: int64


# Handling Outliers

## Seeing how many rows contain outliers, to see if it's possible to delete them

In [7]:
def count_outliers_iqr(df, columns):
    outliers = pd.DataFrame(index=df.index)
    
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # Add a boolean column for each numerical column indicating if it's an outlier
        outliers[col] = (df[col] < lower_bound) | (df[col] > upper_bound)
    
    # Find rows that contain outliers in any of the columns
    total_outliers = outliers.any(axis=1).sum()
    return total_outliers, outliers

# Specify your numerical columns
numerical_cols = ['number of adults', 'number of children', 'number of weekend nights', 
                  'number of week nights', 'lead time', 'car parking space', 
                  'average price ', 'special requests']

# Get the count of rows with outliers
outlier_count, outlier_flags = count_outliers_iqr(df, numerical_cols)
print(f'Total rows with outliers: {outlier_count}')

Total rows with outliers: 15327


## Applying Capping on all outliers in each column

In [8]:
def cap_outliers_iqr(df, columns):
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Cap the outliers
        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
    return df

# Apply capping on numerical columns
numerical_cols = ['number of adults', 'number of children', 'number of weekend nights', 'number of week nights',
                  'lead time', 'car parking space', 'average price ', 'special requests']
df = cap_outliers_iqr(df, numerical_cols)
outlier_count, outlier_flags = count_outliers_iqr(df, numerical_cols)
print(f'Total rows with outliers after capping: {outlier_count}')


Total rows with outliers after capping: 0


## Convert the date of reservation column into new features like year, month, or day

### Checking if there are any incorrect dates, such as considering February 29 in a non-leap year or something similar

In [9]:
df['date of reservation'] = pd.to_datetime(df['date of reservation'], errors='coerce')

In [10]:
invalid_dates = df[df['date of reservation'].isna()]
print(invalid_dates.count())
print("----------------------")
print(df.count())

Booking_ID                  37
number of adults            37
number of children          37
number of weekend nights    37
number of week nights       37
type of meal                37
car parking space           37
room type                   37
lead time                   37
market segment type         37
repeated                    37
P-C                         37
P-not-C                     37
average price               37
special requests            37
date of reservation          0
booking status              37
dtype: int64
----------------------
Booking_ID                  36285
number of adults            36285
number of children          36285
number of weekend nights    36285
number of week nights       36285
type of meal                36285
car parking space           36285
room type                   36285
lead time                   36285
market segment type         36285
repeated                    36285
P-C                         36285
P-not-C                     3

### Handling invalid dates

Since there are only 37 row with invalid date, and more than 36000 row, it makes since to drop these rows, better than making any date assumptions

In [11]:
df = df.dropna(subset=['date of reservation'])

In [12]:
invalid_dates = df[df['date of reservation'].isna()]
print(invalid_dates.count())
print("----------------------------------")
print(df.count())

Booking_ID                  0
number of adults            0
number of children          0
number of weekend nights    0
number of week nights       0
type of meal                0
car parking space           0
room type                   0
lead time                   0
market segment type         0
repeated                    0
P-C                         0
P-not-C                     0
average price               0
special requests            0
date of reservation         0
booking status              0
dtype: int64
----------------------------------
Booking_ID                  36248
number of adults            36248
number of children          36248
number of weekend nights    36248
number of week nights       36248
type of meal                36248
car parking space           36248
room type                   36248
lead time                   36248
market segment type         36248
repeated                    36248
P-C                         36248
P-not-C                     36248


In [13]:
# droping booking_id column
df = df.drop(['Booking_ID'], axis = 1) 

In [14]:
# Convert the date of reservation column into new features like year, month, or day of week to extract time-based trends.
df['reservation_year'] = pd.to_datetime(df['date of reservation']).dt.year
df['reservation_month'] = pd.to_datetime(df['date of reservation']).dt.month
df['reservation_day'] = pd.to_datetime(df['date of reservation']).dt.day
df = df.drop(['date of reservation'], axis = 1)
df.head()

Unnamed: 0,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,booking status,reservation_year,reservation_month,reservation_day
0,2.0,0.0,2.0,5.0,Meal Plan 1,0.0,Room_Type 1,224.0,Offline,0,0,0,88.0,0.0,Not_Canceled,2015,10,2
1,2.0,0.0,1.0,3.0,Not Selected,0.0,Room_Type 1,5.0,Online,0,0,0,106.68,1.0,Not_Canceled,2018,11,6
2,2.0,0.0,1.0,3.0,Meal Plan 1,0.0,Room_Type 1,1.0,Online,0,0,0,50.0,0.0,Canceled,2018,2,28
3,2.0,0.0,0.0,2.0,Meal Plan 1,0.0,Room_Type 1,211.0,Online,0,0,0,100.0,1.0,Canceled,2017,5,20
4,2.0,0.0,1.0,2.0,Not Selected,0.0,Room_Type 1,48.0,Online,0,0,0,77.0,0.0,Canceled,2018,4,11


# Applying One Hot encoder on suitable Attributes

In [15]:
one_hot_encoder = OneHotEncoder(sparse_output=False)

def apply_one_hot_encoder(df, field_name):
    field_encoded = one_hot_encoder.fit_transform(df[[field_name]])
    df = pd.concat([df,pd.DataFrame(field_encoded, columns=one_hot_encoder.get_feature_names_out([field_name]), index=df.index)], axis = 1)
    df = df.drop([field_name], axis = 1)
    return df

## Applying one hot encoder on type of meal, room type, market segment type

In [16]:
print(f"type of meal classes: {df['type of meal'].unique()}")
print(f"room type classes: {df['room type'].unique()}")
print(f"market segment type classes: {df['market segment type'].unique()}")

type of meal classes: ['Meal Plan 1' 'Not Selected' 'Meal Plan 2' 'Meal Plan 3']
room type classes: ['Room_Type 1' 'Room_Type 4' 'Room_Type 2' 'Room_Type 6' 'Room_Type 5'
 'Room_Type 7' 'Room_Type 3']
market segment type classes: ['Offline' 'Online' 'Corporate' 'Aviation' 'Complementary']


In [17]:
df = apply_one_hot_encoder(df, 'type of meal')
df = apply_one_hot_encoder(df, 'room type')
df = apply_one_hot_encoder(df, 'market segment type')

In [18]:
df

Unnamed: 0,number of adults,number of children,number of weekend nights,number of week nights,car parking space,lead time,repeated,P-C,P-not-C,average price,...,room type_Room_Type 3,room type_Room_Type 4,room type_Room_Type 5,room type_Room_Type 6,room type_Room_Type 7,market segment type_Aviation,market segment type_Complementary,market segment type_Corporate,market segment type_Offline,market segment type_Online
0,2.0,0.0,2.0,5.0,0.0,224.0,0,0,0,88.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2.0,0.0,1.0,3.0,0.0,5.0,0,0,0,106.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2.0,0.0,1.0,3.0,0.0,1.0,0,0,0,50.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2.0,0.0,0.0,2.0,0.0,211.0,0,0,0,100.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2.0,0.0,1.0,2.0,0.0,48.0,0,0,0,77.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36280,2.0,0.0,0.0,2.0,0.0,289.5,0,0,0,115.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
36281,2.0,0.0,1.0,3.0,0.0,34.0,0,0,0,107.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
36282,2.0,0.0,1.0,3.0,0.0,83.0,0,0,0,105.61,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
36283,2.0,0.0,0.0,4.0,0.0,121.0,0,0,0,96.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Convert (Not Cancelled -> 1 and Cacelled --> 0)

In [19]:
status_dec = {'Not_Canceled': 0, 'Canceled': 1}
df['booking status'] = [status_dec[i] for i in df['booking status']]
df['booking status']

0        0
1        0
2        1
3        1
4        1
        ..
36280    1
36281    0
36282    0
36283    0
36284    0
Name: booking status, Length: 36248, dtype: int64

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36248 entries, 0 to 36284
Data columns (total 31 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   number of adults                   36248 non-null  float64
 1   number of children                 36248 non-null  float64
 2   number of weekend nights           36248 non-null  float64
 3   number of week nights              36248 non-null  float64
 4   car parking space                  36248 non-null  float64
 5   lead time                          36248 non-null  float64
 6   repeated                           36248 non-null  int64  
 7   P-C                                36248 non-null  int64  
 8   P-not-C                            36248 non-null  int64  
 9   average price                      36248 non-null  float64
 10  special requests                   36248 non-null  float64
 11  booking status                     36248 non-null  int64  


In [21]:
x_all = df.drop(['booking status'], axis=1)
y_all = df['booking status'].values.flatten()
print("Number of columns: ", x_all.shape[1])

Number of columns:  30


# Next Steps Plan:
We want to perform **feature extraction** to reduce the number of columns (dimensions). However, before doing so, we want to see how the model's accuracy will be without removing any columns or performing any feature extraction. The plan is as follows:

1. Run the Logistic Regression and KNN models to compare their accuracies.
2. Perform feature extraction to reduce the number of columns and the computational power required.
3. Rerun the models on the dataset after reducing the columns and see how it affects the accuracy.

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=42)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Running The Logistic Regression Model

In [23]:
logistic_model = LogisticRegression()

# For our training data, we will use cross-validation to ensure that our model doesn't overfit to only one split
logistic_cv_scores = cross_val_score(logistic_model, x_train_scaled, y_train, cv=5)
logistic_cv_scores = np.round(logistic_cv_scores, 2)
print("Logistic Regression Cross-Validation Scores:", logistic_cv_scores)
print("Logistic Regression Mean CV Accuracy:", logistic_cv_scores.mean())

Logistic Regression Cross-Validation Scores: [0.8  0.81 0.8  0.81 0.8 ]
Logistic Regression Mean CV Accuracy: 0.804


In [24]:
# simulating the process of putting our model in production environment, and seeing new data other than the training one
print("Simulating the process of putting our model in production environment, and seeing new data")

logistic_model.fit(x_train_scaled, y_train)
y_pred = logistic_model.predict(x_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Simulating the process of putting our model in production environment, and seeing new data
Accuracy: 0.81


# Running KNN model

In [25]:
knn_model = KNeighborsClassifier(n_neighbors=3)

# Fit the model on the scaled training data
knn_cv_scores = cross_val_score(knn_model, x_train_scaled, y_train, cv=5)
knn_cv_scores = np.round(knn_cv_scores, 2)
print("KNN Cross-Validation Scores:", knn_cv_scores)
print("KNN Mean CV Accuracy:", knn_cv_scores.mean())

KNN Cross-Validation Scores: [0.85 0.85 0.85 0.84 0.84]
KNN Mean CV Accuracy: 0.8459999999999999


In [26]:
# simulating the process of putting our model in production environment, and seeing new data other than the training one
print("Simulating the process of putting our model in production environment, and seeing new data")

knn_model.fit(x_train_scaled, y_train)
y_pred_knn = knn_model.predict(x_test_scaled)

# Evaluate the KNN model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f'Accuracy: {accuracy_knn:.2f}')

Simulating the process of putting our model in production environment, and seeing new data
Accuracy: 0.85


# Running RandomForest model

In [27]:
# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier()

# Cross-validation to ensure the model doesn't overfit
rf_cv_scores = cross_val_score(rf_model, x_train_scaled, y_train, cv=5)
rf_cv_scores = np.round(rf_cv_scores, 2)

print("Random Forest Cross-Validation Scores:", rf_cv_scores)
print("Random Forest Mean CV Accuracy:", rf_cv_scores.mean())

Random Forest Cross-Validation Scores: [0.89 0.9  0.89 0.9  0.9 ]
Random Forest Mean CV Accuracy: 0.8960000000000001


In [28]:
print("Simulating the process of putting our model in production environment, and seeing new data")
# Train the model on the training data
rf_model.fit(x_train_scaled, y_train)

# Predict using the test data
y_pred = rf_model.predict(x_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Simulating the process of putting our model in production environment, and seeing new data
Accuracy: 0.90


# Feature Selection To Reduce Computational Power

## Filter by variance

In [29]:
x_all.var()

number of adults                        0.000000
number of children                      0.000000
number of weekend nights                0.752785
number of week nights                   1.665486
car parking space                       0.000000
lead time                            6672.013029
repeated                                0.024894
P-C                                     0.135742
P-not-C                                 3.073451
average price                        1003.182688
special requests                        0.558144
reservation_year                        0.147760
reservation_month                       9.404063
reservation_day                        76.287644
type of meal_Meal Plan 1                0.178616
type of meal_Meal Plan 2                0.082821
type of meal_Meal Plan 3                0.000138
type of meal_Not Selected               0.121519
room type_Room_Type 1                   0.174064
room type_Room_Type 2                   0.018727
room type_Room_Type 

In [30]:
variances = x_all.var()

# Define a threshold
threshold = 0.1

# Identify columns to drop based on variance threshold
columns_to_drop = variances[variances < threshold].index

x_all.drop(columns=columns_to_drop, inplace=True)
print("Updated DataFrame columns with their variance:")
print(x_all.var())


Updated DataFrame columns with their variance:
number of weekend nights          0.752785
number of week nights             1.665486
lead time                      6672.013029
P-C                               0.135742
P-not-C                           3.073451
average price                  1003.182688
special requests                  0.558144
reservation_year                  0.147760
reservation_month                 9.404063
reservation_day                  76.287644
type of meal_Meal Plan 1          0.178616
type of meal_Not Selected         0.121519
room type_Room_Type 1             0.174064
room type_Room_Type 4             0.139070
market segment type_Offline       0.206011
market segment type_Online        0.230389
dtype: float64


In [31]:
## Filter by Correlation
correlation_matrix = x_all.corr().abs()

# Set a threshold for high correlation
threshold = 0.8

# Create a set to hold features to drop
features_to_drop = set()

# Loop through the correlation matrix
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if correlation_matrix.iloc[i, j] > threshold:
            # Get the names of the features
            feature_name = correlation_matrix.columns[i]
            features_to_drop.add(feature_name)

print("Dropping highly correlated features:", features_to_drop)
x_all.drop(columns=features_to_drop, inplace=True)
print("Number of columns: ", x_all.shape[1])
print(x_all.columns)

Dropping highly correlated features: {'room type_Room_Type 4', 'market segment type_Online'}
Number of columns:  14
Index(['number of weekend nights', 'number of week nights', 'lead time', 'P-C',
       'P-not-C', 'average price ', 'special requests', 'reservation_year',
       'reservation_month', 'reservation_day', 'type of meal_Meal Plan 1',
       'type of meal_Not Selected', 'room type_Room_Type 1',
       'market segment type_Offline'],
      dtype='object')


In [32]:
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=42)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Running Linear Regression Model Again

In [33]:
model = LogisticRegression()

logistic_model = LogisticRegression()

# For our training data, we will use cross-validation to ensure that our model doesn't overfit to only one split
logistic_cv_scores = cross_val_score(logistic_model, x_train_scaled, y_train, cv=5)
logistic_cv_scores = np.round(logistic_cv_scores, 2)
print("Logistic Regression Cross-Validation Scores:", logistic_cv_scores)
print("Logistic Regression Mean CV Accuracy:", logistic_cv_scores.mean())

Logistic Regression Cross-Validation Scores: [0.79 0.8  0.8  0.8  0.8 ]
Logistic Regression Mean CV Accuracy: 0.798


In [34]:

# simulating the process of putting our model in production environment, and seeing new data other than the training one
print("Simulating the process of putting our model in production environment, and seeing new data")
model.fit(x_train_scaled, y_train)
# Make predictions on the test data
y_pred = model.predict(x_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Simulating the process of putting our model in production environment, and seeing new data
Accuracy: 0.80


# Running KNN Model Again

In [35]:
knn_model = KNeighborsClassifier(n_neighbors=3)

# For our training data, we will use cross-validation to ensure that our model doesn't overfit to only one split
knn_cv_scores = cross_val_score(knn_model, x_train_scaled, y_train, cv=5)
knn_cv_scores = np.round(knn_cv_scores, 2)
print("KNN Cross-Validation Scores:", knn_cv_scores)
print("KNN Mean CV Accuracy:", knn_cv_scores.mean())


KNN Cross-Validation Scores: [0.85 0.85 0.84 0.85 0.84]
KNN Mean CV Accuracy: 0.8460000000000001


In [36]:
# simulating the process of putting our model in production environment, and seeing new data other than the training one
print("Simulating the process of putting our model in production environment, and seeing new data")

knn_model.fit(x_train_scaled, y_train)
# Make predictions on the scaled test data
y_pred_knn = knn_model.predict(x_test_scaled)

# Evaluate the KNN model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f'Accuracy: {accuracy_knn:.2f}')

Simulating the process of putting our model in production environment, and seeing new data
Accuracy: 0.85


# Running RandomForest model

In [37]:
# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier()

# Cross-validation to ensure the model doesn't overfit
rf_cv_scores = cross_val_score(rf_model, x_train_scaled, y_train, cv=5)
rf_cv_scores = np.round(rf_cv_scores, 2)

print("Random Forest Cross-Validation Scores:", rf_cv_scores)
print("Random Forest Mean CV Accuracy:", rf_cv_scores.mean())

Random Forest Cross-Validation Scores: [0.89 0.9  0.89 0.9  0.9 ]
Random Forest Mean CV Accuracy: 0.8960000000000001


In [38]:
print("Simulating the process of putting our model in production environment, and seeing new data")
# Train the model on the training data
rf_model.fit(x_train_scaled, y_train)

# Predict using the test data
y_pred = rf_model.predict(x_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Simulating the process of putting our model in production environment, and seeing new data
Accuracy: 0.90


In [39]:
pickle.dump(rf_model, open('model.pkl', 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))

In [40]:
print(x_test.columns)

Index(['number of weekend nights', 'number of week nights', 'lead time', 'P-C',
       'P-not-C', 'average price ', 'special requests', 'reservation_year',
       'reservation_month', 'reservation_day', 'type of meal_Meal Plan 1',
       'type of meal_Not Selected', 'room type_Room_Type 1',
       'market segment type_Offline'],
      dtype='object')


In [41]:
print(x_test.iloc[45])
print("--------------------")
print(x_test_scaled[45])
print(y_pred[45])
print("-----------------")
print(y_test[45])

number of weekend nights          2.00
number of week nights             2.00
lead time                       229.00
P-C                               0.00
P-not-C                           0.00
average price                    80.75
special requests                  0.00
reservation_year               2018.00
reservation_month                10.00
reservation_day                  22.00
type of meal_Meal Plan 1          1.00
type of meal_Not Selected         0.00
room type_Room_Type 1             1.00
market segment type_Offline       0.00
Name: 5254, dtype: float64
--------------------
[ 1.3749596  -0.13996881  1.76942261 -0.0646496  -0.08670301 -0.70146497
 -0.80988696  0.46722758  0.84484826  0.73479046  0.54980532 -0.4044135
  0.53564216 -0.64459543]
1
-----------------
1


In [42]:
print(x_test.iloc[5])
print("--------------------")
print(x_test_scaled[5])
print(y_pred[5])
print("-----------------")
print(y_test[5])

number of weekend nights          0.00
number of week nights             1.00
lead time                         1.00
P-C                               0.00
P-not-C                           2.00
average price                    20.75
special requests                  1.00
reservation_year               2017.00
reservation_month                10.00
reservation_day                  13.00
type of meal_Meal Plan 1          1.00
type of meal_Not Selected         0.00
room type_Room_Type 1             1.00
market segment type_Offline       0.00
Name: 32004, dtype: float64
--------------------
[-0.93259479 -0.91558804 -1.01696081 -0.0646496   1.00809945 -2.59570761
  0.5318122  -2.13628229  0.84484826 -0.29700863  0.54980532 -0.4044135
  0.53564216 -0.64459543]
0
-----------------
0


# Coclusion After Feature Extraction
While the results indicate a minor decline in logistic regression accuracy, the constant accuracy in the KNN model and the reduced number of features suggest that the feature extraction process was somewhat successful. The key is to balance performance with complexity and to ensure that the models generalize well to unseen data. 