In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Load the dataset
df = pd.read_csv('first inten project.csv')

# Check for null values
print("Null values in the dataset:")
print(df.isnull().sum())

# Check data types
print("\nData types in the dataset:")
print(df.dtypes)

# Remove any whitespace in column names
df.columns = df.columns.str.replace(' ', '')
print(df.columns)

Null values in the dataset:
Booking_ID                  0
number of adults            0
number of children          0
number of weekend nights    0
number of week nights       0
type of meal                0
car parking space           0
room type                   0
lead time                   0
market segment type         0
repeated                    0
P-C                         0
P-not-C                     0
average price               0
special requests            0
date of reservation         0
booking status              0
dtype: int64

Data types in the dataset:
Booking_ID                   object
number of adults              int64
number of children            int64
number of weekend nights      int64
number of week nights         int64
type of meal                 object
car parking space             int64
room type                    object
lead time                     int64
market segment type          object
repeated                      int64
P-C                      

In [23]:

# Check for outliers using IQR
numeric_df = df.select_dtypes(include=['int64', 'float64'])
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1

outliers = numeric_df.apply(lambda x: ((x < (Q1[x.name] - 1.5 * IQR[x.name])) | (x > (Q3[x.name] + 1.5 * IQR[x.name]))))

print("\nOutliers in the dataset:")
print(outliers.sum())


Outliers in the dataset:
numberofadults           10175
numberofchildren          2702
numberofweekendnights       21
numberofweeknights         324
carparkingspace           1124
leadtime                  1332
repeated                   930
P-C                        338
P-not-C                    812
averageprice              1696
specialrequests            762
dtype: int64


In [33]:
if 'booking status' in df.columns:
    X = df.drop('booking status', axis=1)
    y = df['booking status']
    X_selected = SelectKBest(chi2, k=10).fit_transform(X, y)
else:
    print("Column 'booking status' does not exist in the DataFrame.")

Column 'booking status' does not exist in the DataFrame.


In [53]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

if 'booking status' in df.columns:
    X = df.drop('booking status', axis=1)
    y = df['booking status']
    X_selected = SelectKBest(chi2, k=10).fit_transform(X, y)

    # Reshape X_selected to a 2D array
    X_selected = X_selected.reshape(-1, 1)

    X_selecetd_scaled = scaler.fit_transform(X_selected)
else:
    print("Column 'booking status' does not exist in the DataFrame.")

Column 'booking status' does not exist in the DataFrame.


In [71]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2

# Assuming df is your DataFrame
if 'booking status' in df.columns:
    X = df.drop('booking status', axis=1)
    y = df['booking status']

    # Encode categorical features using LabelEncoder
    le = LabelEncoder()
    for col in X.columns:
        if X[col].dtype == 'object':
            X[col] = le.fit_transform(X[col])

    # Use SelectKBest with chi2
    X_selected = SelectKBest(chi2, k=10).fit_transform(X, y)

    # Split dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Training the Logistic Regression model on the Training set
    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)

    # Predicting the Test set results
    y_pred = model.predict(X_test_scaled)

    # Making the Confusion Matrix
    print("\nAccuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))


Accuracy: 0.7807634008543475
Classification Report:
              precision    recall  f1-score   support

    Canceled       0.73      0.53      0.61      2402
Not_Canceled       0.80      0.91      0.85      4855

    accuracy                           0.78      7257
   macro avg       0.76      0.72      0.73      7257
weighted avg       0.78      0.78      0.77      7257

Confusion Matrix:
[[1270 1132]
 [ 459 4396]]
