In [2]:
import pandas as pd

In [None]:
# Read the customer-booking CSV, decoding it as ISO-8859-1 (Latin-1),
# then preview the first 30 rows.
df = pd.read_csv(
    "./DATA/customer_booking.csv",
    encoding="ISO-8859-1",
)
df.head(n=30)

In [None]:
# Print column names, dtypes and non-null counts to spot missing values and non-numeric columns.
df.info()




<i>Information about the columns</i>
- `num_passengers` = number of passengers travelling
- `sales_channel` = sales channel booking was made on
- `trip_type` = trip type (Round Trip, One Way, Circle Trip)
- `purchase_lead` = number of days between travel date and booking date
- `length_of_stay` = number of days spent at destination
- `flight_hour` = hour of flight departure
- `flight_day` = day of week of flight departure
- `route` = origin -> destination flight route
- `booking_origin` = country from where booking was made
- `wants_extra_baggage` = if the customer wanted extra baggage in the booking
- `wants_preferred_seat` = if the customer wanted a preferred seat in the booking
- `wants_in_flight_meals` = if the customer wanted in-flight meals in the booking
- `flight_duration` = total duration of flight (in hours)
- `booking_complete` = flag indicating if the customer completed the booking

In [None]:
# Summary statistics for every column; include='all' adds the non-numeric columns as well.
df.describe(include='all')

In [None]:
# List the distinct values stored in flight_day.
df['flight_day'].unique()

In [7]:
# Encode weekday abbreviations as integers: 1 (Mon) through 7 (Sun).
days = {
    name: number
    for number, name in enumerate(
        ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], start=1
    )
}

In [8]:
# Replace weekday abbreviations with their integer codes from `days`.
# Guarded so the cell is idempotent: once the column is numeric, mapping it
# again would look up integers in a string-keyed dict and turn every value
# into NaN.
if not pd.api.types.is_numeric_dtype(df["flight_day"]):
    df["flight_day"] = df["flight_day"].map(days)

In [None]:
# Check the distinct flight_day values again after the mapping step.
df['flight_day'].unique()

In [None]:
# Summary statistics with describe()'s default column selection.
df.describe()


In [None]:
# Preview the first 40 rows of the frame.
df.head(40)

In [None]:
# Count how many bookings came through each sales channel.
df['sales_channel'].value_counts()


In [None]:
# One-hot encode sales_channel into sales_channel_<value> indicator columns.
# - The presence check makes the cell safe to re-run: after the first run the
#   source column is gone and get_dummies would raise a KeyError.
# - dtype=int yields 0/1 integers on every pandas version (pandas >= 2.0
#   would otherwise produce bool columns), so the plots and the model below
#   always see plain numeric features.
if 'sales_channel' in df.columns:
    df = pd.get_dummies(df, columns=['sales_channel'], dtype=int)

df.head(30)

In [None]:
import seaborn as sns 
# Scatter plus fitted linear trend: booking completion vs. purchase lead time.
sns.regplot(data=df, x='purchase_lead', y='booking_complete')

In [None]:
# Scatter plus fitted linear trend: booking completion vs. extra-baggage flag.
sns.regplot(data=df, x='wants_extra_baggage', y='booking_complete')

In [None]:
# Scatter plus fitted linear trend: booking completion vs. preferred-seat flag.
sns.regplot(data=df, x='wants_preferred_seat', y='booking_complete')

In [None]:
# Scatter plus fitted linear trend: booking completion vs. departure hour.
sns.regplot(data=df, x='flight_hour', y='booking_complete')

In [None]:
# Scatter plus fitted linear trend: booking completion vs. encoded day of week.
sns.regplot(data=df, x='flight_day', y='booking_complete')

In [None]:
# Scatter plus fitted linear trend: booking completion vs. flight duration.
sns.regplot(data=df, x='flight_duration', y='booking_complete')


In [None]:
# Scatter plus fitted linear trend: booking completion vs. the Mobile sales-channel indicator.
sns.regplot(data=df, x='sales_channel_Mobile', y='booking_complete')

In [None]:
# Scatter plus fitted linear trend: booking completion vs. length of stay.
sns.regplot(data=df, x='length_of_stay', y='booking_complete')

In [None]:
# Scatter plus fitted linear trend: booking completion vs. number of passengers.
sns.regplot(data=df, x='num_passengers', y='booking_complete')

In [23]:
# One-hot encode trip_type into trip_type_<value> indicator columns.
# The presence check keeps the cell re-runnable (the source column is dropped
# by the first run), and dtype=int gives 0/1 integers instead of the bool
# columns that pandas >= 2.0 returns by default.
if 'trip_type' in df.columns:
    df = pd.get_dummies(df, columns=['trip_type'], dtype=int)

In [None]:
# Preview the first 30 rows of the frame after the trip_type encoding step.
df.head(30)

In [None]:
# Scatter plus fitted linear trend: booking completion vs. the CircleTrip indicator.
sns.regplot(data=df, x='trip_type_CircleTrip', y='booking_complete')

In [None]:
# Scatter plus fitted linear trend: booking completion vs. the OneWay indicator.
sns.regplot(data=df, x='trip_type_OneWay', y='booking_complete')

In [None]:
# Scatter plus fitted linear trend: booking completion vs. the RoundTrip indicator.
sns.regplot(data=df, x='trip_type_RoundTrip', y='booking_complete')

In [None]:
!pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
import matplotlib.pyplot as plt

In [29]:
# Columns used for the correlation analysis: the candidate predictors
# followed by the target, booking_complete.
# NOTE(review): num_passengers and flight_hour are plotted earlier but are not
# listed here — confirm the omission is intentional.
corr_columns = [
    'purchase_lead',
    'length_of_stay',
    'flight_day',
    'wants_extra_baggage',
    'wants_preferred_seat',
    'wants_in_flight_meals',
    'flight_duration',
    'sales_channel_Internet',
    'sales_channel_Mobile',
    'trip_type_CircleTrip',
    'trip_type_OneWay',
    'trip_type_RoundTrip',
    'booking_complete',
]
corr_df = df.loc[:, corr_columns]

# Pairwise Pearson correlation (the DataFrame.corr default) between all selected columns.
correlation_matrix = corr_df.corr(method='pearson')

In [None]:
# Draw the correlation matrix as a heatmap and save it next to the data.
plt.figure(figsize=(12, 10))
# annot=True writes each coefficient into its cell; without it the fmt=".2f"
# argument is silently ignored and the heatmap shows colours only.
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title('Pearson Correlation Heatmap')
plt.savefig('./DATA/Correlation_heatmap.jpg', bbox_inches='tight', dpi=720)
plt.show()

In [31]:

# Predictor columns fed to the model (the target, booking_complete, is excluded).
features = [
    'purchase_lead',
    'length_of_stay',
    'flight_day',
    'wants_extra_baggage',
    'wants_preferred_seat',
    'wants_in_flight_meals',
    'flight_duration',
    'sales_channel_Internet',
    'sales_channel_Mobile',
    'trip_type_CircleTrip',
    'trip_type_OneWay',
    'trip_type_RoundTrip',
]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df['booking_complete'],
    test_size=0.2,
    random_state=42,
)

# Pipeline: degree-3 polynomial expansion -> standardisation -> ordinary least squares.
# NOTE(review): booking_complete is described as a completion flag, yet it is fit
# with a linear regressor, so predictions are unbounded scores rather than
# class labels — confirm this is the intended modelling choice.
degree = 3
model = make_pipeline(
    PolynomialFeatures(degree),
    StandardScaler(),
    LinearRegression(),
)

# Fit on the training split, then predict on the held-out rows.
y_pred = model.fit(X_train, y_train).predict(X_test)




In [None]:
# Score the held-out predictions: mean squared error and coefficient of determination.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r_squared = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics.
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r_squared}')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Compare the distribution of the actual target values with the model's
# predictions on the held-out split, and save the figure next to the data.
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))

# sns.distplot is deprecated and has been removed from recent seaborn releases;
# kdeplot draws the same KDE-only curve that distplot(hist=False) produced.
sns.kdeplot(y_test, color="blue", label="Actual")
sns.kdeplot(y_pred, color="red", label="Predicted")

plt.title("Distribution Plot of Actual vs Predicted Values")
plt.xlabel("Booking Completion")
plt.ylabel("Density")
plt.legend()
plt.savefig('./DATA/displot.jpg', bbox_inches='tight', dpi=720)
plt.show()
