# Import Required Libraries

In [None]:
# Import Required Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Loading the Dataset

In [None]:
# Read the case-study CSV (path is relative to the working directory) and
# preview the first rows.
DATASET_FILE = 'Marketing_Analytics_Insights_Case_Study_Dataset.csv'
data = pd.read_csv(DATASET_FILE)
data.head()

# Data Cleaning

In [None]:
# Count the missing entries in every column of the dataset
data.isna().sum()

In [None]:
# Remove rows stamped with 29 Feb 2017, a date that does not exist (2017 is
# not a leap year). The value is matched both in ISO form and in the
# day/month/year form that the parser below expects, because an impossible
# date left in the column makes pd.to_datetime raise.
invalid_dates = ['2017-02-29', '29/02/2017']

# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not write into a slice (avoids SettingWithCopyWarning).
data = data[~data['Date'].isin(invalid_dates)].copy()

# Correct data types: parse the 'Date' strings (day/month/year) into datetimes
data['Date'] = pd.to_datetime(data['Date'], format="%d/%m/%Y")

data.head()

In [None]:
# Print the DataFrame summary: column names, dtypes and non-null counts
data.info()

# Descriptive Statistics

In [None]:
# Show pandas' default descriptive statistics for `data`
data.describe()

# Key Metrics

In [None]:
# Daily totals of Net Room Nights (one row per calendar date)
net_room_nights_by_date = data.groupby('Date')['Net Room Nights'].sum().reset_index()

# Weekly totals of Net Room Nights.
# Resampling with `on='Date'` reads the datetime column directly, so `data` is
# never re-indexed in place. If anything in this cell fails, 'Date' is still a
# regular column for the cells that follow and the cell can simply be re-run.
net_room_nights_by_week = (
    data.resample('W', on='Date')['Net Room Nights']
    .sum()
    .reset_index()
)

# Plot the weekly time series
plt.figure(figsize=(20, 6))
plt.plot(net_room_nights_by_week['Date'], net_room_nights_by_week['Net Room Nights'])
plt.title('Net Room Nights by Week')
plt.xlabel('Week')
plt.ylabel('Net Room Nights')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()  # Adjust layout to prevent cut-off labels
plt.show()


In [None]:
# Data overview by marketing channel: median, mean and total of each metric
metrics = ['Visits', 'Qualified Shoppers', 'Bounced Visits', 'Gross Orders', 'Net Orders', 'Net Room Nights']

# Aggregation name -> suffix used in the output column names.
# Dict order fixes the column order within each metric.
stat_suffixes = {'median': 'Median', 'mean': 'Mean', 'sum': 'Total'}

# A single grouped aggregation replaces one boolean-mask scan per channel.
# sort=False keeps the channels in order of first appearance, and rows with a
# missing channel are left out instead of producing an empty statistics row.
channel_stats_df = (
    data.groupby('Marketing Channel', sort=False)[metrics]
    .agg(list(stat_suffixes))
)

# Flatten the (metric, statistic) column pairs into '<metric>_<Suffix>' names
channel_stats_df.columns = [
    f'{metric}_{stat_suffixes[stat]}' for metric, stat in channel_stats_df.columns
]
channel_stats_df = channel_stats_df.reset_index()

# Sort by mean net room nights
channel_stats_df = channel_stats_df.sort_values('Net Room Nights_Mean', ascending=False)

# Round only the float columns so integer totals keep their dtype
numeric_cols = channel_stats_df.select_dtypes(include=['float64']).columns
channel_stats_df[numeric_cols] = channel_stats_df[numeric_cols].round(2)

channel_stats_df

# Correlation Analysis

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
numeric_data = data.select_dtypes(include=[np.number])  # non-numeric columns are left out
correlation_matrix = numeric_data.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

# Segment plots

In [None]:
# Mean Net Room Nights per segment value, largest first, for each segment column
geo_region_group = data.groupby('Geographic Region')['Net Room Nights'].mean().sort_values(ascending=False)
platform_group = data.groupby('Platform')['Net Room Nights'].mean().sort_values(ascending=False)
marketing_channel_group = data.groupby('Marketing Channel')['Net Room Nights'].mean().sort_values(ascending=False)

# 2x2 grid: one histogram followed by three bar charts
fig, axs = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('Key Metrics Visualization', fontsize=16)
axs = axs.flatten()  # address the panels with a single index

# Panel 0: distribution of the target metric
hist_ax = axs[0]
sns.histplot(data['Net Room Nights'], bins=30, ax=hist_ax)
hist_ax.set_title('Distribution of Net Room Nights')
hist_ax.set_xlabel('Net Room Nights')
hist_ax.set_ylabel('Frequency')

# Panels 1-3: mean Net Room Nights for each segment, with the value printed above each bar
segment_means = [
    ('Geographic Region', geo_region_group),
    ('Platform', platform_group),
    ('Marketing Channel', marketing_channel_group),
]
for bar_ax, (segment, means) in zip(axs[1:], segment_means):
    means.plot(kind='bar', ax=bar_ax)
    bar_ax.set_title(f'Mean Net Room Nights by {segment}')
    bar_ax.set_xlabel(segment)
    bar_ax.set_ylabel('Mean Net Room Nights')
    for bar in bar_ax.patches:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        bar_ax.annotate(f'{bar_top:.2f}', (bar_center, bar_top),
                        ha='center', va='bottom', fontsize=8, color='black')

# Adjust layout and display
plt.tight_layout()
plt.show()

In [None]:
# Box plots of one metric, split by each segment column (one figure per segment)
variable = 'Bounced Visits'

for segment in ('Geographic Region', 'Platform', 'Marketing Channel'):
    plt.figure(figsize=(6, 3))
    sns.boxplot(x=segment, y=variable, data=data)
    plt.title(f'{variable} by {segment}')
    plt.xlabel(segment)
    plt.ylabel(variable)
    if segment == 'Marketing Channel':
        # Only the marketing-channel figure gets tilted tick labels
        plt.xticks(rotation=45)
    plt.show()

In [None]:
# Box plots of one metric by segment, stacked in a single figure on log-scaled y-axes
variable = 'Bounced Visits'
segments = ['Geographic Region', 'Platform', 'Marketing Channel']

# One row per segment; x-axes are independent because the categories differ
fig, axes = plt.subplots(len(segments), 1, figsize=(6, 9), sharex=False)

for ax, segment in zip(axes, segments):
    sns.boxplot(x=segment, y=variable, data=data, ax=ax)
    ax.set_title(f'{variable} by {segment}')
    ax.set_xlabel(segment)
    ax.set_ylabel(variable)
    ax.set_yscale('log')  # Set y-axis to logarithmic scale

# Tilt the tick labels of the marketing-channel panel only
axes[2].tick_params(axis='x', rotation=45)

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()


# Regression Analysis

In [None]:
# Extract calendar features from the parsed 'Date' column
data['Day_of_Week'] = data['Date'].dt.dayofweek
data['Day_of_Month'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Is_Weekend'] = (data['Day_of_Week'] >= 5).astype(int)  # 5 and 6 correspond to Saturday and Sunday

# Feature lists consumed by the preprocessor
categorical_features = ['Geographic Region', 'Platform', 'Marketing Channel']
numeric_features = ['Visits', 'Qualified Shoppers', 'Bounced Visits', 'Gross Orders', 'Net Orders',
                    'Day_of_Week', 'Day_of_Month', 'Month', 'Is_Weekend']
numeric_features_reduced = ['Visits', 'Bounced Visits',
                            'Day_of_Week', 'Day_of_Month', 'Month', 'Is_Weekend']

# Preprocessor: standardise the numeric columns and one-hot encode the categoricals.
# drop='first' omits one reference level per categorical feature. With every
# level encoded, each feature's dummy columns sum to 1 and duplicate the
# intercept added below, which makes the design matrix rank-deficient (the
# dummy-variable trap): the coefficients are not uniquely determined and the
# variance inflation factors become infinite.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ])

# Predictors and target. Columns not named in a transformer are dropped by the
# ColumnTransformer's default remainder handling.
X = data.drop(['Date', 'Net Room Nights'], axis=1)
y = data['Net Room Nights']

# Fit the preprocessor and transform the predictors in one step
X_processed = preprocessor.fit_transform(X)

# Column names in transformer order: numeric features first, then the dummies
feature_names = (numeric_features +
                 preprocessor.named_transformers_['cat']
                 .get_feature_names_out(categorical_features).tolist())

# Fit statsmodels OLS for detailed statistics.
# The frame is built on y's index so predictors and target align row by row.
X_processed_df = pd.DataFrame(X_processed, columns=feature_names, index=y.index)
X_train_sm = sm.add_constant(X_processed_df)

model_sm = sm.OLS(y, X_train_sm).fit()

model_sm.summary()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor for every column of the OLS design matrix
# (the constant column is included), listed from largest to smallest
design_matrix = X_train_sm.values
vif_data = pd.DataFrame({
    "feature": X_train_sm.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, column_position)
        for column_position in range(design_matrix.shape[1])
    ],
})

vif_data.sort_values('VIF', ascending=False)

# Feature Importance

In [None]:
# Fit a Random Forest Regressor on the processed features to obtain feature
# importances (fixed seed for reproducible results)
rf_model = RandomForestRegressor(random_state=42).fit(X_processed_df, y)

# In-sample fit quality: the score is computed on the same rows the model was
# trained on
y_train_pred = rf_model.predict(X_processed_df)
r2_train = r2_score(y, y_train_pred)

print(f"R² Score on Training Data: {r2_train:.4f}")

In [None]:
# Pair each importance reported by the fitted Random Forest with its feature name
feature_importances = pd.Series(rf_model.feature_importances_, index=feature_names)

# Horizontal bar chart, sorted ascending so the largest importance is drawn last
ranked_importances = feature_importances.sort_values()
fig, ax = plt.subplots(figsize=(10, 6))
ranked_importances.plot.barh(ax=ax)
ax.set_title('Feature Importances for Net Room Nights')
ax.set_xlabel('Gini Importance')
ax.set_ylabel('Feature')
plt.show()


# Trend Analysis

In [None]:
# Trend Analysis

# Analyze trends over time to identify patterns and opportunities for growing Room Nights

# Metrics to chart, in plotting order, with the extra line styling for each.
# Net Room Nights has no explicit colour and uses matplotlib's default.
trend_styles = {
    'Net Room Nights': {},
    'Visits': {'color': 'orange'},
    'Qualified Shoppers': {'color': 'green'},
    'Gross Orders': {'color': 'red'},
    'Net Orders': {'color': 'purple'},
}

# Daily total of every metric: one frame per metric with 'Date' and the metric column
daily_totals = {
    metric: data.groupby('Date')[metric].sum().reset_index()
    for metric in trend_styles
}

# Keep the per-metric frames available under their individual names
date_group = daily_totals['Net Room Nights']
visits_group = daily_totals['Visits']
qualified_shoppers_group = daily_totals['Qualified Shoppers']
gross_orders_group = daily_totals['Gross Orders']
net_orders_group = daily_totals['Net Orders']

# One figure per metric showing its daily trend
for metric, style in trend_styles.items():
    totals = daily_totals[metric]
    plt.figure(figsize=(14, 8))
    plt.plot(totals['Date'], totals[metric], marker='o', **style)
    plt.title(f'Trend of {metric} Over Time')
    plt.xlabel('Date')
    plt.ylabel(metric)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.show()

# All metrics on one shared axis to compare how they move together over time
plt.figure(figsize=(14, 8))
for metric, style in trend_styles.items():
    totals = daily_totals[metric]
    plt.plot(totals['Date'], totals[metric], marker='o', label=metric, **style)
plt.title('Trend Analysis of Key Metrics Over Time')
plt.xlabel('Date')
plt.ylabel('Values')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.show()