In [None]:
##### Hotel rates with respect to customer segment type

In [None]:
# Spread of the average daily rate inside each market segment (raw, uncapped values).
rate_fig, rate_ax = plt.subplots(figsize=(8, 8))
sns.boxplot(x='market_segment', y='avg_daily_rate', data=df, ax=rate_ax)
plt.show()

In [None]:
# Upper Tukey fence of the average daily rate: Q3 + 1.5 * IQR.
q1, q3 = df['avg_daily_rate'].quantile([0.25, 0.75])
iqr = q3 - q1
ul = q3 + 1.5 * iqr
ul

In [None]:
# Overwrite extreme rates (>= 500) with `ul`, the upper IQR fence computed in the previous cell.
# The assignment mutates `df` in place, so every later cell sees the capped values.
# NOTE(review): the cut-off is the literal 500 rather than `ul`; rates between `ul` and 500
# stay untouched and therefore end up above the capped ones — confirm this is intended.
df.loc[df['avg_daily_rate'] >= 500, 'avg_daily_rate'] = ul

In [None]:
# Same per-segment boxplot, redrawn after the extreme rates were capped.
capped_fig, capped_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='market_segment', y='avg_daily_rate', data=df, ax=capped_ax)

* Rooms booked through the Online and Direct segments show high variation in price.
* The Complementary segment has very low prices.
* The Corporate, Offline and Groups segments have almost the same room prices.

In [None]:
# Number of bookings per market segment, most frequent segment first.
segment_order = df['market_segment'].value_counts().sort_values(ascending=False).index
plt.figure(figsize=(9, 3))
sns.countplot(x='market_segment', data=df, order=segment_order)
plt.show()

In [None]:
# Row-normalised cancellation share (in percent) for every market segment.
stacked = pd.crosstab(df['market_segment'], df['is_canceled'], normalize='index').mul(100)
stacked

In [None]:
# Stacked bar chart of the per-segment cancellation percentages.
segment_ax = stacked.plot.bar(stacked=True)
segment_ax.legend(loc='upper right')
plt.xticks(rotation=20)
segment_ax.set_ylabel("Percentage Cancellation")

In [None]:
* Complementary and Direct bookings have the lowest cancellation rates, while Groups, Online and Offline customers have higher cancellation rates.
* As the graph above shows, the hotel should target Online and Offline customers in addition to Groups customers.
* Corporate and Aviation customers also show notable cancellation rates that the hotel should pay attention to.

In [None]:
# Row-normalised cancellation share (in percent) for every deposit type.
stacked_deposit = pd.crosstab(df['deposit_type'], df['is_canceled'], normalize='index') * 100
# Display the table that was just built; the cell previously re-displayed the
# market-segment table `stacked` by mistake.
stacked_deposit

In [None]:
# Stacked bar chart of the cancellation percentages per deposit type.
deposit_ax = stacked_deposit.plot.bar(stacked=True)
deposit_ax.legend(loc='upper right')
plt.xticks(rotation=0)
deposit_ax.set_ylabel("Percentage Cancellation")

In [None]:
* Bookings with a Non-Refundable deposit show an almost 100% cancellation rate, while No-Deposit and Refundable bookings retain almost 75% of reservations.
* The deposit type is therefore highly important for reservation management.

In [None]:
# Row-normalised cancellation share (in percent) for first-time vs. repeated guests.
stacked_repeated = pd.crosstab(df['is_repeated_guest'], df['is_canceled'], normalize='index').mul(100)
stacked_repeated

In [None]:
# Stacked bar chart of the cancellation percentages for first-time vs. repeated guests.
repeat_ax = stacked_repeated.plot.bar(stacked=True)
repeat_ax.legend(loc='upper right')
plt.xticks(rotation=0)
plt.xlabel('Repeated Guest')
repeat_ax.set_ylabel("Percentage Cancellation")

In [None]:
* The cancellation rate of non-repeated guests is higher than that of repeated guests:
    - It is important for management to focus on improving the experience of non-repeated guests.

In [None]:
# Mean average daily rate per arrival month, ordered by calendar month.
months = ['January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December']
grup = df.groupby('arrival_date_month', as_index=False)['avg_daily_rate'].mean()
# An ordered categorical makes the sort follow the calendar instead of the alphabet.
grup = grup.assign(
    arrival_date_month=pd.Categorical(grup['arrival_date_month'], categories=months, ordered=True)
).sort_values('arrival_date_month')
grup

In [None]:
# Seasonal trend of the mean average daily rate across the calendar year.
plt.figure(figsize=(12, 4))
sns.lineplot(data=grup, x='arrival_date_month', y='avg_daily_rate')
plt.show()

In [None]:
##### 3. Outlier Detection and Treatment (IQR Method with Winsorization)


In [None]:
##### 4. Association Rule Mining using Apriori Algorithm


In [None]:


# Apriori tooling from mlxtend: frequent-itemset mining, rule generation and
# one-hot encoding of transactions.
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session
# (including pandas/seaborn deprecation notices) — consider a narrower filter.
warnings.filterwarnings('ignore')

# Banner describing the goal of this section.
print("Apriori Algorithm for Association Rule Mining")
print("=" * 80)
print("Objective: Find frequent patterns and association rules in hotel booking data")
print("Focus: Discover which combinations of features are associated with cancellations")
print("=" * 80)


In [None]:
##### 4.2. Data Preparation for Apriori Algorithm


In [None]:
# Load a fresh copy of the booking data for the Apriori analysis (independent of `df`).
# NOTE(review): absolute, machine-specific Windows path — the notebook cannot be re-run
# elsewhere; move it to a configurable, project-relative location.
df_apriori = pd.read_csv(r"C:\Users\91904\OneDrive\Desktop\dm-2\Hotel-Reservation-Cancellation-Prediction\hotel_booking.csv")

# Numerical features are discretised into labelled bins so they can act as items.
# pd.cut builds right-closed intervals, so without include_lowest=True a value equal
# to the first edge (0) falls outside every bin and becomes NaN. That silently removed
# same-day bookings (lead_time == 0), zero-rate bookings and zero-night stays from the
# transactions, hence include_lowest=True on every binning whose first edge is 0.

# Lead time (days between booking and arrival)
df_apriori['lead_time_category'] = pd.cut(df_apriori['lead_time'],
                                           bins=[0, 30, 90, 180, 365, float('inf')],
                                           labels=['Very_Short', 'Short', 'Medium', 'Long', 'Very_Long'],
                                           include_lowest=True)

# Average daily rate (negative rates, if any, still fall outside the bins and stay NaN)
df_apriori['adr_category'] = pd.cut(df_apriori['avg_daily_rate'],
                                   bins=[0, 50, 100, 150, 200, float('inf')],
                                   labels=['Low', 'Medium', 'High', 'Very_High', 'Premium'],
                                   include_lowest=True)

# Total length of stay = weekend nights + week nights
df_apriori['total_stays'] = df_apriori['stays_in_weekend_nights'] + df_apriori['stays_in_week_nights']
df_apriori['total_stays_category'] = pd.cut(df_apriori['total_stays'],
                                            bins=[0, 2, 5, 10, float('inf')],
                                            labels=['Short_Stay', 'Medium_Stay', 'Long_Stay', 'Extended_Stay'],
                                            include_lowest=True)

# Days on the waiting list (first edge is -1, so 0 already lands in 'No_Wait')
df_apriori['waiting_list_category'] = pd.cut(df_apriori['days_in_waiting_list'],
                                              bins=[-1, 0, 30, 100, float('inf')],
                                              labels=['No_Wait', 'Short_Wait', 'Medium_Wait', 'Long_Wait'])

# Readable item label for the target instead of 0/1
df_apriori['is_canceled_cat'] = df_apriori['is_canceled'].map({0: 'Not_Canceled', 1: 'Canceled'})


In [None]:
##### 4.3. Creating Transaction Dataset


In [None]:
# Features that become items in the transactions (kept small to limit memory use).
features_for_apriori = [
    'hotel',
    'market_segment',
    'deposit_type',
    'meal',
    'is_repeated_guest',
    'lead_time_category',
    'adr_category',
    'total_stays_category',
    'is_canceled_cat'
]

# Work on a reproducible 20% random sample to keep the encoded matrix small.
sample_size = int(len(df_apriori) * 0.2)
df_apriori_sample = df_apriori.sample(n=sample_size, random_state=42).reset_index(drop=True)

print(f"Using sample of {sample_size:,} records ({sample_size/len(df_apriori)*100:.1f}% of total data)")
print("This reduces memory usage while maintaining statistical significance.")


def _as_item(feature, value):
    """Build the item label '<feature>_<value>' with spaces and dashes turned into underscores."""
    cleaned = str(value).replace(' ', '_').replace('-', '_')
    return f"{feature}_{cleaned}"


# One transaction per booking: an item for every selected feature that is not missing.
transactions = [
    [_as_item(feature, value)
     for feature, value in zip(features_for_apriori, record)
     if pd.notna(value)]
    for record in df_apriori_sample[features_for_apriori].itertuples(index=False, name=None)
]

# One-hot encode the transactions into a boolean item matrix.
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)


In [None]:
##### 4.4. Finding Frequent Itemsets using Apriori


In [None]:
# Mine frequent itemsets. The first attempt uses 5% support and itemsets of up to
# four items; a MemoryError triggers one retry with stricter settings.
min_support = 0.05  # 5% of transactions


def _report_itemsets(itemsets, headline, hint_when_empty):
    """Print the completion banner, the itemset count and the ten best-supported itemsets."""
    print(headline)
    print(f"Total frequent itemsets found: {len(itemsets)}")
    if len(itemsets) > 0:
        print(f"\nTop 10 frequent itemsets by support:")
        print(itemsets.nlargest(10, 'support').to_string(index=False))
    elif hint_when_empty:
        print("\nNo frequent itemsets found. Try reducing min_support threshold.")


try:
    frequent_itemsets = apriori(
        df_encoded,
        min_support=min_support,
        use_colnames=True,
        verbose=1,
        low_memory=True,
        max_len=4,  # at most 4 items per itemset
    )
    _report_itemsets(frequent_itemsets, f"\n✓ Apriori algorithm completed!", True)

except MemoryError:
    print("\n⚠ Memory error still occurring. Further optimizations needed:")
    print("   - Increasing min_support to 0.1 (10%)")
    print("   - Using even smaller sample size")

    # Fallback: double the support threshold and shorten the itemsets.
    min_support = 0.1
    print(f"\nRetrying with min_support = {min_support}...")
    frequent_itemsets = apriori(
        df_encoded,
        min_support=min_support,
        use_colnames=True,
        verbose=1,
        low_memory=True,
        max_len=3,  # at most 3 items per itemset
    )
    _report_itemsets(frequent_itemsets, f"\n✓ Apriori algorithm completed with higher support!", False)


In [None]:
# Two-panel overview of the mined itemsets: best-supported itemsets (left) and
# how many itemsets exist per itemset size (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
support_ax, length_ax = axes

top_itemsets = frequent_itemsets.nlargest(15, 'support')


def _itemset_label(itemset):
    """Comma-join the items; itemsets with more than two items show the first two plus '...'."""
    members = list(itemset)
    if len(members) > 2:
        return ', '.join(members[:2]) + '...'
    return ', '.join(members)


bar_slots = range(len(top_itemsets))
support_ax.barh(bar_slots, top_itemsets['support'], color='steelblue', alpha=0.7)
support_ax.set_yticks(bar_slots)
support_ax.set_yticklabels([_itemset_label(s) for s in top_itemsets['itemsets']], fontsize=8)
support_ax.set_xlabel('Support', fontsize=12, fontweight='bold')
support_ax.set_title('Top 15 Frequent Itemsets by Support', fontsize=14, fontweight='bold')
support_ax.grid(True, alpha=0.3, axis='x')
support_ax.invert_yaxis()  # highest support at the top

# Number of itemsets per itemset size
itemset_lengths = frequent_itemsets['itemsets'].apply(len)
length_counts = itemset_lengths.value_counts().sort_index()

length_ax.bar(length_counts.index, length_counts.values, color='coral', alpha=0.7)
length_ax.set_xlabel('Itemset Length', fontsize=12, fontweight='bold')
length_ax.set_ylabel('Number of Itemsets', fontsize=12, fontweight='bold')
length_ax.set_title('Distribution of Itemset Lengths', fontsize=14, fontweight='bold')
length_ax.grid(True, alpha=0.3, axis='y')

# Write the count slightly above every bar.
for size, count in zip(length_counts.index, length_counts.values):
    length_ax.text(size, count + 10, str(count), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\nItemset Length Statistics:")
for size in (1, 2, 3):
    print(f"  {size}-item itemsets: {len(frequent_itemsets[itemset_lengths == size])}")
print(f"  4+ item itemsets: {len(frequent_itemsets[itemset_lengths >= 4])}")


In [None]:
# Derive association rules from the frequent itemsets and list the ten strongest.
def _title_items(items):
    """Render an itemset as comma-separated, title-cased text with underscores as spaces."""
    return ', '.join([str(item).replace('_', ' ').title() for item in list(items)])


if len(frequent_itemsets) > 0:
    # Minimum confidence a rule needs to be kept
    min_confidence = 0.3

    print("Generating Association Rules...")
    print(f"Minimum Confidence: {min_confidence} ({min_confidence*100}%)")
    print("=" * 80)

    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

    # Strongest rules first: by confidence, ties broken by lift
    rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])

    print(f"\n Association rules generated!")
    print(f"Total rules found: {len(rules)}")

    if len(rules) == 0:
        print("No association rules found with current confidence threshold.")
        print("Try reducing min_confidence threshold.")
    else:
        print(f"\nTop 10 Association Rules by Confidence:")
        print("=" * 80)

        top_rules = rules.head(10)
        for position, (_rule_key, rule) in enumerate(top_rules.iterrows(), 1):
            print(f"\nRule {position}:")
            print(f"  IF {_title_items(rule['antecedents'])}")
            print(f"  THEN {_title_items(rule['consequents'])}")
            print(f"  Support: {rule['support']:.4f} | Confidence: {rule['confidence']:.4f} | Lift: {rule['lift']:.4f}")
else:
    # Nothing to derive rules from; keep `rules` defined for the cells below.
    print(" No frequent itemsets found. Cannot generate association rules.")
    print("Try reducing min_support threshold in the previous cell.")
    rules = pd.DataFrame()


In [None]:
# Split the rules by outcome: consequent says "canceled" vs. "not canceled".
if len(rules) == 0:
    print("No association rules available to filter.")
    canceled_rules = pd.DataFrame()
    not_canceled_rules = pd.DataFrame()
else:
    def is_canceled_rule(consequents):
        """True when the consequent mentions 'canceled' but not 'not_canceled'."""
        joined = ' '.join([str(item).lower() for item in consequents])
        return 'not_canceled' not in joined and 'canceled' in joined

    def is_not_canceled_rule(consequents):
        """True when the consequent mentions 'not_canceled'."""
        joined = ' '.join([str(item).lower() for item in consequents])
        return 'not_canceled' in joined

    def _print_rule_group(group, outcome_text, empty_text):
        """Print up to ten rules of `group` as IF/THEN lines with their metrics."""
        if len(group) == 0:
            print(empty_text)
            return
        for position, (_rule_key, rule) in enumerate(group.head(10).iterrows(), 1):
            condition = ', '.join([str(item).replace('_', ' ').title() for item in list(rule['antecedents'])])
            print(f"\n{position}. IF {condition}")
            print(f"   THEN {outcome_text}")
            print(f"   Support: {rule['support']:.4f} | Confidence: {rule['confidence']:.4f} | Lift: {rule['lift']:.4f}")

    canceled_rules = rules[rules['consequents'].apply(is_canceled_rule)]
    not_canceled_rules = rules[rules['consequents'].apply(is_not_canceled_rule)]

    print("=" * 80)
    print("ASSOCIATION RULES RELATED TO CANCELLATION")
    print("=" * 80)

    print(f"\nRules leading to CANCELLATION (Top 10):")
    print("-" * 80)
    _print_rule_group(canceled_rules, "Canceled",
                      "No rules found leading to cancellation with current thresholds.")

    print(f"\n\nRules leading to NO CANCELLATION (Top 10):")
    print("-" * 80)
    _print_rule_group(not_canceled_rules, "Not Canceled",
                      "No rules found leading to no cancellation with current thresholds.")


In [None]:
##### 4.6. Visualization of Association Rules


In [None]:
# 2x2 dashboard of the association rules: two metric scatter plots on top and the
# ten most confident "canceled" / "not canceled" rules below.
def _metric_scatter(ax, x_metric, y_metric, colour_metric, cmap_name):
    """Scatter two rule metrics against each other, coloured by a third one."""
    points = ax.scatter(rules[x_metric], rules[y_metric],
                        c=rules[colour_metric], cmap=cmap_name, alpha=0.6, s=50)
    ax.set_xlabel(x_metric.title(), fontsize=12, fontweight='bold')
    ax.set_ylabel(y_metric.title(), fontsize=12, fontweight='bold')
    ax.set_title(f'{x_metric.title()} vs {y_metric.title()} (colored by {colour_metric.title()})',
                 fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)
    colour_bar = plt.colorbar(points, ax=ax)
    colour_bar.set_label(colour_metric.title(), fontsize=10)


def _confidence_bars(ax, rule_subset, bar_colour, panel_title, empty_text):
    """Horizontal bars of the ten most confident rules, or a placeholder text when there are none."""
    if len(rule_subset) == 0:
        ax.text(0.5, 0.5, empty_text, ha='center', va='center', fontsize=12)
        ax.set_title(panel_title, fontsize=14, fontweight='bold')
        return

    best = rule_subset.head(10)
    slots = np.arange(len(best))
    ax.barh(slots, best['confidence'], color=bar_colour, alpha=0.7)
    ax.set_yticks(slots)

    # Tick label: first two antecedent items (15 chars each), '...' if more, max 40 chars.
    tick_labels = []
    for antecedent_set in best['antecedents']:
        items = list(antecedent_set)
        text = ', '.join([str(item).replace('_', ' ')[:15] for item in items[:2]])
        if len(items) > 2:
            text += '...'
        tick_labels.append(text[:40])

    ax.set_yticklabels(tick_labels, fontsize=8)
    ax.set_xlabel('Confidence', fontsize=12, fontweight='bold')
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='x')
    ax.invert_yaxis()


if len(rules) == 0:
    print("No rules available for visualization. Please ensure frequent itemsets were found.")
else:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    _metric_scatter(axes[0, 0], 'support', 'confidence', 'lift', 'viridis')
    _metric_scatter(axes[0, 1], 'confidence', 'lift', 'support', 'plasma')
    _confidence_bars(axes[1, 0], canceled_rules, 'crimson',
                     'Top 10 Rules Leading to Cancellation', 'No cancellation rules found')
    _confidence_bars(axes[1, 1], not_canceled_rules, 'forestgreen',
                     'Top 10 Rules Leading to No Cancellation', 'No no-cancellation rules found')

    plt.suptitle('Association Rules Analysis - Apriori Algorithm', fontsize=16, fontweight='bold', y=0.995)
    plt.tight_layout()
    plt.show()


In [None]:
##### 4.7. Key Insights from Apriori Algorithm


In [None]:
# Text report summarising the Apriori run. It only reads variables created by the
# earlier Apriori cells (sample_size, df_apriori, features_for_apriori, te,
# frequent_itemsets, min_support, rules, min_confidence, canceled_rules,
# not_canceled_rules), so those cells must have been executed first.
print("=" * 80)
print("APRIORI ALGORITHM SUMMARY")
print("=" * 80)

# 1. Size of the sample and of the item vocabulary
print(f"\n1. Data Processing:")
print(f"   - Sample size used: {sample_size:,} records ({sample_size/len(df_apriori)*100:.1f}% of total)")
print(f"   - Features analyzed: {len(features_for_apriori)}")
print(f"   - Unique items in transactions: {len(te.columns_)}")

# 2. Frequent itemsets found at the support threshold that was finally used
print(f"\n2. Frequent Itemsets:")
if len(frequent_itemsets) > 0:
    print(f"   - Total frequent itemsets found: {len(frequent_itemsets)}")
    print(f"   - Minimum support threshold: {min_support} ({min_support*100}%)")
else:
    print(f"   - No frequent itemsets found with min_support = {min_support}")
    print(f"   - Try reducing min_support threshold")

# 3./4. Rule statistics; min_confidence only exists when rules were generated,
# which is why it is read inside this branch only.
if len(rules) > 0:
    print(f"\n3. Association Rules:")
    print(f"   - Total rules generated: {len(rules)}")
    print(f"   - Minimum confidence threshold: {min_confidence} ({min_confidence*100}%)")
    print(f"   - Average confidence: {rules['confidence'].mean():.4f}")
    print(f"   - Average lift: {rules['lift'].mean():.4f}")
    print(f"   - Maximum lift: {rules['lift'].max():.4f}")

    print(f"\n4. Cancellation-Related Rules:")
    print(f"   - Rules leading to cancellation: {len(canceled_rules)}")
    if len(canceled_rules) > 0:
        print(f"   - Average confidence: {canceled_rules['confidence'].mean():.4f}")
        print(f"   - Average lift: {canceled_rules['lift'].mean():.4f}")

    print(f"   - Rules leading to no cancellation: {len(not_canceled_rules)}")
    if len(not_canceled_rules) > 0:
        print(f"   - Average confidence: {not_canceled_rules['confidence'].mean():.4f}")
        print(f"   - Average lift: {not_canceled_rules['lift'].mean():.4f}")
else:
    # NOTE(review): this branch is also reached when itemsets exist but no rule passed
    # the confidence threshold, so the "(no frequent itemsets found)" text can mislead.
    print(f"\n3. Association Rules:")
    print(f"   - No rules generated (no frequent itemsets found)")

# 5. Short glossary of the metrics used above
print(f"\n5. Key Metrics Explained:")
print(f"   - Support: Frequency of itemset in transactions")
print(f"   - Confidence: Probability of consequent given antecedent")
print(f"   - Lift: How much more likely consequent is with antecedent")
print(f"     (Lift > 1: Positive association, Lift < 1: Negative association)")

print("\n" + "=" * 80)
print("✓ Apriori algorithm analysis completed!")
print("=" * 80)


In [None]:
# This cell used to be a verbatim copy of the "APRIORI ALGORITHM SUMMARY" report
# printed by the preceding cell. It created no variables and only printed the same
# report a second time, so the duplicate code was removed; this cell can be deleted.


In [None]:
# Numeric columns that go through IQR outlier treatment.
# Excluded on purpose: the target (is_canceled), the year and the binary flag
# is_repeated_guest. A 0/1 flag has no meaningful IQR: when one value dominates,
# Q1 == Q3, both fences equal that value and capping would overwrite the whole
# column with a single constant.
numeric_cols = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
                'adults', 'children', 'previous_cancellations',
                'previous_bookings_not_canceled', 'booking_changes',
                'days_in_waiting_list', 'avg_daily_rate',
                'required_car_parking_spaces', 'total_of_special_requests']

print("Original dataset shape:", df.shape)
print(f"Number of numeric columns for outlier detection: {len(numeric_cols)}")
print(f"Columns: {', '.join(numeric_cols)}")


In [None]:
##### 3.1. IQR-Based Outlier Detection and Treatment


In [None]:
def detect_and_treat_outliers_iqr(df, columns, skip_zero_iqr=True):
    """Cap outliers column by column using the 1.5 * IQR rule (winsorization).

    For every column the fences ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` are
    computed and values outside them are clipped to the nearest fence. No rows
    are removed and the input frame is not modified. A per-column report is
    printed while the function runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; a copy is treated and returned.
    columns : iterable of str
        Names of the numeric columns to treat.
    skip_zero_iqr : bool, default True
        When a column has ``IQR == 0`` (binary flags, zero-inflated counts) both
        fences coincide with the quartile value, so capping would overwrite the
        whole column with one constant. With ``True`` such columns are left
        untouched and reported as skipped with 0 outliers. Pass ``False`` to
        cap them anyway (the previous behaviour).

    Returns
    -------
    (pandas.DataFrame, dict)
        The treated copy and a summary ``{column: {...}}`` with the keys
        ``outliers_before`` (values that were capped), ``outlier_percentage``,
        ``lower_bound``, ``upper_bound``, ``Q1``, ``Q3``, ``IQR`` and ``skipped``.
        For skipped columns the bounds are the computed fences that were NOT applied.
    """
    # Count-like features can never be negative, so their lower fence is floored at 0.
    non_negative_cols = {
        'adults', 'children', 'stays_in_weekend_nights', 'stays_in_week_nights',
        'previous_cancellations', 'previous_bookings_not_canceled',
        'booking_changes', 'required_car_parking_spaces',
        'total_of_special_requests', 'days_in_waiting_list',
    }

    df_cleaned = df.copy()
    summary = {}
    total_outliers = 0
    n_rows = len(df_cleaned)

    print("IQR-Based Outlier Detection and Treatment (Winsorization)")
    print("=" * 90)
    print(f"{'Column':<30} {'Outliers':<15} {'Percentage':<15} {'Action':<20}")
    print("-" * 90)

    for col in columns:
        # Quartiles and fences (1.5 * IQR is the conventional multiplier)
        Q1 = df_cleaned[col].quantile(0.25)
        Q3 = df_cleaned[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        if col in non_negative_cols:
            lower_bound = max(0, lower_bound)

        skipped = bool(skip_zero_iqr and IQR == 0)
        if skipped:
            # Degenerate fences: leave the column as it is instead of flattening it.
            outliers_before = 0
            action = "Skipped (IQR = 0)"
        else:
            outside = (df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)
            outliers_before = int(outside.sum())
            df_cleaned[col] = df_cleaned[col].clip(lower=lower_bound, upper=upper_bound)
            action = f"Capped to [{lower_bound:.1f}, {upper_bound:.1f}]"

        # Guard against an empty frame (no rows -> 0%).
        outlier_pct = (outliers_before / n_rows) * 100 if n_rows else 0.0
        total_outliers += outliers_before

        summary[col] = {
            'outliers_before': outliers_before,
            'outlier_percentage': outlier_pct,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
            'Q1': Q1,
            'Q3': Q3,
            'IQR': IQR,
            'skipped': skipped,
        }

        print(f"{col:<30} {outliers_before:<15} {outlier_pct:<14.2f}% {action:<20}")

    # The total counts capped VALUES across all columns, expressed relative to the
    # number of rows; a row with outliers in several columns is counted several times.
    total_pct = (total_outliers / n_rows) * 100 if n_rows else 0.0
    print("-" * 90)
    print(f"{'TOTAL OUTLIERS TREATED':<30} {total_outliers:<15} {total_pct:<14.2f}%")
    print("=" * 90)

    return df_cleaned, summary

# Run the IQR treatment on the selected columns; `df` itself stays unchanged here.
df_cleaned, outlier_summary = detect_and_treat_outliers_iqr(df, numeric_cols)

shape_before, shape_after = df.shape, df_cleaned.shape
print(f"\n✓ Outlier treatment completed!")
print(f"Dataset shape: {shape_before} → {shape_after} (no rows removed)")


In [None]:
##### 3.2. Visualization of Outliers (Before Treatment)


In [None]:
# Boxplots of the untreated data (`df`) for six key columns, each annotated with
# the outlier count recorded in `outlier_summary`.
key_cols = ['lead_time', 'avg_daily_rate', 'stays_in_weekend_nights',
            'stays_in_week_nights', 'days_in_waiting_list', 'previous_cancellations']

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for panel, col in zip(axes, key_cols):
    box = panel.boxplot(df[col], vert=True, patch_artist=True)
    box['boxes'][0].set_facecolor('lightblue')
    panel.set_title(f'Boxplot of {col}\n(Outliers shown as points)', fontsize=11, fontweight='bold')
    panel.set_ylabel('Value', fontsize=10)
    panel.grid(True, alpha=0.3)

    # Annotation box with count and share of outliers for this column
    col_stats = outlier_summary[col]
    note = f"Outliers: {col_stats['outliers_before']} ({col_stats['outlier_percentage']:.1f}%)"
    panel.text(0.5, 0.95, note,
               transform=panel.transAxes, ha='center', va='top',
               bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
               fontsize=9)

plt.suptitle('Outlier Visualization - Before Treatment', fontsize=14, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()


In [None]:
##### 3.3. Statistical Comparison: Before vs After Treatment


In [None]:
# Compare mean / std / median / max of the key columns before (`df`) and after
# (`df_cleaned`) the outlier treatment.
comparison_cols = ['lead_time', 'avg_daily_rate', 'stays_in_weekend_nights',
                   'stays_in_week_nights', 'days_in_waiting_list', 'previous_cancellations']


def _pct_change(before, after):
    """Relative change in percent; NaN when the baseline is 0 and the ratio is undefined."""
    if before == 0:
        return float('nan')
    return (after - before) / before * 100


print("\nStatistical Comparison: Before vs After Outlier Treatment")
print("=" * 110)
print(f"{'Column':<25} {'Metric':<15} {'Before':<15} {'After':<15} {'Change':<15} {'% Change':<15}")
print("-" * 110)

for col in comparison_cols:
    print(f"\n{col.upper()}")
    for label, stat in (('Mean', 'mean'), ('Std Dev', 'std'), ('Median', 'median'), ('Max', 'max')):
        before = getattr(df[col], stat)()
        after = getattr(df_cleaned[col], stat)()
        # Zero-inflated columns (e.g. a median of 0) previously caused a division by
        # zero in the percentage; _pct_change reports NaN for them instead.
        print(f"{'':<25} {label:<15} {before:<15.2f} {after:<15.2f} {after-before:<15.2f} {_pct_change(before, after):<14.2f}%")

print("=" * 110)


In [None]:
##### 3.4. Visualization: After Treatment


In [None]:
# Boxplots of the treated data (`df_cleaned`) for the same six key columns.
key_cols = ['lead_time', 'avg_daily_rate', 'stays_in_weekend_nights',
            'stays_in_week_nights', 'days_in_waiting_list', 'previous_cancellations']

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for panel, col in zip(axes, key_cols):
    box = panel.boxplot(df_cleaned[col], vert=True, patch_artist=True)
    box['boxes'][0].set_facecolor('lightgreen')
    panel.set_title(f'Boxplot of {col}\n(After Outlier Treatment)', fontsize=11, fontweight='bold')
    panel.set_ylabel('Value', fontsize=10)
    panel.grid(True, alpha=0.3)

    # Annotation box with the number of values recorded in the summary for this column
    col_stats = outlier_summary[col]
    note = f"Outliers treated: {col_stats['outliers_before']}"
    panel.text(0.5, 0.95, note,
               transform=panel.transAxes, ha='center', va='top',
               bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.5),
               fontsize=9)

plt.suptitle('Outlier Visualization - After Treatment (Winsorization)', fontsize=14, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()


In [None]:
##### 3.5. Update Dataset


In [None]:
# Replace the working frame `df` with a copy of the outlier-treated data.
# NOTE(review): rebinding `df` makes the earlier cells non-idempotent — after this
# cell has run, re-executing the "before vs after" cells compares treated data with
# treated data. A separate name for the treated frame would avoid that.
df = df_cleaned.copy()

print("✓ Dataset updated with outlier-treated data")
print(f"Final dataset shape: {df.shape}")
print(f"✓ All outliers have been treated using IQR Winsorization method")
print(f"✓ Data is now ready for model building")


In [None]:
##### Summary


In [None]:
# Final text summary of the outlier treatment.
# total_treated sums the per-column outlier counts stored in outlier_summary.
total_treated = sum([info['outliers_before'] for info in outlier_summary.values()])
# NOTE(review): this divides a count of VALUES (summed over all columns) by the number
# of ROWS, so a row with outliers in several columns is counted several times and the
# "% of dataset" figure can exceed 100.
total_pct = (total_treated / len(df)) * 100

print("\n" + "=" * 90)
print("OUTLIER DETECTION AND TREATMENT SUMMARY")
print("=" * 90)
print(f"Method Used: IQR (Interquartile Range) with Winsorization")
print(f"Total Outliers Treated: {total_treated:,} ({total_pct:.2f}% of dataset)")
print(f"Treatment Method: Capping (preserves all data points)")
print(f"Dataset Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print("\nWhy IQR Method?")
print("  ✓ Robust to outliers (uses quartiles, not mean/std)")
print("  ✓ Preserves all data (no rows removed)")
print("  ✓ Works well with skewed distributions")
print("  ✓ Industry standard for hotel/booking data")
print("  ✓ Prevents extreme values from skewing the model")
print("=" * 90)


In [None]:
##### Model Making

In [None]:
##### 1. Data Import

In [None]:
df.head()

In [None]:
##### 2. Data Pre-Processing

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
# NOTE(review): `fig` is not created in this section; it is whatever figure an earlier
# plotting cell last bound to that name (hidden state). This looks like a leftover from
# a removed chart — confirm and delete, or rebuild the intended figure here.
fig.show()

In [None]:
##### 3. Labeling the Data

In [None]:
# Numeric part of the data (kept unchanged for modelling).
df_num = df.select_dtypes(include='number')
df_num.head()

In [None]:
# String-typed (object) part of the data, to be label-encoded next.
df_cat = df.select_dtypes(include='object')
df_cat.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column, keeping one fitted encoder per column.
# The encoders are retained (instead of being thrown away) so the label -> code
# mapping can be inspected through `classes_` and re-applied to new data later.
# The encoded values are the same as fitting and transforming each column in one step.
label_encoders = {col: LabelEncoder().fit(df_cat[col]) for col in df_cat.columns}
df_cat = df_cat.apply(lambda column: label_encoders[column.name].transform(column))
df_cat.head(3)

In [None]:
# Reassemble the modelling frame: numeric columns first, then the encoded categoricals.
# Note that this rebinds `df`, so the selection cells above should not be re-run after it.
encoded_parts = [df_num, df_cat]
df = pd.concat(encoded_parts, axis='columns')
df.head()

In [None]:
##### 3. Data Partition

In [None]:
from sklearn.model_selection import train_test_split

# The cancellation flag is the target; every other column is a feature.
y = df['is_canceled']
X = df.drop(columns=['is_canceled'])

# 80% of the rows go to training; the seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

In [None]:
# (rows, columns) of the encoded frame, target column included.
df.shape

In [None]:
##### 4. Model Building (Random Forest)

In [None]:
##### Grid Search method

In [None]:
# Hyper-parameter grid explored by the grid search below.
paras = dict(
    min_samples_split=[50, 75, 150, 250],
    min_samples_leaf=[10, 30, 50, 70],
    max_depth=np.arange(2, 7, 2),  # depths 2, 4 and 6
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# 10-fold cross-validated search over `paras`, using a seeded decision tree
# as the estimator being tuned.
base_tree = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(estimator=base_tree, param_grid=paras, cv=10, verbose=1)
grid_search.fit(X_train, y_train)

In [None]:
# Show the decision tree configuration that the grid search selected.
grid_search.best_estimator_

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 25 depth-limited trees.
# NOTE(review): max_depth / min_samples_leaf / min_samples_split are hard-coded and
# presumably copied from the grid-search result above — confirm they still match it.
# `random_state` is fixed so the fitted forest, its feature importances, the plotted
# tree and the reported metrics are the same on every run.
Model = RandomForestClassifier(n_estimators=25,
                               criterion='gini',
                               max_depth=6,
                               min_samples_leaf=10,
                               min_samples_split=50,
                               max_features='sqrt',
                               random_state=42)
Model.fit(X_train, y_train)

In [None]:
##### Important Features

In [None]:
# Rank the features by the fitted forest's importance scores, largest first,
# and draw them as a horizontal bar chart.
imp = pd.Series(data=Model.feature_importances_, index=Model.feature_names_in_).sort_values(ascending=False)
plt.figure(figsize=(10, 10))
plt.title("Feature Importance / Selection")
ax = sns.barplot(y=imp.index, x=imp.values, palette='BrBG', orient='h')
# Label both axes so the figure can be read on its own.
ax.set(xlabel="Importance", ylabel="Feature")
plt.show()

In [None]:
##### Random Forest Visualization

In [None]:
from sklearn.tree import export_graphviz
import pydot

In [None]:
# Feature names, in the column order of the feature matrix `X`.
list(X.columns)

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Labels shown in the diagram.
# NOTE(review): assumes the target classes are ordered 0 = not canceled, 1 = canceled — confirm.
canceled = ['No', 'Yes']
feature_list = list(X.columns)

# A single member of the forest (index 10) is drawn as an example.
tree = Model.estimators_[10]

render_options = dict(
    feature_names=feature_list,
    class_names=canceled,
    filled=True,
    rounded=True,
    fontsize=8,
)

plt.figure(figsize=(25, 12))
plot_tree(tree, **render_options)

# Save a high-resolution copy before showing the figure.
plt.savefig("project_tree.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
##### 5. Prediction on Train dataset

In [None]:
# Training features with the true label placed alongside as the last column.
train = pd.concat([X_train, y_train], axis='columns')
train.head()

In [None]:
# Attach the model's predictions for the training rows next to the true labels.
train = train.assign(Predicted=Model.predict(X_train))
train.head()

In [None]:
##### 6. Model Performance

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the training data.
train_report = classification_report(y_true=train['is_canceled'], y_pred=train['Predicted'])
print(train_report)

In [None]:
##### 7. Predictions on Test dataset

In [None]:
# Held-out features with the true label placed alongside as the last column.
test = pd.concat([X_test, y_test], axis='columns')
test.head()

In [None]:
# Attach the model's predictions for the held-out rows next to the true labels.
test = test.assign(Predicted=Model.predict(X_test))
test.head()

In [None]:
##### 8. Model Performance

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out test data.
test_report = classification_report(y_true=test['is_canceled'], y_pred=test['Predicted'])
print(test_report)

In [None]:
import pickle

# Persist the fitted forest to disk. The `with` block guarantees the file handle
# is flushed and closed even if pickling fails part-way.
# NOTE(review): only the model is saved; the categorical encoding applied before
# training is not stored with it, so inference code needs the same mapping.
with open(r'build.pkl', 'wb') as model_file:
    pickle.dump(Model, model_file)