# 📊 Exploratory Data Analysis (EDA)

This notebook explores trends, correlations, and insights in the Airbnb New User Bookings dataset. Through a series of visualizations, we'll examine how user attributes relate to booking destinations.

## Objectives
- Visualize key relationships between features and the target variable
- Identify patterns in user behavior and demographics
- Discover potentially useful features for prediction
- Gain insights to guide our modeling approach

## Import Required Libraries

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Others
import warnings
from pathlib import Path

# Settings
# Suppress every warning category for the rest of the session so library
# notices do not clutter cell output.
# NOTE(review): this blanket filter also hides warnings that may signal real
# problems (e.g. deprecations); consider narrowing it.
warnings.filterwarnings('ignore')
# Global plotting defaults: style sheet, default figure size, seaborn palette.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (12, 8)
sns.set_palette('viridis')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Load Preprocessed Data

We'll load the preprocessed data produced in the previous step. That step should already have handled missing values, encoded categorical variables, and created new features.

In [None]:
# Location of the processed data written by the previous step
data_path = Path('../data/processed')
train_csv = data_path / 'preprocessed_train_users.csv'

# Read the preprocessed training users into a DataFrame
df = pd.read_csv(train_csv)

# Report the frame's size, then preview its first rows
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Inspect column dtypes and non-null counts (df.info() prints them, along
# with the frame's memory usage).
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
df.describe()

## Target Variable Analysis

Let's first look at the distribution of our target variable: `country_destination`.

In [None]:
# Tally users per destination and express each tally as a share of the total
dest_counts = df['country_destination'].value_counts().reset_index()
dest_counts.columns = ['Country', 'Count']
total_users = dest_counts['Count'].sum()
dest_counts['Percentage'] = 100 * dest_counts['Count'] / total_users

# Bar chart: one bar per destination, annotated with its percentage share
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=dest_counts, x='Country', y='Count', palette='viridis', ax=ax)
ax.set_title('Distribution of Country Destinations', fontsize=16)
ax.set_xlabel('Country', fontsize=14)
ax.set_ylabel('Number of Users', fontsize=14)
plt.xticks(rotation=45)
bar_values = zip(dest_counts['Count'], dest_counts['Percentage'])
for pos, (n_users, pct) in enumerate(bar_values):
    # Place the label slightly above the top of the bar
    ax.text(pos, n_users + 500, f'{pct:.1f}%', ha='center', fontsize=10)
fig.tight_layout()
plt.show()

# Pie chart of the same counts, with every wedge pulled out slightly
fig, ax = plt.subplots(figsize=(10, 10))
wedge_offsets = [0.05] * len(dest_counts)
ax.pie(dest_counts['Count'], labels=dest_counts['Country'], autopct='%1.1f%%',
       startangle=90, shadow=True, explode=wedge_offsets)
ax.set_title('Proportion of Users by Destination Country', fontsize=16)
ax.axis('equal')
fig.tight_layout()
plt.show()

## Demographic Analysis

### Age Distribution by Destination

In [None]:
# Two views of age by destination: a violin plot, then a box plot.
# Each entry is (seaborn plotter, figure title, plot-specific keyword arguments).
age_plots = [
    (sns.violinplot, 'Age Distribution by Destination Country',
     {'inner': 'quartile', 'scale': 'width'}),
    (sns.boxplot, 'Age Distribution by Destination Country (Box Plot)', {}),
]

for draw, title, extra_kwargs in age_plots:
    fig, ax = plt.subplots(figsize=(14, 8))
    draw(x='country_destination', y='age', data=df, palette='viridis',
         ax=ax, **extra_kwargs)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Country', fontsize=14)
    ax.set_ylabel('Age', fontsize=14)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

In [None]:
# Age group analysis
# Bins are left-closed ([lo, hi)) because right=False. The last edge is
# np.inf rather than 100 so that the '65+' bucket really is open-ended:
# with an upper edge of 100, any age >= 100 would fall outside every bin,
# become NaN, and silently drop out of the crosstab below.
bins = [0, 25, 35, 45, 55, 65, np.inf]
labels = ['<25', '25-34', '35-44', '45-54', '55-64', '65+']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# Share of each age group within every destination (each column sums to 100)
age_dest = pd.crosstab(df['age_group'], df['country_destination'], normalize='columns') * 100

# Plot heatmap
plt.figure(figsize=(14, 8))
sns.heatmap(age_dest, annot=True, cmap='viridis', fmt='.1f')
plt.title('Age Group Distribution (%) by Destination Country', fontsize=16)
plt.tight_layout()
plt.show()

### Gender Analysis

In [None]:
# Cross-tabulate gender against destination: raw counts, and the percentage
# split of genders within each destination (columns sum to 100)
gender_col, dest_col = df['gender'], df['country_destination']
gender_dest = pd.crosstab(gender_col, dest_col)
gender_dest_pct = pd.crosstab(gender_col, dest_col, normalize='columns') * 100

# Stacked bars: one bar per gender, segments are destinations
ax = gender_dest.plot(kind='bar', stacked=True, figsize=(14, 8), colormap='viridis')
ax.set_title('Gender Distribution by Destination Country', fontsize=16)
ax.set_xlabel('Gender', fontsize=14)
ax.set_ylabel('Number of Users', fontsize=14)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Heatmap of the percentage table
fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(gender_dest_pct, annot=True, cmap='viridis', fmt='.1f', ax=ax)
ax.set_title('Gender Distribution (%) by Destination Country', fontsize=16)
fig.tight_layout()
plt.show()

## Marketing Channel Analysis

In [None]:
# Ten most common values of first_affiliate_tracked
top_affiliates = df['first_affiliate_tracked'].value_counts().nlargest(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_affiliates.index, y=top_affiliates.values, palette='viridis', ax=ax)
ax.set_title('Top 10 First Affiliate Tracked', fontsize=16)
ax.set_xlabel('Affiliate', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# Destination mix (%) for each affiliate value; every row sums to 100
affiliate_dest = 100 * pd.crosstab(
    df['first_affiliate_tracked'],
    df['country_destination'],
    normalize='index',
)

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(affiliate_dest, annot=True, cmap='viridis', fmt='.1f', ax=ax)
ax.set_title('Destination Distribution (%) by First Affiliate Tracked', fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
# Ten most frequent signup flows
signup_flows = df['signup_flow'].value_counts().nlargest(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=signup_flows.index, y=signup_flows.values, palette='viridis', ax=ax)
ax.set_title('Top 10 Signup Flows', fontsize=16)
ax.set_xlabel('Signup Flow', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
fig.tight_layout()
plt.show()

# Signup flow against destination: raw counts and row-wise percentages
flow_col, dest_col = df['signup_flow'], df['country_destination']
flow_dest = pd.crosstab(flow_col, dest_col)
flow_dest_pct = pd.crosstab(flow_col, dest_col, normalize='index') * 100

# Keep only the six most frequent flows so the heatmap stays readable
top_flows = signup_flows.nlargest(6).index
flow_dest_pct_filtered = flow_dest_pct.loc[top_flows]

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(flow_dest_pct_filtered, annot=True, cmap='viridis', fmt='.1f', ax=ax)
ax.set_title('Destination Distribution (%) by Top Signup Flows', fontsize=16)
fig.tight_layout()
plt.show()

## Device and Browser Analysis

In [None]:
# How often each first_device_type occurs
device_counts = df['first_device_type'].value_counts()

# Pie chart of device shares; only the first three wedges are pulled out
n_devices = len(device_counts)
wedge_offsets = [0.05 if pos < 3 else 0 for pos in range(n_devices)]

fig, ax = plt.subplots(figsize=(12, 8))
ax.pie(device_counts, labels=device_counts.index, autopct='%1.1f%%',
       startangle=90, shadow=True, explode=wedge_offsets)
ax.set_title('Distribution of First Device Type', fontsize=16)
ax.axis('equal')
fig.tight_layout()
plt.show()

# Device type against destination, restricted to the six most common devices
top_devices = device_counts.nlargest(6).index
device_col, dest_col = df['first_device_type'], df['country_destination']
device_dest = pd.crosstab(device_col, dest_col)
device_dest_pct = pd.crosstab(device_col, dest_col, normalize='index') * 100
device_dest_pct_filtered = device_dest_pct.loc[top_devices]

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(device_dest_pct_filtered, annot=True, cmap='viridis', fmt='.1f', ax=ax)
ax.set_title('Destination Distribution (%) by First Device Type', fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
# Ten most common first browsers
browser_counts = df['first_browser'].value_counts().nlargest(10)

fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(x=browser_counts.index, y=browser_counts.values, palette='viridis', ax=ax)
ax.set_title('Top 10 First Browsers', fontsize=16)
ax.set_xlabel('Browser', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# Destination mix (%) per browser, restricted to the six most common browsers
top_browsers = browser_counts.nlargest(6).index
browser_dest_pct = 100 * pd.crosstab(
    df['first_browser'],
    df['country_destination'],
    normalize='index',
)
browser_dest_pct_filtered = browser_dest_pct.loc[top_browsers]

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(browser_dest_pct_filtered, annot=True, cmap='viridis', fmt='.1f', ax=ax)
ax.set_title('Destination Distribution (%) by Top Browsers', fontsize=16)
fig.tight_layout()
plt.show()

## User Behavior Analysis

Let's analyze some of the engineered features related to user behavior.

In [None]:
# Look for the engineered behavior features; if none are present, derive
# basic versions from the raw date columns (when those are available).
behavior_features = ['days_before_signup', 'signup_day_of_week', 'signup_weekend']
existing_behavior_features = [col for col in behavior_features if col in df.columns]
raw_date_cols = ('date_account_created', 'timestamp_first_active')

if existing_behavior_features:
    print(f"Found engineered behavior features: {existing_behavior_features}")
else:
    print("No engineered behavior features found. Creating basic ones for analysis.")

    if all(col in df.columns for col in raw_date_cols):
        # Parse both raw columns as datetimes (first-active is a packed
        # YYYYMMDDHHMMSS value) and store them back on the frame
        df['date_account_created'] = pd.to_datetime(df['date_account_created'])
        df['timestamp_first_active'] = pd.to_datetime(
            df['timestamp_first_active'], format='%Y%m%d%H%M%S'
        )

        # Whole days from first activity to account creation
        signup_lag = df['date_account_created'] - df['timestamp_first_active']
        df['days_before_signup'] = signup_lag.dt.days

        # Day of week (Monday=0 ... Sunday=6) and a 0/1 weekend flag
        df['signup_day_of_week'] = df['date_account_created'].dt.dayofweek
        df['signup_weekend'] = df['signup_day_of_week'].isin([5, 6]).astype(int)

In [None]:
# Distribution of the gap between first activity and signup, overall and
# broken down by destination (skipped when the feature is missing)
if 'days_before_signup' in df.columns:
    fig, ax = plt.subplots(figsize=(14, 6))
    sns.histplot(df['days_before_signup'], bins=50, kde=True, ax=ax)
    ax.set_title('Distribution of Days Between First Activity and Signup', fontsize=16)
    ax.set_xlabel('Days', fontsize=14)
    ax.set_ylabel('Count', fontsize=14)
    ax.set_xlim(-10, 30)  # zoom in on a reasonable range
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

    # Same feature, one box per destination
    fig, ax = plt.subplots(figsize=(14, 6))
    sns.boxplot(x='country_destination', y='days_before_signup', data=df, ax=ax)
    ax.set_title('Days Before Signup by Destination', fontsize=16)
    ax.set_xlabel('Country', fontsize=14)
    ax.set_ylabel('Days', fontsize=14)
    ax.set_ylim(-5, 15)  # zoom in on a reasonable range
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

In [None]:
# Analyze signup day of week
if 'signup_day_of_week' in df.columns:
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    day_counts = df['signup_day_of_week'].value_counts().sort_index()
    # int() keeps the lookup working if the column was read back as float
    # (e.g. 2.0), which cannot be used directly as a list index.
    day_names = [days[int(i)] for i in day_counts.index]

    plt.figure(figsize=(12, 6))
    sns.barplot(x=day_names, y=day_counts.values, palette='viridis')
    plt.title('Distribution of Signup Day of Week', fontsize=16)
    plt.xlabel('Day', fontsize=14)
    plt.ylabel('Count', fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Weekend vs weekday booking destinations
    if 'signup_weekend' in df.columns:
        # Rename rows by label, not by position: assigning a two-item list to
        # .index raises if only one of the two classes is present and depends
        # on the rows being ordered 0 then 1.
        weekend_labels = {0: 'Weekday', 1: 'Weekend'}
        weekend_dest = pd.crosstab(
            df['signup_weekend'], df['country_destination']
        ).rename(index=weekend_labels)
        weekend_dest_pct = (
            pd.crosstab(df['signup_weekend'], df['country_destination'], normalize='index') * 100
        ).rename(index=weekend_labels)

        plt.figure(figsize=(14, 6))
        sns.heatmap(weekend_dest_pct, annot=True, cmap='viridis', fmt='.1f')
        plt.title('Destination Distribution (%) by Weekend vs Weekday Signup', fontsize=16)
        plt.tight_layout()
        plt.show()

## Correlation Analysis

Let's examine correlations between numerical features and look for multicollinearity.

In [None]:
# Check which columns are numerical
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(f"Numerical columns: {numerical_cols}")

In [None]:
# Pairwise correlations between the numerical columns
correlation_matrix = df[numerical_cols].corr()

# Hide the redundant upper triangle (including the diagonal).
# The mask must be boolean: np.triu(correlation_matrix) would return the
# correlation *values*, so an upper-triangle cell whose correlation is exactly
# 0 would evaluate to False and be drawn instead of hidden.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Plot correlation heatmap
plt.figure(figsize=(16, 12))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', mask=mask)
plt.title('Correlation Matrix of Numerical Features', fontsize=16)
plt.tight_layout()
plt.show()

## Dimensionality Reduction and Feature Visualization

Let's use PCA and t-SNE to visualize the high-dimensional data.

In [None]:
# Build the feature matrix for dimensionality reduction from the numerical
# columns, leaving out identifier and target columns
excluded_cols = ('id', 'country_destination')
feature_cols = [col for col in numerical_cols if col not in excluded_cols]

# At least two features are needed for a 2D projection
if len(feature_cols) >= 2:
    from sklearn.preprocessing import StandardScaler

    # Missing values are replaced with 0 for this visualization, then every
    # feature is standardized (important for PCA and t-SNE)
    X = df[feature_cols].fillna(0)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    print(f"Performing dimensionality reduction with {X_scaled.shape[1]} features")
else:
    print("Not enough numerical features for dimensionality reduction.")

In [None]:
# Project the standardized features onto their first two principal components
# (runs only if the preparation cell produced X_scaled)
if 'X_scaled' in locals() and X_scaled.shape[1] >= 2:
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(X_scaled)

    # Tabulate the projection alongside the destination label for plotting
    pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'])
    pca_df['country_destination'] = df['country_destination'].values

    # Scatter plot coloured by destination; axis labels carry the variance share
    pc1_share, pc2_share = pca.explained_variance_ratio_[0], pca.explained_variance_ratio_[1]
    fig, ax = plt.subplots(figsize=(14, 10))
    sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='country_destination',
                    palette='viridis', s=50, alpha=0.7, ax=ax)
    ax.set_title('PCA: 2D Projection of User Features by Destination', fontsize=16)
    ax.set_xlabel(f'Principal Component 1 ({pc1_share:.2%} variance)', fontsize=14)
    ax.set_ylabel(f'Principal Component 2 ({pc2_share:.2%} variance)', fontsize=14)
    ax.grid(True, alpha=0.3)
    ax.legend(title='Destination', bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    plt.show()

    # Combined variance captured by the two components
    print(f"Total variance explained by 2 principal components: {sum(pca.explained_variance_ratio_):.2%}")

In [None]:
# t-SNE (can be slow on large datasets)
if 'X_scaled' in locals() and X_scaled.shape[1] >= 2:
    # Work on at most 5000 rows. The subsample is drawn from a seeded
    # generator so the plot is reproducible across runs; seeding only TSNE's
    # random_state is not enough when the input rows themselves change.
    sample_size = min(5000, X_scaled.shape[0])
    rng = np.random.default_rng(42)
    indices = rng.choice(X_scaled.shape[0], size=sample_size, replace=False)
    X_sample = X_scaled[indices]
    y_sample = df['country_destination'].iloc[indices]

    print(f"Applying t-SNE on a sample of {sample_size} observations...")

    # Newer scikit-learn releases renamed TSNE's `n_iter` argument to
    # `max_iter` (and later dropped the old name). Choose whichever keyword
    # the installed version exposes so the cell runs on both.
    iter_kwarg = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'

    # Apply t-SNE
    tsne = TSNE(n_components=2, perplexity=30, random_state=42, **{iter_kwarg: 300})
    tsne_result = tsne.fit_transform(X_sample)

    # Create a DataFrame for easier plotting
    tsne_df = pd.DataFrame(data=tsne_result, columns=['TSNE1', 'TSNE2'])
    tsne_df['country_destination'] = y_sample.values

    # Plot t-SNE results
    plt.figure(figsize=(14, 10))
    sns.scatterplot(x='TSNE1', y='TSNE2', hue='country_destination', data=tsne_df,
                    palette='viridis', s=50, alpha=0.7)
    plt.title('t-SNE: 2D Projection of User Features by Destination', fontsize=16)
    plt.xlabel('t-SNE Component 1', fontsize=14)
    plt.ylabel('t-SNE Component 2', fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.legend(title='Destination', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()