# In [None]:
# Load necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
from sklearn.impute import SimpleImputer

# Initialize logging
# Records at INFO level and above go to eda_log.log, each line carrying a
# timestamp, the level name and the message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='eda_log.log',
)
# Calling getLogger() without a name hands back the root logger.
logger = logging.getLogger()

# Load datasets
# Reads the train, test, store and sample-submission CSV files into
# DataFrames. A failure is logged with its traceback and then re-raised.
try:
    # StateHoliday is pinned to text: with the default low_memory=True pandas
    # infers dtypes chunk by chunk, so a column can end up holding the integer
    # 0 in some rows and the string '0' in others, which would split one
    # category into two groups in a later groupby.
    # NOTE(review): assumes StateHoliday holds '0' plus letter codes, as in
    # the public Rossmann data - confirm against the raw files.
    train = pd.read_csv(
        '../data/rossmann_store_sales/train.csv',
        dtype={'StateHoliday': str},
    )
    test = pd.read_csv(
        '../data/rossmann_store_sales/test.csv',
        dtype={'StateHoliday': str},
    )
    store = pd.read_csv('../data/rossmann_store_sales/store.csv')
    sample_submission = pd.read_csv('../data/rossmann_store_sales/sample_submission.csv')
    logger.info("Datasets loaded successfully.")
except Exception as e:
    # logger.exception writes the same ERROR message plus the traceback, so
    # the log file alone is enough to diagnose a failed load.
    logger.exception(f"Error loading datasets: {e}")
    raise

# Merge datasets
# Left-join the store table onto train and test by the 'Store' column, so
# every train/test row is kept even if no store record matches it.
try:
    train = pd.merge(train, store, how='left', on='Store')
    test = pd.merge(test, store, how='left', on='Store')
    logger.info("Train and test datasets merged with store dataset.")
except Exception as merge_error:
    # Log the failure, then let it propagate.
    logger.error(f"Error merging datasets: {merge_error}")
    raise

# Handle missing values
# Fills missing CompetitionDistance values with the mean of the training
# column, in both the train and the test frame.
try:
    imputer = SimpleImputer(strategy='mean')
    # Fit on train only: the fill value is learned from the training rows.
    train['CompetitionDistance'] = imputer.fit_transform(train[['CompetitionDistance']])
    # test received the same store columns in the merge, so it needs the same
    # treatment; transform() reuses the train mean instead of refitting on test.
    test['CompetitionDistance'] = imputer.transform(test[['CompetitionDistance']])
    logger.info("Missing values handled for CompetitionDistance.")
except Exception as e:
    # Same ERROR message as before, now with the traceback attached.
    logger.exception(f"Error handling missing values: {e}")
    raise

# Exploratory Data Analysis
## Q1: Distribution of Promotions
# Overlays the Promo distribution of the train and test frames on one axes.
try:
    # stat='probability' makes each histogram sum to 1, so the two frames are
    # comparable even when their row counts differ; raw counts would let the
    # larger frame dwarf the smaller one.
    # discrete=True gives both calls the same unit-wide, integer-centred bins
    # instead of two independently chosen automatic binnings.
    # NOTE(review): assumes Promo is an integer-coded flag - confirm.
    sns.histplot(
        train['Promo'], label='Train', kde=False, color='blue', alpha=0.6,
        stat='probability', discrete=True,
    )
    sns.histplot(
        test['Promo'], label='Test', kde=False, color='orange', alpha=0.6,
        stat='probability', discrete=True,
    )
    plt.ylabel('Proportion of rows')
    plt.legend()
    plt.title('Promo Distribution in Train and Test Sets')
    plt.show()
    logger.info("Plotted Promo distribution comparison.")
except Exception as e:
    # Best-effort section: the error is not re-raised, so keep the traceback
    # in the log; otherwise the cause of a missing plot is lost.
    logger.exception(f"Error in Promo distribution visualization: {e}")

## Q2: Sales Before, During, and After Holidays
# Bar chart and printout of mean Sales for each StateHoliday value.
try:
    holiday_sales = train.groupby('StateHoliday')['Sales'].agg('mean')
    holiday_sales.plot.bar(
        title='Average Sales by State Holiday',
        color='skyblue',
    )
    plt.ylabel('Average Sales')
    plt.show()
    print("Average Sales by State Holiday:\n", holiday_sales)
    logger.info("Analyzed sales before, during, and after holidays.")
except Exception as holiday_error:
    # Best-effort section: log the problem and carry on with the next one.
    logger.error(f"Error analyzing sales during holidays: {holiday_error}")

## Q3: Seasonal Purchase Behaviors
# Line chart and printout of mean Sales per calendar month.
try:
    # Adds a 'Month' column to train holding the month number parsed from 'Date'.
    train['Month'] = pd.to_datetime(train['Date']).dt.month
    monthly_sales = train.groupby('Month')['Sales'].agg('mean')
    monthly_sales.plot.line(title='Monthly Sales Trends')
    plt.ylabel('Average Sales')
    plt.show()
    print("Average Monthly Sales:\n", monthly_sales)
    logger.info("Examined seasonal purchase behaviors.")
except Exception as seasonal_error:
    # Best-effort section: log the problem and carry on with the next one.
    logger.error(f"Error examining seasonal trends: {seasonal_error}")

## Q4: Correlation Between Sales and Customers
# Annotated heatmap and printout of the Sales/Customers correlation matrix.
try:
    correlation = train[['Sales', 'Customers']].corr()
    sns.heatmap(
        correlation,
        square=True,
        cmap='coolwarm',
        annot=True,
    )
    plt.title('Correlation Between Sales and Customers')
    plt.show()
    print("Correlation Between Sales and Customers:\n", correlation)
    logger.info("Checked correlation between sales and customers.")
except Exception as corr_error:
    # Best-effort section: log the problem and carry on with the next one.
    logger.error(f"Error in correlation analysis: {corr_error}")

## Q5: Effectiveness of Promotions
# Bar chart and printout of mean Sales for each Promo value.
try:
    promo_sales = train.groupby('Promo')['Sales'].agg('mean')
    promo_sales.plot.bar(
        title='Effect of Promotions on Sales',
        color=['gray', 'green'],
    )
    plt.ylabel('Average Sales')
    plt.show()
    print("Effect of Promotions on Sales:\n", promo_sales)
    logger.info("Explored the effect of promotions on sales.")
except Exception as promo_error:
    # Best-effort section: log the problem and carry on with the next one.
    logger.error(f"Error analyzing promo effectiveness: {promo_error}")

## Q6: Store Characteristics (Assortment, Competitor Distance)
# Two views: mean Sales per Assortment value, then Sales plotted against
# CompetitionDistance with summary statistics of both columns.
try:
    # Part 1: assortment
    assortment_sales = train.groupby('Assortment')['Sales'].agg('mean')
    assortment_sales.plot.bar(
        color='orange',
        title='Average Sales by Assortment Type',
    )
    plt.ylabel('Average Sales')
    plt.show()
    print("Average Sales by Assortment Type:\n", assortment_sales)

    # Part 2: competition distance
    sns.scatterplot(
        data=train,
        y='Sales',
        x='CompetitionDistance',
        alpha=0.6,
    )
    plt.title('Sales vs. Competition Distance')
    plt.show()
    print(
        "Effect of Competition Distance on Sales:\n",
        train[['CompetitionDistance', 'Sales']].describe(),
    )
    logger.info("Checked the effect of competitor distance on sales.")
except Exception as store_error:
    # Best-effort section: log the problem and carry on.
    logger.error(f"Error analyzing store characteristics: {store_error}")

# Conclusion
# Final marker in the log once every section above has been attempted.
logger.info("Exploratory Data Analysis completed.")
