# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Apply seaborn's "whitegrid" theme to every figure drawn below.
# `set_theme` is the preferred interface; `sns.set` is only an alias that
# the seaborn documentation says may be removed in a future release.
sns.set_theme(style='whitegrid')

# Read the raw CSV into a DataFrame and report its (rows, columns) shape.
# NOTE: the bare expressions in this section (head, describe, the missing
# value series) only render output when run as the last statement of an
# interactive / notebook cell.
df = pd.read_csv('Data/data.csv')
print("Data shape:", df.shape)
df.head()

# Per-column dtypes and non-null counts
df.info()

# Descriptive statistics for all columns, transposed to one row per column
df.describe(include='all').transpose()

# Missing entries per column; keep only the affected columns, worst first
missing_vals = df.isna().sum()
missing_vals.loc[missing_vals.gt(0)].sort_values(ascending=False)

# Categorical (object-dtype) columns: for each one, print the relative
# frequency of every level and draw a horizontal count plot ordered from
# most to least frequent.
categorical_cols = df.select_dtypes(include='object').columns

# Upper bound on the number of bars drawn per plot. Object columns can be
# identifier-like (and TransactionStartTime is presumably still an unparsed
# string at this point), so they may hold thousands of distinct values;
# one bar per value is slow to render and impossible to read.
max_categories = 20

for col in categorical_cols:
    print(f"\nValue counts for {col}:")
    print(df[col].value_counts(normalize=True))

    # value_counts() sorts by descending frequency, so the head of its
    # index is exactly the set of most common levels.
    level_counts = df[col].value_counts()
    top_levels = level_counts.index[:max_categories]

    # Keep the original title for small columns; flag truncated plots so
    # the reader knows not every level is shown.
    title = f'Distribution of {col}'
    if len(level_counts) > max_categories:
        title += f' (top {max_categories} of {len(level_counts)})'

    plt.figure(figsize=(8, 4))
    # Levels absent from `order` are left out of the plot.
    sns.countplot(data=df, y=col, order=top_levels)
    plt.title(title)
    plt.xlabel('Count')
    plt.ylabel('')
    plt.show()
    
# Select numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# One figure per int64/float64 column: histogram with a KDE overlay on the
# left, boxplot of the same values on the right.
for col in numerical_cols:
    col_fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 4))

    # Left panel: histogram
    sns.histplot(df[col], kde=True, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {col}')

    # Right panel: boxplot
    sns.boxplot(x=df[col], ax=box_ax)
    box_ax.set_title(f'Boxplot of {col}')

    col_fig.tight_layout()
    plt.show()
    
# Correlation matrix
# Pairwise correlations between the selected numeric columns, rendered as
# a heatmap with each coefficient printed to two decimals.
corr_matrix = df.loc[:, numerical_cols].corr()

heat_fig, heat_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=heat_ax)
heat_ax.set_title('Feature Correlation Matrix')
plt.show()

# Parse the timestamp column so the .dt accessor can be used on it.
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

# New column name -> datetime component it is filled from. Insertion order
# is the order in which the columns are appended to df.
time_parts = {
    'transaction_hour': 'hour',
    'transaction_day': 'day',
    'transaction_month': 'month',
    'transaction_year': 'year',
}
for feature_name, dt_attr in time_parts.items():
    df[feature_name] = getattr(df['TransactionStartTime'].dt, dt_attr)

# Transaction volume count plots: first by hour of day, then by month.
# Each entry is (column to count, figure title, x-axis label).
for time_col, plot_title, x_label in (
    ('transaction_hour', 'Number of Transactions by Hour of Day', 'Hour'),
    ('transaction_month', 'Number of Transactions by Month', 'Month'),
):
    plt.figure(figsize=(10, 4))
    sns.countplot(data=df, x=time_col)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel('Count')
    plt.show()

# Reference point for recency: one day after the latest transaction in the
# data, so the most recently active customer gets a recency of 1 day.
snapshot_date = df['TransactionStartTime'].max() + pd.Timedelta(days=1)

# Per-customer RFM table:
#   Recency   - whole days between the customer's last transaction and
#               snapshot_date
#   Frequency - number of non-null TransactionId values
#   Monetary  - sum of Amount
rfm = df.groupby('CustomerId').agg(
    Recency=(
        'TransactionStartTime',
        lambda timestamps: (snapshot_date - timestamps.max()).days,
    ),
    Frequency=('TransactionId', 'count'),
    Monetary=('Amount', 'sum'),
)

# Side-by-side histograms (with KDE overlay) of the RFM columns, one
# subplot per column in rfm's column order.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

for position, metric in enumerate(rfm.columns):
    sns.histplot(rfm[metric], kde=True, ax=axes[position])

fig.suptitle('Distribution of RFM Metrics')
fig.tight_layout()
plt.show()



