# Data Exploration

**Author:** Nino Gagnidze
**Purpose:** Initial exploration and quality assessment of the Mall Customers dataset

## Objectives
- Load and inspect the raw dataset
- Understand data structure and types
- Identify data quality issues (missing values, duplicates, outliers)
- Generate initial statistical summaries

## 1. Setup and Data Loading

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide pandas display options: never truncate the column list and
# render floats with two decimal places
pd.options.display.max_columns = None
pd.options.display.precision = 2

In [None]:
# Read the raw CSV (path is relative to the notebook's directory) into `df`
data_path = '../data/raw/mall_customers.csv'
df = pd.read_csv(data_path)

# Confirm the load and report the (rows, columns) shape
print("Dataset loaded successfully!", f"Dataset shape: {df.shape}", sep="\n")

## 2. Initial Data Inspection

In [None]:
# Preview the top of the table: the first ten records by position
print("First 10 rows of the dataset:")
df.iloc[:10]

In [None]:
# Preview the bottom of the table: the final five records by position
print("Last 5 rows of the dataset:")
df.iloc[-5:]

In [None]:
# Get dataset information: print a heading, then call DataFrame.info() on `df`.
# info() writes its summary itself, so the call is not wrapped in print().
print("Dataset Information:")
df.info()

In [None]:
# List every column alongside the dtype pandas assigned to it
print("Column Names and Data Types:", df.dtypes, sep="\n")

## 3. Data Quality Assessment

In [None]:
# Tally nulls per column once, then express each tally as a percentage of rows
null_counts = df.isnull().sum()
null_percentages = (null_counts / len(df) * 100).round(2)

print("Missing Values Count:")
print(null_counts)
print("\nMissing Values Percentage:")
print(null_percentages)

In [None]:
# Count rows that exactly repeat an earlier row
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# When repeats exist, list every copy (keep=False includes first occurrences)
if duplicates:
    repeated_rows = df[df.duplicated(keep=False)]
    print("\nDuplicate rows:")
    print(repeated_rows)

In [None]:
# Count CustomerID values that repeat an earlier CustomerID
customer_ids = df['CustomerID']
duplicate_ids = customer_ids.duplicated().sum()
print(f"Number of duplicate CustomerIDs: {duplicate_ids}")

# When repeats exist, show all rows sharing an ID, grouped by sorting on the ID
if duplicate_ids:
    shared_id_rows = df[customer_ids.duplicated(keep=False)]
    print("\nDuplicate CustomerIDs:")
    print(shared_id_rows.sort_values('CustomerID'))

## 4. Statistical Summary

In [None]:
# Summarize the numerical features; the trailing bare expression lets the
# notebook render the resulting table
print("Descriptive Statistics:")
descriptive_stats = df.describe()
descriptive_stats

In [None]:
# Extra per-column statistics that describe() does not report
print("Additional Statistics:")
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Work from one numeric-only view and name the quartiles used for the IQR
numeric_frame = df[numerical_cols]
lower_quartile = numeric_frame.quantile(0.25)
upper_quartile = numeric_frame.quantile(0.75)

stats_df = pd.DataFrame({
    'Mean': numeric_frame.mean(),
    'Median': numeric_frame.median(),
    # mode() can return several rows when there are ties; take the first
    'Mode': numeric_frame.mode().iloc[0],
    'Std': numeric_frame.std(),
    'Variance': numeric_frame.var(),
    'Range': numeric_frame.max() - numeric_frame.min(),
    'IQR': upper_quartile - lower_quartile
})

stats_df

## 5. Categorical Features Analysis

In [None]:
# Gender breakdown: absolute counts first, then each category's share in percent
gender = df['Gender']
gender_counts = gender.value_counts()
gender_percentages = gender.value_counts(normalize=True) * 100

print("Gender Distribution:")
print(gender_counts)
print("\nGender Percentage:")
print(gender_percentages)

In [None]:
# List the distinct Gender labels to spot unexpected spellings or categories
print("Unique values in Gender column:", df['Gender'].unique(), sep="\n")

## 6. Outlier Detection (Initial Assessment)

In [None]:
# Function to detect outliers using IQR method
def detect_outliers_iqr(data, column, multiplier=1.5):
    """
    Detect outliers using the Interquartile Range (IQR) method.

    A value is counted as an outlier when it lies strictly below
    ``Q1 - multiplier * IQR`` or strictly above ``Q3 + multiplier * IQR``.
    Missing values are never counted as outliers, but they still count
    towards the row total used for the percentage.

    Parameters:
    -----------
    data : pd.DataFrame
        Input dataframe
    column : str
        Column name to check for outliers
    multiplier : float, optional
        Width of the fences in units of IQR (default 1.5, the conventional
        Tukey fence)

    Returns:
    --------
    dict
        Dictionary with keys 'column', 'Q1', 'Q3', 'IQR', 'lower_bound',
        'upper_bound', 'outlier_count' and 'outlier_percentage'. For an
        empty dataframe the quartiles and bounds are NaN, the count is 0
        and the percentage is 0.0.
    """
    values = data[column]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    # Count directly from the boolean mask instead of materialising a
    # filtered copy of the whole dataframe.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outlier_count = int(is_outlier.sum())

    # Guard against division by zero when the dataframe has no rows.
    total_rows = len(data)
    outlier_percentage = (outlier_count / total_rows) * 100 if total_rows else 0.0

    return {
        'column': column,
        'Q1': Q1,
        'Q3': Q3,
        'IQR': IQR,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'outlier_count': outlier_count,
        'outlier_percentage': outlier_percentage
    }

In [None]:
# Run the IQR outlier check on each numerical feature and print one summary
# section per feature
numerical_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

print("Outlier Detection Summary (IQR Method):")
print("=" * 80)

for col in numerical_features:
    outlier_info = detect_outliers_iqr(df, col)
    # Assemble the whole section first, then emit it with a single print
    section_lines = [
        f"\n{col}:",
        f"  Q1: {outlier_info['Q1']:.2f}",
        f"  Q3: {outlier_info['Q3']:.2f}",
        f"  IQR: {outlier_info['IQR']:.2f}",
        f"  Lower Bound: {outlier_info['lower_bound']:.2f}",
        f"  Upper Bound: {outlier_info['upper_bound']:.2f}",
        f"  Outlier Count: {outlier_info['outlier_count']}",
        f"  Outlier Percentage: {outlier_info['outlier_percentage']:.2f}%",
    ]
    print("\n".join(section_lines))

## 7. Data Range Validation

In [None]:
# Sanity-check each feature against the range it is expected to fall in
print("Data Range Validation:")
print("=" * 80)

# Age must be strictly positive
age_not_positive = df['Age'].le(0)
invalid_age = df[age_not_positive]
print(f"\nInvalid Age values (<=0): {len(invalid_age)}")

# Annual income must be strictly positive
income_not_positive = df['Annual Income (k$)'].le(0)
invalid_income = df[income_not_positive]
print(f"Invalid Annual Income values (<=0): {len(invalid_income)}")

# Spending score must lie in the closed interval 1..100
spending_score = df['Spending Score (1-100)']
score_out_of_range = spending_score.lt(1) | spending_score.gt(100)
invalid_spending = df[score_out_of_range]
print(f"Invalid Spending Score values (not in 1-100 range): {len(invalid_spending)}")

# Gender must be one of the two labels listed below
valid_genders = ['Male', 'Female']
gender_recognized = df['Gender'].isin(valid_genders)
invalid_gender = df[~gender_recognized]
print(f"Invalid Gender values: {len(invalid_gender)}")

## 8. Initial Insights Summary

Run all cells above and document your findings here:

### Data Quality Status:
- Missing Values: [To be filled after running cells]
- Duplicate Records: [To be filled after running cells]
- Data Types: [To be filled after running cells]

### Key Observations:
- Total Records: [To be filled after running cells]
- Features: [To be filled after running cells]
- Gender Distribution: [To be filled after running cells]

### Data Quality Issues Identified:
1. [To be filled after running cells]
2. [To be filled after running cells]

### Next Steps:
1. Address identified data quality issues in preprocessing notebook
2. Decide on outlier handling strategy
3. Plan feature engineering approach

## 9. Save Exploration Report

In [None]:
# Create a data quality report
quality_report = {
    'Total Records': len(df),
    'Total Features': len(df.columns),
    'Missing Values': df.isnull().sum().sum(),
    'Duplicate Rows': df.duplicated().sum(),
    'Duplicate CustomerIDs': df['CustomerID'].duplicated().sum(),
    'Gender Distribution': df['Gender'].value_counts().to_dict(),
    'Age Range': f"{df['Age'].min()} - {df['Age'].max()}",
    'Income Range': f"{df['Annual Income (k$)'].min()} - {df['Annual Income (k$)'].max()}",
    'Spending Score Range': f"{df['Spending Score (1-100)'].min()} - {df['Spending Score (1-100)'].max()}"
}

print("Data Quality Report:")
for key, value in quality_report.items():
    print(f"{key}: {value}")

In [None]:
# Save basic statistics to file for reference
from pathlib import Path

stats_output_path = '../reports/results/data_exploration_stats.txt'

# Create the output directory if it is missing so open() does not fail with
# FileNotFoundError on a fresh checkout.
Path(stats_output_path).parent.mkdir(parents=True, exist_ok=True)

# An explicit encoding keeps the report byte-identical across platforms
# instead of depending on the locale's default.
with open(stats_output_path, 'w', encoding='utf-8') as f:
    f.write("DATA EXPLORATION SUMMARY\n")
    f.write("=" * 80 + "\n\n")
    f.write(f"Dataset Shape: {df.shape}\n")
    f.write(f"Total Records: {len(df)}\n")
    f.write(f"Total Features: {len(df.columns)}\n\n")
    # str() each label so non-string column names cannot break the join
    f.write("Columns: " + ", ".join(map(str, df.columns)) + "\n\n")
    f.write("Data Types:\n")
    f.write(df.dtypes.to_string() + "\n\n")
    f.write("Missing Values:\n")
    f.write(df.isnull().sum().to_string() + "\n\n")
    f.write("Descriptive Statistics:\n")
    f.write(df.describe().to_string() + "\n")

print(f"Exploration summary saved to {stats_output_path}")