# 📋 Data Overview - Initial Exploration

This notebook provides the initial data loading and basic exploration of the leads dataset.

## 🎯 Objectives
- Load raw data and data dictionary
- Understand dataset structure and dimensions
- Identify data types and missing values
- Get initial value counts for all columns

In [1]:
# Import required libraries
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), '..'))
from src.data.data_loader import load_raw_data, basic_data_info, save_processed_data

## 📊 Load Data

In [2]:
# Load raw data and data dictionary
df_raw = load_raw_data(file_path='../../data/raw/raw_data.csv')
df_dict = load_raw_data(file_path='../../data/raw/raw_data_description.csv')

2025-08-07 09:52:27,723 - src.data.data_loader - INFO - Successfully loaded 48665 rows from ../../data/raw/raw_data.csv
2025-08-07 09:52:27,726 - src.data.data_loader - INFO - Successfully loaded 41 rows from ../../data/raw/raw_data_description.csv
2025-08-07 09:52:27,726 - src.data.data_loader - INFO - Successfully loaded 41 rows from ../../data/raw/raw_data_description.csv


## 📖 Data Dictionary

In [3]:
# Display data dictionary information
print("📖 Data Dictionary Structure:")
basic_data_info(df_dict)
print("\n📋 Data Dictionary Content:")
display(df_dict)

📖 Data Dictionary Structure:

📋 Data Dictionary Content:


Unnamed: 0,attribute,description
0,cd_advertise,ad ID
1,cd_client,advertiser ID
2,cd_type_individual,advertiser type: Individual=1 Business=2
3,priority,ad priority (1=high 2=medium 3=low)
4,leads,total number of proposals received
5,views,number of ad views
6,phone_clicks,number of phone clicks
7,cd_vehicle_brand,vehicle brand code
8,cd_model_vehicle,vehicle model code
9,cd_version_vehicle,vehicle version code


## 🔍 Raw Data Overview

In [4]:
# Basic information about the raw dataset
print("📊 Raw Data Basic Information:")
basic_info = basic_data_info(df_raw)
print(f"\n📐 Dataset Shape: {basic_info['shape']}")
print(f"📋 Number of Columns: {len(basic_info['columns'])}")
print(f"📏 Memory Usage: {basic_info['memory_usage']:,} bytes")

📊 Raw Data Basic Information:

📐 Dataset Shape: (48665, 41)
📋 Number of Columns: 41
📏 Memory Usage: 48,606,640 bytes


In [5]:
# Detailed info about the dataset
print("🔍 Detailed Dataset Information:")
df_raw.info()

🔍 Detailed Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48665 entries, 0 to 48664
Data columns (total 41 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   cd_advertise                         48665 non-null  int64  
 1   cd_client                            48665 non-null  int64  
 2   cd_type_individual                   48665 non-null  int64  
 3   priority                             48665 non-null  int64  
 4   leads                                48665 non-null  int64  
 5   views                                48665 non-null  int64  
 6   phone_clicks                         48665 non-null  int64  
 7   cd_vehicle_brand                     48665 non-null  int64  
 8   cd_model_vehicle                     48665 non-null  int64  
 9   cd_version_vehicle                   48665 non-null  int64  
 10  year_model                           48665 non-null  int64  
 

In [None]:
# Statistical summary for numerical columns
print("📈 Statistical Summary:")
display(df_raw.describe())

In [6]:
df_raw[["vl_advertise", "km_vehicle"]].describe()

Unnamed: 0,vl_advertise,km_vehicle
count,48665.0,48665.0
mean,109279.5,379000.7
std,2866381.0,20127400.0
min,0.0,0.0
25%,30490.0,29000.0
50%,41000.0,57000.0
75%,64888.0,90000.0
max,405000000.0,2147484000.0


## 🕳️ Missing Values Analysis

In [None]:
# Check percentage of missing values in each column
missing_pct = round(100*(df_raw.isnull().sum()/len(df_raw.index)), 2)
missing_df = pd.DataFrame({
    'Column': missing_pct.index,
    'Missing_Percentage': missing_pct.values
}).sort_values('Missing_Percentage', ascending=False)

print("🕳️ Missing Values Analysis:")
print(f"Columns with missing values: {(missing_df['Missing_Percentage'] > 0).sum()}")
display(missing_df[missing_df['Missing_Percentage'] > 0])

# Visualize missing values
if (missing_df['Missing_Percentage'] > 0).sum() > 0:
    plt.figure(figsize=(10, 6))
    missing_data = missing_df[missing_df['Missing_Percentage'] > 0]
    sns.barplot(data=missing_data, x='Missing_Percentage', y='Column')
    plt.title('Missing Values by Column (%)')
    plt.xlabel('Missing Percentage')
    plt.tight_layout()
    plt.show()
else:
    print("✅ No missing values found in the dataset!")

## 📊 Column Types Analysis

In [None]:
# Analyze column types
column_types = df_raw.dtypes.value_counts()
print("📊 Column Types Distribution:")
display(column_types)

# Categorize columns by naming patterns
column_categories = {
    'Flag Columns (flg_)': [col for col in df_raw.columns if col.startswith('flg_')],
    'Code Columns (cd_)': [col for col in df_raw.columns if col.startswith('cd_')],
    'Value Columns (vl_)': [col for col in df_raw.columns if col.startswith('vl_')],
    'Number Columns (n_)': [col for col in df_raw.columns if col.startswith('n_')],
    'Other Columns': [col for col in df_raw.columns if not any(col.startswith(prefix) for prefix in ['flg_', 'cd_', 'vl_', 'n_'])]
}

print("\n🏷️ Column Categories by Naming Pattern:")
for category, columns in column_categories.items():
    if columns:
        print(f"\n{category} ({len(columns)} columns):")
        for col in columns[:10]:  # Show first 10
            print(f"  - {col}")
        if len(columns) > 10:
            print(f"  ... and {len(columns) - 10} more")

## 🎯 Target Variable Overview

In [None]:
# Focus on the target variable 'leads'
print("🎯 Target Variable Analysis - 'leads':")
print(f"Data type: {df_raw['leads'].dtype}")
print(f"Range: {df_raw['leads'].min()} to {df_raw['leads'].max()}")
print(f"Mean: {df_raw['leads'].mean():.2f}")
print(f"Median: {df_raw['leads'].median():.2f}")
print(f"Zero leads: {(df_raw['leads'] == 0).sum()} ({(df_raw['leads'] == 0).mean()*100:.1f}%)")

# Target distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Histogram
df_raw['leads'].hist(bins=50, ax=ax1)
ax1.set_title('Distribution of Leads')
ax1.set_xlabel('Number of Leads')
ax1.set_ylabel('Frequency')

# Box plot
sns.boxplot(x=df_raw['leads'], ax=ax2)
ax2.set_title('Leads Distribution (Box Plot)')
ax2.set_xlabel('Number of Leads')

plt.tight_layout()
plt.show()

# Top lead values
print("\n📊 Top 10 Most Common Lead Values:")
lead_counts = df_raw['leads'].value_counts().head(10)
for value, count in lead_counts.items():
    pct = (count / len(df_raw)) * 100
    print(f"  {value} leads: {count:,} ads ({pct:.1f}%)")

## 📋 Sample Data Preview

In [None]:
# Display first few rows
print("📋 First 5 rows of the dataset:")
display(df_raw.head())

print("\n📋 Random 5 rows sample:")
display(df_raw.sample(5, random_state=42))

In [None]:
df_raw.columns

## 🔍 Quick Value Counts Preview

In [None]:
# Quick preview of unique values for categorical-looking columns
preview_columns = ['flg_air_conditioning', 'transmission_type', 'fuel_type', 'priority', 'cd_type_individual']

print("🔍 Quick Value Counts Preview for Key Categorical Columns:")
for col in preview_columns:
    if col in df_raw.columns:
        print(f"\n📊 {col}:")
        value_counts = df_raw[col].value_counts()
        total = len(df_raw)
        for value, count in value_counts.head(5).items():
            pct = (count / total) * 100
            print(f"  {value}: {count:,} ({pct:.1f}%)")
        if len(value_counts) > 5:
            print(f"  ... and {len(value_counts) - 5} more unique values")