In [1]:
import pandas as pd
import os

In [15]:
# Raw input CSVs, looked up under ../data/raw/ by the cells below.
# Order matters: the data-dictionary cell pairs each entry positionally
# with its description and source.
files = [
    # Market sizing
    "market_size_quickcom_india.csv",
    "ecom_overall_india.csv",
    # Competitor and company financials
    "competitors_quickcom_india.csv",
    "swiggy_dunzo_financials.csv",
    # Customer behaviour datasets
    "E-commerce Customer Behavior - Sheet1.csv",
    "Augmented_IndiaTransactMultiFacet2024.csv",
    "Customer_Behaviour_Survey_responses.csv",
    # Search-interest timeline
    "multiTimeline.csv",
]

In [5]:
# Create the output folder for cleaned CSVs up front; exist_ok makes this
# cell safe to re-run when the folder is already there.
os.makedirs(name='../data/clean', exist_ok=True)

In [16]:
# Multiplier from crore INR to million USD. 1 crore = 10 million INR, and the
# exchange rate implied by the original 0.012 factor is ~0.012 USD per INR,
# so crore INR -> million USD is 10 * 0.012 = 0.12.
# NOTE(review): confirm the exchange rate / reference date you want to use.
CRORE_INR_TO_USD_MILLION = 0.12


def _strip_number_formatting(value):
    """Remove thousands separators and the rupee sign from a string cell.

    Non-string cells (numbers, NaN) are returned unchanged so they are not
    lost when a column mixes strings and numbers.
    """
    if isinstance(value, str):
        return value.replace(',', '').replace('₹', '').strip()
    return value


def _numeric_if_fully_parseable(column):
    """Return `column` as numbers when every non-null cell parses, else unchanged.

    Coercing unconditionally would turn genuinely textual columns (company
    names, sources, survey answers) into all-NaN columns, so the conversion is
    only accepted when it introduces no new missing values.
    """
    parsed = pd.to_numeric(column.map(_strip_number_formatting), errors='coerce')
    if parsed.notna().sum() == column.notna().sum():
        return parsed
    return column


def _clean_frame(raw_df):
    """Return a cleaned copy of `raw_df`; the input frame is not modified.

    Steps: parse a 'date' column as datetimes and a 'year' column as numbers
    (unparseable values become NaT/NaN), convert number-like text columns to
    numeric, and add USD-million columns derived from crore-INR columns.
    """
    cleaned = raw_df.copy()
    # General cleaning: handle dates and numeric columns
    if 'date' in cleaned.columns:
        cleaned['date'] = pd.to_datetime(cleaned['date'], errors='coerce')
    if 'year' in cleaned.columns:
        cleaned['year'] = pd.to_numeric(cleaned['year'], errors='coerce')
    # Convert formatted numbers stored as text (e.g. "₹1,200"); leave real text alone
    for col in cleaned.columns:
        if cleaned[col].dtype == 'object':
            cleaned[col] = _numeric_if_fully_parseable(cleaned[col])
    # Specific cleaning for financial data
    if 'revenue_cr_inr' in cleaned.columns:
        cleaned['revenue_usd_m'] = (
            pd.to_numeric(cleaned['revenue_cr_inr'], errors='coerce') * CRORE_INR_TO_USD_MILLION
        )
    if 'gov_cr_inr' in cleaned.columns:
        cleaned['gov_usd_m'] = (
            pd.to_numeric(cleaned['gov_cr_inr'], errors='coerce') * CRORE_INR_TO_USD_MILLION
        )
    return cleaned


# Clean every raw CSV and write it to ../data/clean/ under the same file name.
# Processing is best-effort: a missing or failing file is reported and skipped
# so the remaining files are still handled.
for file in files:
    file_path = f'../data/raw/{file}'
    try:
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            print(f"Processing {file}: Initial shape {df.shape}, Columns: {list(df.columns)}")
            df = _clean_frame(df)
            # Save cleaned file under its original file name
            clean_path = f'../data/clean/{file}'
            df.to_csv(clean_path, index=False)
            print(f"Saved cleaned file: {clean_path}, Final shape: {df.shape}")
        else:
            print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error processing {file}: {str(e)}")

Processing market_size_quickcom_india.csv: Initial shape (11, 4), Columns: ['year', 'gmv_usd_billion', 'cagr_pct', 'source']
Saved cleaned file: ../data/clean/market_size_quickcom_india.csv, Final shape: (11, 4)
Processing ecom_overall_india.csv: Initial shape (5, 3), Columns: ['year', 'gmv_usd_billion', 'cagr_pct']
Saved cleaned file: ../data/clean/ecom_overall_india.csv, Final shape: (5, 3)
Processing competitors_quickcom_india.csv: Initial shape (6, 6), Columns: ['company', 'fiscal_year', 'gov_cr_inr', 'revenue_cr_inr', 'market_share_pct', 'orders_per_day_lakh']
Saved cleaned file: ../data/clean/competitors_quickcom_india.csv, Final shape: (6, 8)
Processing swiggy_dunzo_financials.csv: Initial shape (4, 7), Columns: ['company', 'fiscal_year', 'revenue_cr_inr', 'losses_cr_inr', 'total_funding_usd_m', 'acquisition_price_est_usd_m', 'cost_structure_pct_ops']
Saved cleaned file: ../data/clean/swiggy_dunzo_financials.csv, Final shape: (4, 8)
Processing E-commerce Customer Behavior - Shee

In [17]:
# Build a data dictionary (one row per raw file) and save it as CSV and
# Markdown. Any failure is reported rather than raised so the notebook keeps
# running.
try:
    # Parse each raw CSV once and record both its columns and its row count;
    # files that are missing get an empty column list and zero rows.
    raw_columns = []
    raw_row_counts = []
    for raw_name in files:
        raw_path = f'../data/raw/{raw_name}'
        if os.path.exists(raw_path):
            raw_df = pd.read_csv(raw_path)
            raw_columns.append(list(raw_df.columns))
            raw_row_counts.append(raw_df.shape[0])
        else:
            raw_columns.append([])
            raw_row_counts.append(0)

    # Descriptions and sources are matched to `files` by position, so all
    # lists must have the same length and order as `files`.
    dd = pd.DataFrame({
        'file': files,
        'description': [
            'Quick-commerce GMV and CAGR 2020-2030',
            'Overall e-commerce GMV 2021-2030',
            'Competitor GOV/revenue FY24-FY25',
            'Swiggy/Dunzo financials FY23-FY25',
            'Customer purchases/adoption by tier/city',
            'Consumer spending patterns and LTV/CAC proxies',
            'Tier-1/2 adoption rates',
            'Search interest for quick-commerce terms 2020-2025'
        ],
        'columns': raw_columns,
        'rows': raw_row_counts,
        'source': [
            'RedSeer/IBEF/Statista',
            'IBEF/Deloitte',
            'Company DRHPs/HSBC',
            'Tracxn/MCA/Entrackr',
            'Kaggle',
            'Kaggle',
            'Kaggle',
            'Google Trends'
        ]
    })
    dd.to_csv('../data/data_dictionary.csv', index=False)
    # Explicit UTF-8 so non-ASCII column names cannot fail on platforms whose
    # default text encoding is not UTF-8.
    with open('../data/data_dictionary.md', 'w', encoding='utf-8') as md_file:
        md_file.write(dd.to_markdown(index=False))
    print("Data dictionary saved as CSV and Markdown!")
except Exception as e:
    print(f"Error generating data dictionary: {str(e)}")

Data dictionary saved as CSV and Markdown!


In [18]:
# Sanity check: show what actually landed in the cleaned-data folder.
clean_files = os.listdir(path='../data/clean')
print(f"Files in /data/clean/: {clean_files}")

Files in /data/clean/: ['Augmented_IndiaTransactMultiFacet2024.csv', 'competitors_quickcom_india.csv', 'Customer_Behaviour_Survey_responses.csv', 'E-commerce Customer Behavior - Sheet1.csv', 'ecom_overall_india.csv', 'market_size_quickcom_india.csv', 'multiTimeline.csv', 'swiggy_dunzo_financials.csv']
