In [None]:
# 01_ingest_clean.ipynb
import pandas as pd
import os

# ------------------------------
# Paths
# ------------------------------
BASE_DIR = "../"
RAW_DIR = os.path.join(BASE_DIR, "data/raw")
PROCESSED_DIR = os.path.join(BASE_DIR, "data/processed")
os.makedirs(PROCESSED_DIR, exist_ok=True)

# ------------------------------
# Load datasets
# ------------------------------
try:
    df_ev = pd.read_csv(os.path.join(RAW_DIR, "ev_registrations.csv"))
    print("EV Registrations loaded")
except FileNotFoundError:
    print("ev_registrations.csv not found in data/raw")

try:
    df_pop = pd.read_csv(os.path.join(RAW_DIR, "population.csv"))
    print("Population data loaded")
except FileNotFoundError:
    print("population.csv not found in data/raw")

# ------------------------------
# Quick inspection
# ------------------------------
print("EV data sample:")
display(df_ev.head())

print("Population data sample:")
display(df_pop.head())

# ------------------------------
# Basic cleaning function
# ------------------------------
def clean_data(df):
    df = df.drop_duplicates()
    df.columns = df.columns.str.strip()
    # Fill numeric missing values
    num_cols = df.select_dtypes(include="number").columns
    df[num_cols] = df[num_cols].fillna(0)
    # Fill categorical missing values
    cat_cols = df.select_dtypes(include="object").columns
    df[cat_cols] = df[cat_cols].fillna("Unknown")
    return df

df_ev_clean = clean_data(df_ev)
df_pop_clean = clean_data(df_pop)

# ------------------------------
# Save cleaned datasets
# ------------------------------
df_ev_clean.to_csv(os.path.join(PROCESSED_DIR, "ev_registrations_clean.csv"), index=False)
df_pop_clean.to_csv(os.path.join(PROCESSED_DIR, "population_clean.csv"), index=False)

print("Processed files saved to data/processed/")


In [None]:
%

Note: you may need to restart the kernel to use updated packages.
ev_registrations.csv not found in data/raw
population.csv not found in data/raw
EV data not loaded.
Population data not loaded.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


AttributeError: 'NoneType' object has no attribute 'drop_duplicates'

In [1]:
import pandas as pd
import os

# ------------------------------
# 1. Define folder paths
# ------------------------------
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # if .py
except NameError:
    BASE_DIR = os.getcwd()  # if Jupyter

RAW_DIR = os.path.join(BASE_DIR, "data", "raw")
PROCESSED_DIR = os.path.join(BASE_DIR, "data", "processed")
os.makedirs(PROCESSED_DIR, exist_ok=True)

# ------------------------------
# 2. Generic cleaning function
# ------------------------------
def clean_data(df):
    """Basic cleaning: drop duplicates, trim col names, fill NaN"""
    df = df.drop_duplicates()
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
    
    # Fill missing numeric with 0
    num_cols = df.select_dtypes(include="number").columns
    df[num_cols] = df[num_cols].fillna(0)
    
    # Fill missing categorical with 'Unknown'
    cat_cols = df.select_dtypes(include="object").columns
    df[cat_cols] = df[cat_cols].fillna("Unknown")
    
    return df

# ------------------------------
# 3. Load datasets
# ------------------------------
datasets = {
    "final_dataset.csv": None,
    "ev-charging-stations-india.csv": None,
    "EV Maker by Place.csv": None
}

for fname in datasets.keys():
    path = os.path.join(RAW_DIR, fname)
    if os.path.exists(path):
        df = pd.read_csv(path, encoding="utf-8", low_memory=False)
        print(f"Loaded {fname} ‚Üí Shape: {df.shape}")
        datasets[fname] = df
    else:
        print(f"‚ùå File {fname} not found in {RAW_DIR}")

# ------------------------------
# 4. Clean & Save
# ------------------------------
for fname, df in datasets.items():
    if df is not None:
        df_clean = clean_data(df)
        out_name = fname.replace(".csv", "_clean.csv").lower().replace(" ", "_")
        out_path = os.path.join(PROCESSED_DIR, out_name)
        df_clean.to_csv(out_path, index=False)
        print(f"‚úÖ Cleaned {fname} ‚Üí Saved as {out_name}")



‚ùå File final_dataset.csv not found in c:\Users\Abhay\EV-Charging-Demand-Analysis\notebooks\data\raw
‚ùå File ev-charging-stations-india.csv not found in c:\Users\Abhay\EV-Charging-Demand-Analysis\notebooks\data\raw
‚ùå File EV Maker by Place.csv not found in c:\Users\Abhay\EV-Charging-Demand-Analysis\notebooks\data\raw


In [2]:
BASE_DIR = os.getcwd()
RAW_DIR = os.path.join(BASE_DIR, "data", "raw")


In [3]:
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))  # go one level up
RAW_DIR = os.path.join(BASE_DIR, "data", "raw")


In [4]:
import os
import pandas as pd

# ------------------------------
# Correct project base path
# ------------------------------
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # if .py script
except NameError:
    BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))  # if Jupyter

RAW_DIR = os.path.join(BASE_DIR, "data", "raw")
PROCESSED_DIR = os.path.join(BASE_DIR, "data", "processed")
os.makedirs(PROCESSED_DIR, exist_ok=True)

print("Base directory:", BASE_DIR)
print("Raw data directory:", RAW_DIR)



Base directory: c:\Users\Abhay\EV-Charging-Demand-Analysis
Raw data directory: c:\Users\Abhay\EV-Charging-Demand-Analysis\data\raw


In [7]:
# ------------------------------
# 3. List of datasets to load
# ------------------------------
file_list = [
    "final_dataset.csv",
    "ev-charging-stations-india.csv",
    "EV Maker by Place.csv",
    "ev-cat_01-24.csv",
    "operationalPC.csv",
    "Vehicle cladd-All.csv"
]

datasets = {}

# ------------------------------
# 4. Load datasets
# ------------------------------
for fname in file_list:
    path = os.path.join(RAW_DIR, fname)
    if os.path.exists(path):
        try:
            df = pd.read_csv(path, encoding="utf-8", low_memory=False)
        except UnicodeDecodeError:
            df = pd.read_csv(path, encoding="latin1", low_memory=False)  # fallback
        print(f"‚úÖ Loaded {fname} ‚Üí Shape: {df.shape}")
        datasets[fname] = df
    else:
        print(f"‚ùå File not found: {fname}")

# ------------------------------
# 5. Clean & Save
# ------------------------------
for fname, df in datasets.items():
    df_clean = clean_data(df)
    out_name = fname.replace(".csv", "_clean.csv").lower().replace(" ", "_")
    out_path = os.path.join(PROCESSED_DIR, out_name)
    df_clean.to_csv(out_path, index=False)
    print(f"üíæ Cleaned {fname} ‚Üí Saved as {out_name}")
    print(df_clean.head(), "\n")  # preview


‚úÖ Loaded final_dataset.csv ‚Üí Shape: (32, 13)
‚úÖ Loaded ev-charging-stations-india.csv ‚Üí Shape: (1547, 7)
‚úÖ Loaded EV Maker by Place.csv ‚Üí Shape: (62, 3)
‚ùå File not found: ev-cat_01-24.csv
‚úÖ Loaded operationalPC.csv ‚Üí Shape: (34, 2)
‚ùå File not found: Vehicle cladd-All.csv
üíæ Cleaned final_dataset.csv ‚Üí Saved as final_dataset_clean.csv
   unnamed:_0                  state_name  two_wheeler  three_wheeler  \
0           0  Andaman and Nicobar Island            1           30.0   
1           1           Arunachal Pradesh           14            0.0   
2           2                       Assam          721        47041.0   
3           3                       Bihar         5003        59079.0   
4           4                  Chandigarh          298         1410.0   

   four_wheeler  goods_vehicles  public_service_vehicle  \
0            81             0.0                    40.0   
1             5             0.0                     0.0   
2           161          

In [6]:
def clean_data(df):
    df = df.copy()  # ‚úÖ make sure we‚Äôre working on a full copy
    df = df.drop_duplicates()
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

    # Fill numeric missing values
    num_cols = df.select_dtypes(include=["float64", "int64"]).columns
    df.loc[:, num_cols] = df[num_cols].fillna(0)

    # Fill categorical missing values
    cat_cols = df.select_dtypes(include=["object"]).columns
    df.loc[:, cat_cols] = df[cat_cols].fillna("Unknown")

    return df
