# Comprehensive Data Understanding for BEV Charging Time Prediction
## Research Paper: Systematic Dataset Analysis
### Dataset: BEV Charging Sessions (75,000 records, 74 features)

In [1]:
# Essential Libraries for Data Understanding
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

In [2]:
# Data Loading
df = pd.read_csv(r"../data/Dataset_BEV_CADCS_Project.csv") ## Data loading 
pf = df.copy()  # Working copy 
print("Dataset loaded successfully") 
print(f"Dataset shape: {pf.shape}")

Dataset loaded successfully
Dataset shape: (75000, 74)


In [3]:
pf.head()

Unnamed: 0,session_id,vehicle_id,user_id,start_time,end_time,charging_duration_min,energy_delivered_kwh,vehicle_make,vehicle_model,battery_capacity_kwh,vehicle_age_years,battery_health_index,user_type,income_bracket,city,state,latitude,longitude,charging_station_id,station_operator,charger_type,charger_power_kw,plug_type,initial_soc_percent,final_soc_percent,soc_gained_percent,day_of_week,hour_of_day,month,season,is_weekend,is_peak_hour,festival,ambient_temperature_c,humidity_percent,weather_condition,air_quality_index,battery_temperature_c,grid_frequency_hz,grid_reliability_index,power_quality_score,load_shedding_event,grid_load_mw,tariff_per_kwh_inr,total_cost_inr,subsidy_amount_inr,payment_method,payment_success_rate,station_congestion_level,queue_wait_time_min,charger_utilization_rate,station_uptime_percent,charging_efficiency_percent,charging_curve_efficiency,cooling_system_active,thermal_management,app_reliability_rating,session_success_rate,user_satisfaction_score,trip_purpose,distance_to_station_km,range_remaining_km,next_destination_distance_km,state_ev_policy,charging_standard_compliance,safety_certification,session_type,booking_lead_time_min,session_rating,energy_efficiency_kwh_per_100km,week_of_year,day_of_year,is_month_end,is_quarter_end
0,IND_BEV_100000,HER5506,USR_81426,2018-01-19 03:30:07,2018-01-19 03:35:07,5.0,1.664,Hero,Vida V1,3.94,1,0.972,private,Lower_Middle,Delhi,Delhi,28.611178,77.200649,DEL_7912,Adani Total,AC_Fast,21.5,Bharat AC001,43,83,40,Friday,3,1,Winter,False,False,,22.6,42.0,Clear,371.0,23.7,49.96,0.88,0.857,Yes,0.96,7.67,12.76,10000,Subscription,0.957,Medium,5.3,,88.0,92.08,0.895,No,Passive,Good,0.945,4.2,Shopping,1.2,169.8,9.4,Delhi_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Scheduled,100.0,4.4,12.408844,3,19,False,False
1,IND_BEV_100001,TVS2519,USR_57400,2021-11-08 22:12:33,2021-11-08 22:19:41,7.13,0.838,TVS,iQube,2.25,0,0.998,private,Middle,Pune,Maharashtra,18.530614,73.853706,PUN_9785,HPCL,AC_Fast,11.7,Bharat AC001,39,73,34,Monday,22,11,Post_Monsoon,False,True,,31.4,95.0,Clear,91.0,34.2,50.08,0.94,0.88,No,2.73,9.49,7.95,10000,Card,0.933,Medium,17.0,0.32,95.9,91.1,0.894,No,Passive,Average,0.962,3.8,Personal,6.5,145.6,5.9,Maharashtra_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Scheduled,77.0,3.8,12.408844,45,312,False,False
2,IND_BEV_100002,OLA7065,USR_38785,2022-04-18 02:15:23,2022-04-18 02:20:23,5.0,1.289,Ola,S1 Pro,4.0,0,0.955,private,Middle,Gurgaon,Haryana,28.470351,77.009562,GUR_3803,Ola Electric,DC_Fast,69.0,CHAdeMO,48,80,32,Monday,2,4,Summer,False,False,,34.6,28.1,Partly_Cloudy,238.0,43.7,49.93,0.86,0.966,No,3.01,10.89,14.04,10000,Card,0.971,High,2.1,0.895,96.1,94.87,0.945,Yes,Active,Good,0.962,3.4,Tourism,1.2,174.8,16.4,Haryana_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Scheduled,43.0,4.5,12.408844,16,108,False,False
3,IND_BEV_100003,GRE4483,USR_76784,2023-08-19 13:05:25,2023-08-19 13:19:27,14.04,0.822,Greaves,Ampere Zeal,1.8,5,0.887,taxi,Middle,Kochi,Kerala,9.936656,76.259874,KOC_9830,ChargeZone,AC_Slow,5.2,Type2,16,62,46,Saturday,13,8,Monsoon,True,False,,25.3,95.0,Clear,76.0,22.5,50.09,0.96,0.996,No,3.16,9.81,8.07,0,Card,0.922,Low,0.0,0.391,92.9,89.28,0.882,No,Passive,Poor,0.967,3.5,Tourism,7.2,60.0,17.8,Kerala_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Scheduled,27.0,4.9,12.408844,33,231,False,False
4,IND_BEV_100004,BAJ1188,USR_33416,2019-04-19 12:31:40,2019-04-19 12:36:40,5.0,1.348,Bajaj,Urbanite,2.9,4,0.889,taxi,Lower_Middle,Jaipur,Rajasthan,26.902165,75.774032,JAI_2827,IOCL,AC_Fast,18.0,Bharat AC001,16,68,52,Friday,12,4,Summer,False,False,,42.8,59.1,Clear,151.0,51.7,50.21,0.85,0.868,Yes,2.56,8.19,11.04,10000,Subscription,0.939,Medium,11.6,0.757,89.7,91.48,0.919,Yes,Active,Poor,0.989,4.7,Personal,0.7,55.9,5.4,Rajasthan_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Scheduled,112.0,5.0,12.408844,16,109,False,False


In [4]:
df.columns

Index(['session_id', 'vehicle_id', 'user_id', 'start_time', 'end_time',
       'charging_duration_min', 'energy_delivered_kwh', 'vehicle_make',
       'vehicle_model', 'battery_capacity_kwh', 'vehicle_age_years',
       'battery_health_index', 'user_type', 'income_bracket', 'city', 'state',
       'latitude', 'longitude', 'charging_station_id', 'station_operator',
       'charger_type', 'charger_power_kw', 'plug_type', 'initial_soc_percent',
       'final_soc_percent', 'soc_gained_percent', 'day_of_week', 'hour_of_day',
       'month', 'season', 'is_weekend', 'is_peak_hour', 'festival',
       'ambient_temperature_c', 'humidity_percent', 'weather_condition',
       'air_quality_index', 'battery_temperature_c', 'grid_frequency_hz',
       'grid_reliability_index', 'power_quality_score', 'load_shedding_event',
       'grid_load_mw', 'tariff_per_kwh_inr', 'total_cost_inr',
       'subsidy_amount_inr', 'payment_method', 'payment_success_rate',
       'station_congestion_level', 'queue_wait

In [5]:
print(len(df.columns))

74


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 74 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   session_id                       75000 non-null  object 
 1   vehicle_id                       75000 non-null  object 
 2   user_id                          75000 non-null  object 
 3   start_time                       75000 non-null  object 
 4   end_time                         75000 non-null  object 
 5   charging_duration_min            75000 non-null  float64
 6   energy_delivered_kwh             75000 non-null  float64
 7   vehicle_make                     75000 non-null  object 
 8   vehicle_model                    75000 non-null  object 
 9   battery_capacity_kwh             75000 non-null  float64
 10  vehicle_age_years                75000 non-null  int64  
 11  battery_health_index             75000 non-null  float64
 12  user_type         

In [7]:
df.describe()

Unnamed: 0,charging_duration_min,energy_delivered_kwh,battery_capacity_kwh,vehicle_age_years,battery_health_index,latitude,longitude,charger_power_kw,initial_soc_percent,final_soc_percent,soc_gained_percent,hour_of_day,month,ambient_temperature_c,humidity_percent,air_quality_index,battery_temperature_c,grid_frequency_hz,grid_reliability_index,power_quality_score,grid_load_mw,tariff_per_kwh_inr,total_cost_inr,subsidy_amount_inr,payment_success_rate,queue_wait_time_min,charger_utilization_rate,station_uptime_percent,charging_efficiency_percent,charging_curve_efficiency,session_success_rate,user_satisfaction_score,distance_to_station_km,range_remaining_km,next_destination_distance_km,booking_lead_time_min,session_rating,energy_efficiency_kwh_per_100km,week_of_year,day_of_year
count,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,73178.0,75000.0,75000.0,75000.0,75000.0,73142.0,75000.0,75000.0,75000.0,75000.0,75000.0,73087.0,73159.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,73152.0,73119.0,75000.0,75000.0,75000.0,75000.0
mean,62.841921,7.396865,18.979154,2.038133,0.9443,20.594925,77.625106,20.701103,31.2108,69.19916,37.98836,11.517707,6.505187,28.418644,67.933445,145.457351,33.434065,50.000432,0.898523,0.888607,2.501642,9.393375,69.474214,7905.0,0.954928,5.480827,0.625993,91.490752,89.986662,0.919869,0.965032,4.000337,2.494047,109.207323,14.987376,59.632954,4.248637,14.458426,26.48292,182.646013
std,89.779555,8.243719,19.038962,2.498307,0.053539,6.190846,4.114272,32.220217,12.41883,13.708004,12.968575,6.921636,3.444323,8.379292,18.716892,77.139269,10.161478,0.149491,0.040178,0.091972,0.981047,1.651613,79.763825,7047.314183,0.02022,6.602719,0.187894,3.744871,2.888055,0.023063,0.014459,0.464784,2.511933,44.971469,15.067062,34.574346,0.434876,2.259941,15.046003,105.327886
min,5.0,0.276,1.8,0.0,0.65,9.911208,72.551431,3.3,10.0,30.0,20.0,0.0,1.0,8.0,20.0,45.0,3.1,49.42,0.83,0.484,0.8,5.5,1.96,0.0,0.92,0.0,0.3,85.0,85.0,0.88,0.94,3.2,0.0,15.1,0.0,0.0,3.5,12.408844,1.0,1.0
25%,7.5,1.07,2.9,0.0,0.921,17.365085,75.767842,5.0,22.0,59.0,27.0,6.0,4.0,23.9,53.8,93.0,26.6,49.9,0.87,0.826,1.65,8.2,9.93,0.0,0.937,0.0,0.463,88.3,87.48,0.9,0.953,3.6,0.7,75.5,4.3,30.0,3.9,12.408844,14.0,92.0
50%,21.26,5.151,15.0,1.0,0.956,19.095042,77.190551,6.7,29.0,71.0,36.0,12.0,7.0,28.9,69.3,128.0,33.8,50.0,0.9,0.899,2.5,9.35,46.04,10000.0,0.955,0.0,0.626,91.5,89.98,0.92,0.965,4.0,1.7,102.8,10.3,60.0,4.2,12.408844,26.0,183.0
75%,87.17,10.989,26.0,3.0,0.98,26.865404,80.250919,17.6,39.0,79.0,47.0,18.0,9.0,33.7,83.3,163.0,40.6,50.1,0.94,0.972,3.35,10.31,102.27,15000.0,0.972,11.0,0.79,94.7,92.48,0.94,0.978,4.4,3.4,137.8,20.8,90.0,4.6,16.950283,39.0,274.0
max,1055.25,71.115,95.0,26.0,1.02,28.633896,88.3839,150.0,59.0,100.0,70.0,23.0,12.0,45.0,95.0,379.0,60.0,50.62,0.96,1.0,4.2,15.26,828.63,20000.0,0.99,20.0,0.95,98.0,95.0,0.96,0.99,4.8,24.9,226.4,193.6,119.0,5.0,16.950283,53.0,366.0


In [8]:
categorrical_featuers = df.select_dtypes(include=['object','category','bool']).columns.tolist()
numerical_featuers = df.select_dtypes(include=['float64','int64']).columns.tolist()
print(categorrical_featuers)
print(len(categorrical_featuers))
print(numerical_featuers)
print(len(numerical_featuers))
print(df.columns)
print(len(df.columns))
print(df.shape)
print(df.size)

['session_id', 'vehicle_id', 'user_id', 'start_time', 'end_time', 'vehicle_make', 'vehicle_model', 'user_type', 'income_bracket', 'city', 'state', 'charging_station_id', 'station_operator', 'charger_type', 'plug_type', 'day_of_week', 'season', 'is_weekend', 'is_peak_hour', 'festival', 'weather_condition', 'load_shedding_event', 'payment_method', 'station_congestion_level', 'cooling_system_active', 'thermal_management', 'app_reliability_rating', 'trip_purpose', 'state_ev_policy', 'charging_standard_compliance', 'safety_certification', 'session_type', 'is_month_end', 'is_quarter_end']
34
['charging_duration_min', 'energy_delivered_kwh', 'battery_capacity_kwh', 'vehicle_age_years', 'battery_health_index', 'latitude', 'longitude', 'charger_power_kw', 'initial_soc_percent', 'final_soc_percent', 'soc_gained_percent', 'hour_of_day', 'month', 'ambient_temperature_c', 'humidity_percent', 'air_quality_index', 'battery_temperature_c', 'grid_frequency_hz', 'grid_reliability_index', 'power_quality_

In [9]:
categorrical_featuers = df.select_dtypes(include=['object','category','bool']).columns.tolist()
for item in categorrical_featuers:
    print(item)
    print(df[item].value_counts(normalize=True).head(3))
    print('\n')

session_id
session_id
IND_BEV_100000    0.000013
IND_BEV_149988    0.000013
IND_BEV_150004    0.000013
Name: proportion, dtype: float64


vehicle_id
vehicle_id
TAT2949    0.000120
TAT9364    0.000107
MAH2394    0.000107
Name: proportion, dtype: float64


user_id
user_id
USR_80771    0.000107
USR_46589    0.000093
USR_21484    0.000093
Name: proportion, dtype: float64


start_time
start_time
2023-08-23 03:46:36    0.000027
2020-01-30 17:58:30    0.000027
2024-09-24 15:12:42    0.000027
Name: proportion, dtype: float64


end_time
end_time
2020-10-14 00:21:52    0.000027
2019-04-15 05:39:47    0.000027
2023-04-28 07:17:42    0.000027
Name: proportion, dtype: float64


vehicle_make
vehicle_make
Tata        0.253000
Mahindra    0.198307
Bajaj       0.150160
Name: proportion, dtype: float64


vehicle_model
vehicle_model
Tigor EV    0.08472
Nexon EV    0.08456
Tiago EV    0.08372
Name: proportion, dtype: float64


user_type
user_type
private    0.450520
fleet      0.201080
taxi       0.148773

In [10]:
df[df.duplicated]

Unnamed: 0,session_id,vehicle_id,user_id,start_time,end_time,charging_duration_min,energy_delivered_kwh,vehicle_make,vehicle_model,battery_capacity_kwh,vehicle_age_years,battery_health_index,user_type,income_bracket,city,state,latitude,longitude,charging_station_id,station_operator,charger_type,charger_power_kw,plug_type,initial_soc_percent,final_soc_percent,soc_gained_percent,day_of_week,hour_of_day,month,season,is_weekend,is_peak_hour,festival,ambient_temperature_c,humidity_percent,weather_condition,air_quality_index,battery_temperature_c,grid_frequency_hz,grid_reliability_index,power_quality_score,load_shedding_event,grid_load_mw,tariff_per_kwh_inr,total_cost_inr,subsidy_amount_inr,payment_method,payment_success_rate,station_congestion_level,queue_wait_time_min,charger_utilization_rate,station_uptime_percent,charging_efficiency_percent,charging_curve_efficiency,cooling_system_active,thermal_management,app_reliability_rating,session_success_rate,user_satisfaction_score,trip_purpose,distance_to_station_km,range_remaining_km,next_destination_distance_km,state_ev_policy,charging_standard_compliance,safety_certification,session_type,booking_lead_time_min,session_rating,energy_efficiency_kwh_per_100km,week_of_year,day_of_year,is_month_end,is_quarter_end


In [11]:
df.isnull().sum().sum()

73410

In [12]:
df[df.isnull().any(axis=1)]

Unnamed: 0,session_id,vehicle_id,user_id,start_time,end_time,charging_duration_min,energy_delivered_kwh,vehicle_make,vehicle_model,battery_capacity_kwh,vehicle_age_years,battery_health_index,user_type,income_bracket,city,state,latitude,longitude,charging_station_id,station_operator,charger_type,charger_power_kw,plug_type,initial_soc_percent,final_soc_percent,soc_gained_percent,day_of_week,hour_of_day,month,season,is_weekend,is_peak_hour,festival,ambient_temperature_c,humidity_percent,weather_condition,air_quality_index,battery_temperature_c,grid_frequency_hz,grid_reliability_index,power_quality_score,load_shedding_event,grid_load_mw,tariff_per_kwh_inr,total_cost_inr,subsidy_amount_inr,payment_method,payment_success_rate,station_congestion_level,queue_wait_time_min,charger_utilization_rate,station_uptime_percent,charging_efficiency_percent,charging_curve_efficiency,cooling_system_active,thermal_management,app_reliability_rating,session_success_rate,user_satisfaction_score,trip_purpose,distance_to_station_km,range_remaining_km,next_destination_distance_km,state_ev_policy,charging_standard_compliance,safety_certification,session_type,booking_lead_time_min,session_rating,energy_efficiency_kwh_per_100km,week_of_year,day_of_year,is_month_end,is_quarter_end
0,IND_BEV_100000,HER5506,USR_81426,2018-01-19 03:30:07,2018-01-19 03:35:07,5.00,1.664,Hero,Vida V1,3.94,1,0.972,private,Lower_Middle,Delhi,Delhi,28.611178,77.200649,DEL_7912,Adani Total,AC_Fast,21.5,Bharat AC001,43,83,40,Friday,3,1,Winter,False,False,,22.6,42.0,Clear,371.0,23.7,49.96,0.88,0.857,Yes,0.96,7.67,12.76,10000,Subscription,0.957,Medium,5.3,,88.0,92.08,0.895,No,Passive,Good,0.945,4.2,Shopping,1.2,169.8,9.4,Delhi_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Scheduled,100.0,4.4,12.408844,3,19,False,False
1,IND_BEV_100001,TVS2519,USR_57400,2021-11-08 22:12:33,2021-11-08 22:19:41,7.13,0.838,TVS,iQube,2.25,0,0.998,private,Middle,Pune,Maharashtra,18.530614,73.853706,PUN_9785,HPCL,AC_Fast,11.7,Bharat AC001,39,73,34,Monday,22,11,Post_Monsoon,False,True,,31.4,95.0,Clear,91.0,34.2,50.08,0.94,0.880,No,2.73,9.49,7.95,10000,Card,0.933,Medium,17.0,0.320,95.9,91.10,0.894,No,Passive,Average,0.962,3.8,Personal,6.5,145.6,5.9,Maharashtra_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Scheduled,77.0,3.8,12.408844,45,312,False,False
2,IND_BEV_100002,OLA7065,USR_38785,2022-04-18 02:15:23,2022-04-18 02:20:23,5.00,1.289,Ola,S1 Pro,4.00,0,0.955,private,Middle,Gurgaon,Haryana,28.470351,77.009562,GUR_3803,Ola Electric,DC_Fast,69.0,CHAdeMO,48,80,32,Monday,2,4,Summer,False,False,,34.6,28.1,Partly_Cloudy,238.0,43.7,49.93,0.86,0.966,No,3.01,10.89,14.04,10000,Card,0.971,High,2.1,0.895,96.1,94.87,0.945,Yes,Active,Good,0.962,3.4,Tourism,1.2,174.8,16.4,Haryana_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Scheduled,43.0,4.5,12.408844,16,108,False,False
3,IND_BEV_100003,GRE4483,USR_76784,2023-08-19 13:05:25,2023-08-19 13:19:27,14.04,0.822,Greaves,Ampere Zeal,1.80,5,0.887,taxi,Middle,Kochi,Kerala,9.936656,76.259874,KOC_9830,ChargeZone,AC_Slow,5.2,Type2,16,62,46,Saturday,13,8,Monsoon,True,False,,25.3,95.0,Clear,76.0,22.5,50.09,0.96,0.996,No,3.16,9.81,8.07,0,Card,0.922,Low,0.0,0.391,92.9,89.28,0.882,No,Passive,Poor,0.967,3.5,Tourism,7.2,60.0,17.8,Kerala_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Scheduled,27.0,4.9,12.408844,33,231,False,False
4,IND_BEV_100004,BAJ1188,USR_33416,2019-04-19 12:31:40,2019-04-19 12:36:40,5.00,1.348,Bajaj,Urbanite,2.90,4,0.889,taxi,Lower_Middle,Jaipur,Rajasthan,26.902165,75.774032,JAI_2827,IOCL,AC_Fast,18.0,Bharat AC001,16,68,52,Friday,12,4,Summer,False,False,,42.8,59.1,Clear,151.0,51.7,50.21,0.85,0.868,Yes,2.56,8.19,11.04,10000,Subscription,0.939,Medium,11.6,0.757,89.7,91.48,0.919,Yes,Active,Poor,0.989,4.7,Personal,0.7,55.9,5.4,Rajasthan_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Scheduled,112.0,5.0,12.408844,16,109,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74995,IND_BEV_174995,HER2484,USR_96171,2024-05-26 23:53:51,2024-05-27 00:05:17,11.44,1.175,Hero,Vida V1,3.94,1,0.992,private,Lower_Middle,Kolkata,West Bengal,22.567413,88.359683,KOL_1539,EESL,AC_Slow,6.5,Bharat AC001,57,85,28,Sunday,23,5,Summer,True,False,,38.0,50.0,Clear,144.0,35.9,50.30,0.87,0.879,No,4.18,9.32,10.95,10000,Mobile_App,0.972,Low,0.0,0.310,,85.66,0.947,Yes,Passive,Excellent,0.946,3.6,Commute,0.4,210.1,18.9,West Bengal_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Opportunistic,111.0,4.5,12.408844,21,147,False,False
74996,IND_BEV_174996,MAH6672,USR_20570,2023-11-21 14:52:38,2023-11-21 15:40:17,47.65,16.234,Mahindra,eXUV300,34.50,0,1.002,private,High,Delhi,Delhi,28.604534,77.218759,DEL_1229,BPCL,AC_Fast,20.0,Type2,25,66,41,Tuesday,14,11,Post_Monsoon,False,False,,29.5,38.9,Light_Rain,373.0,32.4,49.93,0.88,0.953,Yes,3.63,9.76,158.45,0,Cash,0.946,Medium,4.8,0.487,88.6,87.31,0.914,No,Passive,Excellent,0.952,3.6,Shopping,1.2,75.0,16.0,Delhi_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Opportunistic,96.0,3.9,16.950283,47,325,False,False
74997,IND_BEV_174997,HER5281,USR_74947,2024-01-16 08:18:30,2024-01-16 08:26:11,7.70,1.084,Hero,Photon,1.80,0,1.017,fleet,Middle,Ahmedabad,Gujarat,23.040473,72.556788,AHM_1261,BPCL,AC_Slow,5.7,CHAdeMO,38,94,56,Tuesday,8,1,Winter,False,True,,15.7,57.2,Light_Rain,,22.4,50.29,0.89,0.842,Yes,1.52,7.10,7.70,10000,Cash,0.954,Low,0.0,0.400,92.6,94.55,0.957,No,Passive,Good,0.941,4.8,Personal,1.9,147.9,18.4,Gujarat_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Emergency,25.0,5.0,12.408844,3,16,False,False
74998,IND_BEV_174998,BAJ5079,USR_69084,2023-06-15 07:31:31,2023-06-15 07:49:35,18.07,1.539,Bajaj,Urbanite,2.90,3,0.923,private,Lower_Middle,Mumbai,Maharashtra,19.071692,72.895664,MUM_5220,ChargeZone,AC_Slow,5.4,CHAdeMO,26,82,56,Thursday,7,6,Summer,False,True,,38.6,74.5,Clear,131.0,37.3,49.93,0.92,0.934,Yes,1.59,9.79,15.07,0,Mobile_App,0.922,Low,0.0,0.407,89.6,89.56,0.954,Yes,Passive,Good,0.969,3.4,Commute,2.0,81.6,53.3,Maharashtra_EV_Policy_2023,IS_17017_Compliant,BIS_Certified,Opportunistic,30.0,4.4,12.408844,24,166,False,False


## 1. Dataset Overview & Basic Information

In [13]:
# Dataset Basic Information
print("🔍 DATASET OVERVIEW")
print(f"Total Records: {pf.shape[0]:,}")
print(f"Total Features: {pf.shape[1]}")
print(f"Memory Usage: {pf.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"Total Data Points: {pf.size:,}")
print(f"Data Collection Period: {pf['start_time'].min()} to {pf['start_time'].max()}")
print("\n")

🔍 DATASET OVERVIEW
Total Records: 75,000
Total Features: 74
Memory Usage: 162.97 MB
Total Data Points: 5,550,000
Data Collection Period: 2018-01-01 07:30:45 to 2024-12-31 21:55:35




In [14]:
# Detailed Dataset Information
print("DETAILED DATASET INFORMATION")
pf.info()
print("\n")

DETAILED DATASET INFORMATION
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 74 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   session_id                       75000 non-null  object 
 1   vehicle_id                       75000 non-null  object 
 2   user_id                          75000 non-null  object 
 3   start_time                       75000 non-null  object 
 4   end_time                         75000 non-null  object 
 5   charging_duration_min            75000 non-null  float64
 6   energy_delivered_kwh             75000 non-null  float64
 7   vehicle_make                     75000 non-null  object 
 8   vehicle_model                    75000 non-null  object 
 9   battery_capacity_kwh             75000 non-null  float64
 10  vehicle_age_years                75000 non-null  int64  
 11  battery_health_index             75000 non-null  fl

## 2. Data Type Classification & Feature Analysis

In [15]:
# Feature Classification by Data Types
categorical_features = pf.select_dtypes(include=['object','category','bool']).columns.tolist()
numerical_features = pf.select_dtypes(include=['float64','int64','int32']).columns.tolist()
datetime_candidates = ['start_time', 'end_time']  # Potential datetime columns

print("FEATURE CLASSIFICATION")
print(f"Categorical Features: {len(categorical_features)}")
print(f"Numerical Features: {len(numerical_features)}")
print(f"DateTime Features: {len(datetime_candidates)}")

print("\n CATEGORICAL FEATURES:")
for i, feat in enumerate(categorical_features, 1):
    print(f"{i:2d}. {feat}")

print("\n NUMERICAL FEATURES:")
for i, feat in enumerate(numerical_features, 1):
    print(f"{i:2d}. {feat}")
print("\n")

FEATURE CLASSIFICATION
Categorical Features: 34
Numerical Features: 40
DateTime Features: 2

 CATEGORICAL FEATURES:
 1. session_id
 2. vehicle_id
 3. user_id
 4. start_time
 5. end_time
 6. vehicle_make
 7. vehicle_model
 8. user_type
 9. income_bracket
10. city
11. state
12. charging_station_id
13. station_operator
14. charger_type
15. plug_type
16. day_of_week
17. season
18. is_weekend
19. is_peak_hour
20. festival
21. weather_condition
22. load_shedding_event
23. payment_method
24. station_congestion_level
25. cooling_system_active
26. thermal_management
27. app_reliability_rating
28. trip_purpose
29. state_ev_policy
30. charging_standard_compliance
31. safety_certification
32. session_type
33. is_month_end
34. is_quarter_end

 NUMERICAL FEATURES:
 1. charging_duration_min
 2. energy_delivered_kwh
 3. battery_capacity_kwh
 4. vehicle_age_years
 5. battery_health_index
 6. latitude
 7. longitude
 8. charger_power_kw
 9. initial_soc_percent
10. final_soc_percent
11. soc_gained_percent

## 3. Target Variable Analysis (charging_duration_min)

In [None]:
# Target Variable Analysis
target_var = 'charging_duration_min'
print(" TARGET VARIABLE ANALYSIS")
print(f"Target Variable: {target_var}")
print(f"Data Type: {pf[target_var].dtype}")
print(f"Non-null Count: {pf[target_var].count():,} / {len(pf):,}")
print(f"Missing Values: {pf[target_var].isnull().sum()}")

print("\n STATISTICAL SUMMARY:")
print(pf[target_var].describe())

print("\n DISTRIBUTION CHARACTERISTICS:")
print(f"Range: {pf[target_var].min():.2f} - {pf[target_var].max():.2f} minutes")
print(f"IQR: {pf[target_var].quantile(0.75) - pf[target_var].quantile(0.25):.2f} minutes")
print(f"Skewness: {pf[target_var].skew():.3f}")
print(f"Kurtosis: {pf[target_var].kurtosis():.3f}")

# Practical insights
print("\n PRACTICAL INSIGHTS:")
print(f"Quick Charge (<15 min): {(pf[target_var] < 15).sum():,} sessions ({(pf[target_var] < 15).mean()*100:.1f}%)")
print(f"Standard Charge (15-60 min): {((pf[target_var] >= 15) & (pf[target_var] <= 60)).sum():,} sessions ({((pf[target_var] >= 15) & (pf[target_var] <= 60)).mean()*100:.1f}%)")
print(f"Long Charge (>60 min): {(pf[target_var] > 60).sum():,} sessions ({(pf[target_var] > 60).mean()*100:.1f}%)")
print("\n") 

 TARGET VARIABLE ANALYSIS
Target Variable: charging_duration_min
Data Type: float64
Non-null Count: 75,000 / 75,000
Missing Values: 0

 STATISTICAL SUMMARY:
count    75000.000000
mean        62.841921
std         89.779555
min          5.000000
25%          7.500000
50%         21.260000
75%         87.170000
max       1055.250000
Name: charging_duration_min, dtype: float64

 DISTRIBUTION CHARACTERISTICS:
Range: 5.00 - 1055.25 minutes
IQR: 79.67 minutes
Skewness: 2.844
Kurtosis: 11.662

 PRACTICAL INSIGHTS:
Quick Charge (<15 min): 31,575 sessions (42.1%)
Standard Charge (15-60 min): 18,658 sessions (24.9%)
Long Charge (>60 min): 24,767 sessions (33.0%)




## 4. Missing Value Analysis

In [17]:
# Comprehensive Missing Value Analysis
print(" MISSING VALUE ANALYSIS")

missing_data = pf.isnull().sum()
missing_percent = (missing_data / len(pf)) * 100
missing_df = pd.DataFrame({
    'Feature': missing_data.index,
    'Missing_Count': missing_data.values,
    'Missing_Percent': missing_percent.values
})

# Only show features with missing values
missing_features = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Percent', ascending=False)

if len(missing_features) > 0:
    print(f" Features with Missing Values: {len(missing_features)}")
    print("\n  MISSING DATA SUMMARY:")
    for _, row in missing_features.iterrows():
        print(f"{row['Feature']:<30} | {row['Missing_Count']:>6,} | {row['Missing_Percent']:>6.2f}%")
    
    print("\n CRITICAL MISSING VALUES (>10%):")
    critical_missing = missing_features[missing_features['Missing_Percent'] > 10]
    if len(critical_missing) > 0:
        for _, row in critical_missing.iterrows():
            print(f" {row['Feature']}: {row['Missing_Percent']:.1f}% missing")
    else:
        print(" No features with >10% missing values")
else:
    print("No missing values found in any features")

print(f"\n Overall Data Completeness: {((1 - pf.isnull().sum().sum() / pf.size) * 100):.2f}%")
print("\n")

 MISSING VALUE ANALYSIS
 Features with Missing Values: 7

  MISSING DATA SUMMARY:
festival                       | 62,247 |  83.00%
charger_utilization_rate       |  1,913 |   2.55%
booking_lead_time_min          |  1,881 |   2.51%
grid_load_mw                   |  1,858 |   2.48%
next_destination_distance_km   |  1,848 |   2.46%
station_uptime_percent         |  1,841 |   2.45%
air_quality_index              |  1,822 |   2.43%

 CRITICAL MISSING VALUES (>10%):
 festival: 83.0% missing

 Overall Data Completeness: 98.68%




## 5. Data Quality Assessment

In [18]:
# Data Quality Assessment
print("DATA QUALITY ASSESSMENT")


# Duplicate Analysis
duplicates = pf.duplicated().sum()
print(f"Total Duplicate Rows: {duplicates:,}")
if duplicates > 0:
    print(f"Duplicate Percentage: {(duplicates/len(pf))*100:.2f}%")
else:
    print("No duplicate rows found")

# Unique Value Analysis
print("\n UNIQUE VALUE ANALYSIS:")
unique_counts = pf.nunique().sort_values(ascending=False)
print(f"Sessions (session_id): {unique_counts['session_id']:,}")
print(f"Vehicles (vehicle_id): {unique_counts['vehicle_id']:,}")
print(f"Users (user_id): {unique_counts['user_id']:,}")
print(f"Charging Stations: {unique_counts['charging_station_id']:,}")

# Data Consistency Checks
print("\n DATA CONSISTENCY CHECKS:")
# Check if final_soc >= initial_soc (for successful charges)
soc_consistent = (pf['final_soc_percent'] >= pf['initial_soc_percent']).sum()
print(f"SOC Logic Consistent: {soc_consistent:,} / {len(pf):,} ({(soc_consistent/len(pf))*100:.1f}%)")

# Check reasonable charging duration
reasonable_duration = ((pf['charging_duration_min'] > 0) & (pf['charging_duration_min'] < 720)).sum()  # 0-12 hours
print(f"Reasonable Duration: {reasonable_duration:,} / {len(pf):,} ({(reasonable_duration/len(pf))*100:.1f}%)")

# Check SOC ranges
valid_soc = ((pf['initial_soc_percent'] >= 0) & (pf['initial_soc_percent'] <= 100) & 
             (pf['final_soc_percent'] >= 0) & (pf['final_soc_percent'] <= 100)).sum()
print(f"Valid SOC Ranges: {valid_soc:,} / {len(pf):,} ({(valid_soc/len(pf))*100:.1f}%)")
print("\n")

DATA QUALITY ASSESSMENT
Total Duplicate Rows: 0
No duplicate rows found

 UNIQUE VALUE ANALYSIS:
Sessions (session_id): 75,000
Vehicles (vehicle_id): 44,693
Users (user_id): 50,928
Charging Stations: 53,974

 DATA CONSISTENCY CHECKS:
SOC Logic Consistent: 75,000 / 75,000 (100.0%)
Reasonable Duration: 74,943 / 75,000 (99.9%)
Valid SOC Ranges: 75,000 / 75,000 (100.0%)




## 6. Categorical Variables Deep Dive

In [19]:
# Categorical Variables Analysis
print(" CATEGORICAL VARIABLES ANALYSIS")

key_categorical = ['vehicle_make', 'charger_type', 'user_type', 'weather_condition', 
                  'season', 'station_congestion_level', 'session_type']

for feature in key_categorical:
    if feature in pf.columns:
        print(f"\n {feature.upper()}:")
        value_counts = pf[feature].value_counts()
        print(f"Unique Values: {pf[feature].nunique()}")
        print("Top 5 Categories:")
        for i, (cat, count) in enumerate(value_counts.head().items(), 1):
            percentage = (count / len(pf)) * 100
            print(f"  {i}. {cat}: {count:,} ({percentage:.1f}%)")

print("\n")

 CATEGORICAL VARIABLES ANALYSIS

 VEHICLE_MAKE:
Unique Values: 14
Top 5 Categories:
  1. Tata: 18,975 (25.3%)
  2. Mahindra: 14,873 (19.8%)
  3. Bajaj: 11,262 (15.0%)
  4. Hero: 9,002 (12.0%)
  5. TVS: 6,069 (8.1%)

 CHARGER_TYPE:
Unique Values: 3
Top 5 Categories:
  1. AC_Slow: 45,215 (60.3%)
  2. AC_Fast: 18,558 (24.7%)
  3. DC_Fast: 11,227 (15.0%)

 USER_TYPE:
Unique Values: 6
Top 5 Categories:
  1. private: 33,789 (45.1%)
  2. fleet: 15,081 (20.1%)
  3. taxi: 11,158 (14.9%)
  4. ride_share: 7,469 (10.0%)
  5. delivery: 6,014 (8.0%)

 WEATHER_CONDITION:
Unique Values: 9
Top 5 Categories:
  1. Clear: 29,798 (39.7%)
  2. Partly_Cloudy: 15,011 (20.0%)
  3. Cloudy: 11,287 (15.0%)
  4. Light_Rain: 7,552 (10.1%)
  5. Heavy_Rain: 3,835 (5.1%)

 SEASON:
Unique Values: 4
Top 5 Categories:
  1. Summer: 25,119 (33.5%)
  2. Monsoon: 18,964 (25.3%)
  3. Winter: 18,449 (24.6%)
  4. Post_Monsoon: 12,468 (16.6%)

 STATION_CONGESTION_LEVEL:
Unique Values: 3
Top 5 Categories:
  1. Low: 37,585 (50.1%)

## 7. Numerical Variables Statistical Summary

In [20]:
# Numerical Variables Analysis
print(" NUMERICAL VARIABLES ANALYSIS")


# Key numerical features for BEV analysis
key_numerical = ['charging_duration_min', 'energy_delivered_kwh', 'battery_capacity_kwh',
                'charger_power_kw', 'initial_soc_percent', 'final_soc_percent',
                'ambient_temperature_c', 'battery_health_index', 'vehicle_age_years']

numerical_summary = pf[key_numerical].describe().round(2)
print(" STATISTICAL SUMMARY (Key Features):")
print(numerical_summary)

# Outlier Analysis using IQR method
print("\n  OUTLIER ANALYSIS (IQR Method):")
for feature in key_numerical[:6]:  # Focus on most important features
    if feature in pf.columns:
        Q1 = pf[feature].quantile(0.25)
        Q3 = pf[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = ((pf[feature] < lower_bound) | (pf[feature] > upper_bound)).sum()
        outlier_percent = (outliers / len(pf)) * 100
        print(f"{feature:<25} | Outliers: {outliers:>6,} ({outlier_percent:>5.1f}%)")

print("\n")

 NUMERICAL VARIABLES ANALYSIS


 STATISTICAL SUMMARY (Key Features):
       charging_duration_min  energy_delivered_kwh  battery_capacity_kwh  \
count               75000.00              75000.00              75000.00   
mean                   62.84                  7.40                 18.98   
std                    89.78                  8.24                 19.04   
min                     5.00                  0.28                  1.80   
25%                     7.50                  1.07                  2.90   
50%                    21.26                  5.15                 15.00   
75%                    87.17                 10.99                 26.00   
max                  1055.25                 71.11                 95.00   

       charger_power_kw  initial_soc_percent  final_soc_percent  \
count          75000.00             75000.00           75000.00   
mean              20.70                31.21              69.20   
std               32.22                12.42              13.71   
min       

## 8. Temporal Pattern Analysis

In [21]:
# Temporal Pattern Analysis
print(" TEMPORAL PATTERN ANALYSIS")

# Convert datetime columns
pf['start_time'] = pd.to_datetime(pf['start_time'])
pf['end_time'] = pd.to_datetime(pf['end_time'])

# Extract temporal features for analysis
pf['year'] = pf['start_time'].dt.year
pf['hour'] = pf['start_time'].dt.hour
pf['day_name'] = pf['start_time'].dt.day_name()

print(" TEMPORAL DISTRIBUTION:")
print(f"Date Range: {pf['start_time'].min().date()} to {pf['start_time'].max().date()}")
print(f"Time Span: {(pf['start_time'].max() - pf['start_time'].min()).days} days")

print("\n CHARGING BY YEAR:")
year_dist = pf['year'].value_counts().sort_index()
for year, count in year_dist.items():
    print(f"{year}: {count:,} sessions ({(count/len(pf))*100:.1f}%)")

print("\n PEAK CHARGING HOURS:")
hour_dist = pf['hour'].value_counts().sort_index()
peak_hours = hour_dist.nlargest(5)
for hour, count in peak_hours.items():
    print(f"{hour:02d}:00 - {hour+1:02d}:00 | {count:,} sessions ({(count/len(pf))*100:.1f}%)")

print("\n CHARGING BY DAY OF WEEK:")
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_dist = pf['day_name'].value_counts().reindex(day_order)
for day, count in day_dist.items():
    print(f"{day:<9} | {count:,} sessions ({(count/len(pf))*100:.1f}%)")

print("\n  SEASONAL DISTRIBUTION:")
season_dist = pf['season'].value_counts()
for season, count in season_dist.items():
    print(f"{season:<8} | {count:,} sessions ({(count/len(pf))*100:.1f}%)")

print("\n")

 TEMPORAL PATTERN ANALYSIS
 TEMPORAL DISTRIBUTION:
Date Range: 2018-01-01 to 2024-12-31
Time Span: 2556 days

 CHARGING BY YEAR:
2018: 10,753 sessions (14.3%)
2019: 10,809 sessions (14.4%)
2020: 10,817 sessions (14.4%)
2021: 10,839 sessions (14.5%)
2022: 10,782 sessions (14.4%)
2023: 10,411 sessions (13.9%)
2024: 10,589 sessions (14.1%)

 PEAK CHARGING HOURS:
14:00 - 15:00 | 3,264 sessions (4.4%)
06:00 - 07:00 | 3,204 sessions (4.3%)
18:00 - 19:00 | 3,173 sessions (4.2%)
11:00 - 12:00 | 3,172 sessions (4.2%)
22:00 - 23:00 | 3,167 sessions (4.2%)

 CHARGING BY DAY OF WEEK:
Monday    | 10,654 sessions (14.2%)
Tuesday   | 10,847 sessions (14.5%)
Wednesday | 10,775 sessions (14.4%)
Thursday  | 10,736 sessions (14.3%)
Friday    | 10,811 sessions (14.4%)
Saturday  | 10,555 sessions (14.1%)
Sunday    | 10,622 sessions (14.2%)

  SEASONAL DISTRIBUTION:
Summer   | 25,119 sessions (33.5%)
Monsoon  | 18,964 sessions (25.3%)
Winter   | 18,449 sessions (24.6%)
Post_Monsoon | 12,468 sessions (16.6%)

## 9. Geographic Distribution Analysis

In [22]:
# Geographic Distribution Analysis
print("GEOGRAPHIC DISTRIBUTION ANALYSIS")


print("TOP 10 CITIES BY CHARGING SESSIONS:")
city_dist = pf['city'].value_counts().head(10)
for i, (city, count) in enumerate(city_dist.items(), 1):
    print(f"{i:2d}. {city:<15} | {count:,} sessions ({(count/len(pf))*100:.1f}%)")

print("\n TOP 10 STATES BY CHARGING SESSIONS:")
state_dist = pf['state'].value_counts().head(10)
for i, (state, count) in enumerate(state_dist.items(), 1):
    print(f"{i:2d}. {state:<20} | {count:,} sessions ({(count/len(pf))*100:.1f}%)")

print("\n GEOGRAPHIC COVERAGE:")
print(f"Total Cities: {pf['city'].nunique():,}")
print(f"Total States: {pf['state'].nunique():,}")
print(f"Latitude Range: {pf['latitude'].min():.2f}° to {pf['latitude'].max():.2f}°")
print(f"Longitude Range: {pf['longitude'].min():.2f}° to {pf['longitude'].max():.2f}°")

print("\n")

GEOGRAPHIC DISTRIBUTION ANALYSIS
TOP 10 CITIES BY CHARGING SESSIONS:
 1. Hyderabad       | 6,414 sessions (8.6%)
 2. Kolkata         | 6,326 sessions (8.4%)
 3. Delhi           | 6,313 sessions (8.4%)
 4. Kochi           | 6,309 sessions (8.4%)
 5. Mumbai          | 6,274 sessions (8.4%)
 6. Lucknow         | 6,271 sessions (8.4%)
 7. Bengaluru       | 6,240 sessions (8.3%)
 8. Pune            | 6,228 sessions (8.3%)
 9. Chennai         | 6,191 sessions (8.3%)
10. Gurgaon         | 6,188 sessions (8.3%)

 TOP 10 STATES BY CHARGING SESSIONS:
 1. Maharashtra          | 12,502 sessions (16.7%)
 2. Telangana            | 6,414 sessions (8.6%)
 3. West Bengal          | 6,326 sessions (8.4%)
 4. Delhi                | 6,313 sessions (8.4%)
 5. Kerala               | 6,309 sessions (8.4%)
 6. Uttar Pradesh        | 6,271 sessions (8.4%)
 7. Karnataka            | 6,240 sessions (8.3%)
 8. Tamil Nadu           | 6,191 sessions (8.3%)
 9. Haryana              | 6,188 sessions (8.3%)
10. Gujara

## 10. BEV-Specific Feature Analysis

In [23]:
# BEV-Specific Analysis
print("BEV-SPECIFIC FEATURE ANALYSIS")


print("VEHICLE ANALYSIS:")
print(f"Total Vehicle Makes: {pf['vehicle_make'].nunique()}")
print(f"Total Vehicle Models: {pf['vehicle_model'].nunique()}")
print(f"Total Unique Vehicles: {pf['vehicle_id'].nunique():,}")

print("\n BATTERY ANALYSIS:")
print(f"Battery Capacity Range: {pf['battery_capacity_kwh'].min():.1f} - {pf['battery_capacity_kwh'].max():.1f} kWh")
print(f"Average Battery Health: {pf['battery_health_index'].mean():.3f}")
print(f"Vehicle Age Range: {pf['vehicle_age_years'].min()} - {pf['vehicle_age_years'].max()} years")

print("\n⚡ CHARGING ANALYSIS:")
print(f"Charger Power Range: {pf['charger_power_kw'].min():.1f} - {pf['charger_power_kw'].max():.1f} kW")
print(f"Average Energy Delivered: {pf['energy_delivered_kwh'].mean():.2f} kWh")
print(f"Average SOC Gain: {pf['soc_gained_percent'].mean():.1f}%")

print("\n CHARGER TYPE DISTRIBUTION:")
charger_types = pf['charger_type'].value_counts()
for charger, count in charger_types.items():
    print(f"{charger:<15} | {count:,} sessions ({(count/len(pf))*100:.1f}%)")

print("\n STATION CONGESTION ANALYSIS:")
congestion_levels = pf['station_congestion_level'].value_counts()
for level, count in congestion_levels.items():
    print(f"{level:<10} | {count:,} sessions ({(count/len(pf))*100:.1f}%)")

print("\n")

BEV-SPECIFIC FEATURE ANALYSIS
VEHICLE ANALYSIS:
Total Vehicle Makes: 14
Total Vehicle Models: 28
Total Unique Vehicles: 44,693

 BATTERY ANALYSIS:
Battery Capacity Range: 1.8 - 95.0 kWh
Average Battery Health: 0.944
Vehicle Age Range: 0 - 26 years

⚡ CHARGING ANALYSIS:
Charger Power Range: 3.3 - 150.0 kW
Average Energy Delivered: 7.40 kWh
Average SOC Gain: 38.0%

 CHARGER TYPE DISTRIBUTION:
AC_Slow         | 45,215 sessions (60.3%)
AC_Fast         | 18,558 sessions (24.7%)
DC_Fast         | 11,227 sessions (15.0%)

 STATION CONGESTION ANALYSIS:
Low        | 37,585 sessions (50.1%)
Medium     | 26,151 sessions (34.9%)
High       | 11,264 sessions (15.0%)




## 11. Feature Correlation Analysis

In [24]:
# Feature Correlation with Target Variable
print(" CORRELATION ANALYSIS WITH TARGET VARIABLE")

# Calculate correlations with charging_duration_min
target_correlations = pf[numerical_features].corr()['charging_duration_min'].abs().sort_values(ascending=False)

print(f" TOP 15 FEATURES CORRELATED WITH {target_var}:")
print("Feature" + " " * 25 + "| Correlation")
print("-" * 45)

for i, (feature, corr) in enumerate(target_correlations.head(15).items(), 1):
    if feature != target_var:  # Exclude self-correlation
        print(f"{i:2d}. {feature:<30} | {corr:>7.3f}")

print("\n CORRELATION STRENGTH INTERPRETATION:")
strong_corr = target_correlations[(target_correlations >= 0.5) & (target_correlations.index != target_var)]
moderate_corr = target_correlations[(target_correlations >= 0.3) & (target_correlations < 0.5) & (target_correlations.index != target_var)]
weak_corr = target_correlations[(target_correlations >= 0.1) & (target_correlations < 0.3) & (target_correlations.index != target_var)]

print(f" Strong Correlation (≥0.5): {len(strong_corr)} features")
print(f" Moderate Correlation (0.3-0.5): {len(moderate_corr)} features")
print(f" Weak Correlation (0.1-0.3): {len(weak_corr)} features")

if len(strong_corr) > 0:
    print("\n💪 STRONGLY CORRELATED FEATURES:")
    for feature, corr in strong_corr.items():
        print(f"   • {feature}: {corr:.3f}")

print("\n")

 CORRELATION ANALYSIS WITH TARGET VARIABLE
 TOP 15 FEATURES CORRELATED WITH charging_duration_min:
Feature                         | Correlation
---------------------------------------------
 2. energy_delivered_kwh           |   0.765
 3. total_cost_inr                 |   0.694
 4. battery_capacity_kwh           |   0.686
 5. charger_power_kw               |   0.277
 6. energy_efficiency_kwh_per_100km |   0.272
 7. soc_gained_percent             |   0.234
 8. subsidy_amount_inr             |   0.200
 9. tariff_per_kwh_inr             |   0.140
10. final_soc_percent              |   0.137
11. initial_soc_percent            |   0.093
12. range_remaining_km             |   0.091
13. battery_health_index           |   0.036
14. vehicle_age_years              |   0.032
15. charging_efficiency_percent    |   0.018

 CORRELATION STRENGTH INTERPRETATION:
 Strong Correlation (≥0.5): 3 features
 Moderate Correlation (0.3-0.5): 0 features
 Weak Correlation (0.1-0.3): 6 features

💪 STRONGLY CORR

## 12. Data Understanding Summary & Research Insights

In [25]:
# Data Understanding Summary
print("DATA UNDERSTANDING SUMMARY & RESEARCH INSIGHTS")


print(" DATASET CHARACTERISTICS:")
print(f"   • Dataset Size: {len(pf):,} charging sessions across {pf.shape[1]} features")
print(f"   • Data Quality: {((1 - pf.isnull().sum().sum() / pf.size) * 100):.1f}% complete")
print(f"   • Time Span: {(pf['start_time'].max() - pf['start_time'].min()).days} days")
print(f"   • Geographic Coverage: {pf['city'].nunique()} cities, {pf['state'].nunique()} states")

print("\n TARGET VARIABLE INSIGHTS:")
print(f"   • Charging Duration Range: {pf[target_var].min():.1f} - {pf[target_var].max():.1f} minutes")
print(f"   • Average Charging Time: {pf[target_var].mean():.1f} minutes")
print(f"   • Most Common Duration: {pf[target_var].mode().iloc[0]:.1f} minutes")
print(f"   • Distribution: {'Right-skewed' if pf[target_var].skew() > 1 else 'Nearly normal' if abs(pf[target_var].skew()) < 1 else 'Left-skewed'}")

print("\n  BEV ECOSYSTEM INSIGHTS:")
print(f"   • Vehicle Diversity: {pf['vehicle_make'].nunique()} makes, {pf['vehicle_model'].nunique()} models")
print(f"   • Charging Infrastructure: {pf['charging_station_id'].nunique():,} stations, {pf['station_operator'].nunique()} operators")
print(f"   • User Base: {pf['user_id'].nunique():,} active users")
print(f"   • Technology Spread: {pf['charger_type'].nunique()} charger types, {pf['plug_type'].nunique()} plug standards")

print("\n  ANALYTICAL READINESS:")
missing_critical = (pf[key_numerical[:6]].isnull().sum() > len(pf) * 0.1).sum()
print(f"   • Critical Missing Data: {' ' + str(missing_critical) + ' features >10% missing' if missing_critical > 0 else ' No critical missing data'}")
print(f"   • Feature Diversity: {len(categorical_features)} categorical, {len(numerical_features)} numerical")
print(f"   • Correlation Strength: {len(strong_corr)} strong predictors identified")
print(f"   • Data Balance: {' Well distributed' if pf[target_var].std()/pf[target_var].mean() < 2 else '⚠️ High variance'}")

print("\n NEXT STEPS FOR DATA CLEANING:")
print("   1. Handle missing values in weather and grid data")
print("   2. Address outliers in charging duration and power data")
print("   3. Validate temporal consistency (start_time < end_time)")
print("   4. Ensure SOC logic consistency (final ≥ initial)")
print("   5. Standardize categorical variables (text consistency)")

print("\n RESEARCH MODELING INSIGHTS:")
print("   • Target is predictable with current features")
print("   • Temporal patterns suggest time-series approaches")
print("   • Multi-modal charging behavior requires segmentation")
print("   • Weather and grid factors show promising correlations")

print(" DATA UNDERSTANDING PHASE:  COMPLETED")
print(" READY FOR: Data Cleaning Phase")

DATA UNDERSTANDING SUMMARY & RESEARCH INSIGHTS
 DATASET CHARACTERISTICS:
   • Dataset Size: 75,000 charging sessions across 77 features
   • Data Quality: 98.7% complete
   • Time Span: 2556 days
   • Geographic Coverage: 12 cities, 11 states

 TARGET VARIABLE INSIGHTS:
   • Charging Duration Range: 5.0 - 1055.2 minutes
   • Average Charging Time: 62.8 minutes
   • Most Common Duration: 5.0 minutes
   • Distribution: Right-skewed

  BEV ECOSYSTEM INSIGHTS:
   • Vehicle Diversity: 14 makes, 28 models
   • Charging Infrastructure: 53,974 stations, 11 operators
   • User Base: 50,928 active users
   • Technology Spread: 3 charger types, 5 plug standards

  ANALYTICAL READINESS:
   • Critical Missing Data:  No critical missing data
   • Feature Diversity: 34 categorical, 40 numerical
   • Correlation Strength: 3 strong predictors identified
   • Data Balance:  Well distributed

 NEXT STEPS FOR DATA CLEANING:
   1. Handle missing values in weather and grid data
   2. Address outliers in cha