# Exploratory Workflow

## Steps
- Check Data Size
- Review Data Types
- Table Quick View
- General Data Quality Issues

In [4]:
# Checking the data size
import os
import pandas as pd
import numpy as np

# Dataset location; the relative path is resolved against the current
# working directory when pandas opens the file
dir_path = '../../MyData'
file_name = 'Electric_Vehicle_Population_Data.csv'

# Join directory and file name into the path handed to pandas
abs_path = os.path.join(dir_path, file_name)

# Load the whole CSV into a DataFrame
df = pd.read_csv(abs_path)

# DataFrame.shape is (number of rows, number of columns)
rows, columns = df.shape
print("Data Size: ", '\n', columns, "Columns ", '\n', rows, "rows", '\n')

# Per-column dtypes as inferred by read_csv
print("Attributes Data Types:", '\n',df.dtypes)

Data Size:  
 17 Columns  
 121978 rows 

Attributes Data Types: 
 VIN (1-10)                                            object
County                                                object
City                                                  object
State                                                 object
Postal Code                                          float64
Model Year                                             int64
Make                                                  object
Model                                                 object
Electric Vehicle Type                                 object
Clean Alternative Fuel Vehicle (CAFV) Eligibility     object
Electric Range                                         int64
Base MSRP                                              int64
Legislative District                                 float64
DOL Vehicle ID                                         int64
Vehicle Location                                      object
Electric Utility  

## Table Quick View

In [32]:
# Preview the first rows of the frame (DataFrame.head() defaults to n=5);
# as the last expression of the cell it is rendered as the cell output.
df.head()

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,7SAYGDEE1P,King,Bothell,WA,98011.0,2023,TESLA,MODEL Y,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,0,0,1.0,226064177,POINT (-122.20563 47.76144),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033020000.0
1,WBY1Z8C57H,King,Seattle,WA,98112.0,2017,BMW,I3,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,97,0,43.0,478434827,POINT (-122.30716 47.62687),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033010000.0
2,7SAYGDEF3N,King,Bellevue,WA,98008.0,2022,TESLA,MODEL Y,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,0,0,48.0,192930685,POINT (-122.11867 47.63131),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033020000.0
3,KNDCC3LG4N,Thurston,Lacey,WA,98503.0,2022,KIA,NIRO,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,0,0,22.0,224687739,POINT (-122.82324 47.04437),PUGET SOUND ENERGY INC,53067010000.0
4,5YJ3E1EB5L,King,Snoqualmie,WA,98024.0,2020,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,322,0,5.0,124663843,POINT (-121.89086 47.56812),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033030000.0


## General Data Quality Issues

In [2]:
import os
import pandas as pd
import tabulate

# Directory and file name of the CSV analysed in this section; the relative
# path is resolved against the current working directory.
dir_path = '../../MyData'
file_name = 'Electric_Vehicle_Population_Data2.csv'
abs_path = os.path.join(dir_path, file_name)
# NOTE(review): this rebinds `df` to 'Electric_Vehicle_Population_Data2.csv',
# a different file from the one read for the data-size check — confirm that
# the switch is intentional.
df = pd.read_csv(abs_path)

def detect_missing_values(column):
    """Count the missing (NaN/None/NaT) entries in ``column``.

    Parameters
    ----------
    column : pandas.Series
        The column to inspect.

    Returns
    -------
    Integer count of entries for which ``isna()`` is true.
    """
    missing_mask = column.isna()
    return missing_mask.sum()

def detect_duplicates(column):
    """Count the entries of ``column`` that repeat an earlier value.

    The first occurrence of each value is not counted; every later
    occurrence is.

    Parameters
    ----------
    column : pandas.Series
        The column to inspect.

    Returns
    -------
    Integer count of repeated entries.
    """
    repeat_mask = column.duplicated(keep='first')
    return repeat_mask.sum()

def detect_invalid_data_types(column):
    """Count the entries of ``column`` that are not int, float or str.

    Parameters
    ----------
    column : pandas.Series
        The column to inspect.

    Returns
    -------
    Integer count of entries whose type is outside (int, float, str).
    """
    accepted_types = (int, float, str)

    def is_unsupported(value):
        # True for anything that is not an instance of an accepted type
        return not isinstance(value, accepted_types)

    return column.apply(is_unsupported).sum()

def detect_zeroes(column):
    """Count the entries of ``column`` that are equal to zero.

    Parameters
    ----------
    column : pandas.Series
        The column to inspect.

    Returns
    -------
    Integer count of entries that compare equal to 0. Missing values and
    non-numeric entries never compare equal to 0, so they are not counted.
    """
    # Work on the argument only, so the result does not depend on any
    # notebook-level variable being set when the function is called.
    zero_mask = (column == 0)
    return zero_mask.sum()


# Collect the issue counts for every column, keyed by column name
issues = {}
for col in df.columns:
    series = df[col]
    issues[col] = dict(
        missing_values=detect_missing_values(series),
        duplicates=detect_duplicates(series),
        invalid_data_types=detect_invalid_data_types(series),
        irrev_zeroes=detect_zeroes(series),
    )

# One table row per column: the column name followed by its counts, in the
# same order as the headers below (dicts preserve insertion order)
table = [[name, *counts.values()] for name, counts in issues.items()]

headers = ['COLUMNS', 'MISSING', 'DUPLICATES', 'INVALID D_TYPES', 'IRREV_ZEROES']
print(tabulate.tabulate(table, headers=headers, tablefmt='grid'))

+---------------------------------------------------+-----------+--------------+-------------------+----------------+
| COLUMNS                                           |   MISSING |   DUPLICATES |   INVALID D_TYPES |   IRREV_ZEROES |
| VIN (1-10)                                        |         0 |         8218 |                 0 |              0 |
+---------------------------------------------------+-----------+--------------+-------------------+----------------+
| County                                            |         0 |        11944 |                 0 |              0 |
+---------------------------------------------------+-----------+--------------+-------------------+----------------+
| City                                              |         0 |        11670 |                 0 |              0 |
+---------------------------------------------------+-----------+--------------+-------------------+----------------+
| State                                             |   

In [14]:


# Mean of 'Electric Range' over the non-zero entries only. The filter is a
# vectorized boolean mask, so the column is scanned once by pandas instead
# of twice in Python, and an all-zero column yields NaN rather than raising
# ZeroDivisionError.
electric_range = df['Electric Range']
non_zero_range = electric_range[electric_range != 0]
mean_val = round(non_zero_range.mean(), 2)
print("Mean ignoring zeroes: ", mean_val)

Mean ignoring zeroes:  133.1


## Initial Descriptive Summary

In [None]:
# Descriptive statistics for the numeric columns only (selected through
# np.number), rounded to two decimals for display
numeric_stats = df.describe(include=[np.number])
summary = numeric_stats.round(2)
print(summary)


       Postal Code  Model Year  Electric Range  Base MSRP  \
count     12000.00    12000.00        12000.00   12000.00   
mean      98157.75     2019.34           80.71    1476.88   
std        2665.93        2.96          101.32    9476.19   
min        6340.00     2000.00            0.00       0.00   
25%       98052.00     2018.00            0.00       0.00   
50%       98121.00     2020.00           25.00       0.00   
75%       98370.00     2022.00          200.00       0.00   
max       99403.00     2023.00          337.00  184400.00   

       Legislative District  DOL Vehicle ID  2020 Census Tract  
count              11975.00    1.200000e+04       1.200000e+04  
mean                  29.70    2.022408e+08       5.299599e+10  
std                   14.68    9.090876e+07       1.277854e+09  
min                    1.00    1.090490e+05       4.013610e+09  
25%                   18.00    1.517578e+08       5.303301e+10  
50%                   34.00    1.964529e+08       5.303303e+

## Data Cleaning

- Missing
- Outliers
- Duplicates
- Normalize data (replace zeroes with the precalculated mean)
- Data Formatting
- Quality validation
- Filter and Sort
- 

In [None]:
import pandas as pd

def clean_data(df):
    """Impute missing values and placeholder zeroes in ``df``.

    Steps, in order:

    1. Every NaN in a numeric column is replaced by that column's mean.
       Non-numeric columns are left untouched.
    2. In 'Electric Range' and 'Base MSRP', every 0 is replaced by the mean
       of the column's non-zero values, rounded to two decimals. Both
       columns are converted to float so the mean can be stored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the 'Electric Range' and 'Base MSRP' columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Raises
    ------
    KeyError
        If 'Electric Range' or 'Base MSRP' is not a column of ``df``.
    """
    # 1. Handle missing values and zeroes
    # numeric_only=True keeps the text columns out of the mean; without it
    # recent pandas versions raise on non-numeric data.
    numeric_means = df.mean(numeric_only=True)
    df.fillna(numeric_means, inplace=True)

    for column_name in ('Electric Range', 'Base MSRP'):
        values = df[column_name].astype(float)
        non_zero_values = values[values != 0]
        if non_zero_values.empty:
            # Nothing to impute from: leave an all-zero column unchanged
            continue

        # Mean computed while ignoring zeroes, before imputing it
        non_zero_mean = round(non_zero_values.mean(), 2)
        # replace() returns the new Series; combining the assignment with
        # inplace=True would store None in the column instead.
        df[column_name] = values.replace(0, non_zero_mean)

    return df


