# Data Summary Workflow

## Steps
- Check Data Size
- Review Attributes Data Types
- Table Quick View
- Identify General Data Quality Issues

In [None]:
# Checking the data size
import os
import pandas as pd
import numpy as np

# Source CSV location, given as a relative path (resolved against the current working directory)
file_name = 'Electric_Vehicle_Population_Data.csv'
dir_path = '../../MyData'

# Join directory and file name; despite the variable name, the result is still a relative path
abs_path = os.path.join(dir_path, file_name)

# Load the dataset into a DataFrame
df = pd.read_csv(abs_path)

# DataFrame.shape is (number of rows, number of columns)
rows, columns = df.shape
print("Data Size: ", '\n', columns, "Columns ", '\n', rows, "rows", '\n')

print("Attributes Data Types:", '\n', df.dtypes)

print('\n', "Table Quick View: ")
# Bare trailing expression so the notebook renders the first rows as a table
df.head()



Data Size:  
 17 Columns  
 121978 rows 

Attributes Data Types: 
 VIN (1-10)                                            object
County                                                object
City                                                  object
State                                                 object
Postal Code                                          float64
Model Year                                             int64
Make                                                  object
Model                                                 object
Electric Vehicle Type                                 object
Clean Alternative Fuel Vehicle (CAFV) Eligibility     object
Electric Range                                         int64
Base MSRP                                              int64
Legislative District                                 float64
DOL Vehicle ID                                         int64
Vehicle Location                                      object
Electric Utility  

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,5YJ3E1EB2J,Suffolk,Suffolk,VA,23435.0,2018,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,215,0,,476647986,POINT (-76.42443 36.8752),,51800080000.0
1,5YJ3E1ECXL,Yakima,Yakima,WA,98908.0,2020,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,308,0,14.0,103490145,POINT (-120.56916 46.58514),PACIFICORP,53077000000.0
2,WA1LAAGE7M,Yakima,Yakima,WA,98908.0,2021,AUDI,E-TRON,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,222,0,14.0,144941534,POINT (-120.56916 46.58514),PACIFICORP,53077000000.0
3,5YJ3E1EA1K,Danville,Danville,VA,24541.0,2019,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,220,0,,168513922,POINT (-79.4172 36.58598),,51590000000.0
4,1FADP5CU9E,Norfolk,Norfolk,VA,23518.0,2014,FORD,C-MAX,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,19,0,,150749378,POINT (-76.21549 36.92478),,51710010000.0


## General Data Quality Issues

In [3]:
# Count the cells equal to 0 in every column.
# `df == 0` compares the whole frame element-wise and `.sum()` totals each
# column, so one call covers all columns. The previous per-column loop never
# used its loop variable and just printed this same result once per column.
count = (df == 0).sum()
print(count)


VIN (1-10)                                               0
County                                                   0
City                                                     0
State                                                    0
Postal Code                                              0
Model Year                                               0
Make                                                     0
Model                                                    0
Electric Vehicle Type                                    0
Clean Alternative Fuel Vehicle (CAFV) Eligibility        0
Electric Range                                        4723
Base MSRP                                            11686
Legislative District                                     0
DOL Vehicle ID                                           0
Vehicle Location                                         0
Electric Utility                                         0
2020 Census Tract                                       

In [29]:
''' !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# IDENTIFYING Data Quality Issues

# Summary Table
'''
import os
import pandas as pd
import tabulate

# CSV read for the quality summary, given as a relative path
file_name = 'Electric_Vehicle_Population_Data2.csv'
dir_path = '../../MyData'
abs_path = os.path.join(dir_path, file_name)

# Rebinds `df` to the contents of this file
df = pd.read_csv(abs_path)

def detect_missing_values(column):
    """Count the missing entries in ``column``.

    Args:
        column: pandas Series to inspect.

    Returns:
        The number of entries pandas flags as missing (NaN / None).
    """
    missing_mask = column.isnull()
    return missing_mask.sum()

def detect_duplicates(column):
    """Count the entries of ``column`` that repeat an earlier value.

    Args:
        column: pandas Series to inspect.

    Returns:
        The number of entries flagged by ``Series.duplicated()``; the first
        occurrence of each value is not counted.
    """
    repeated_mask = column.duplicated()
    return repeated_mask.sum()

def detect_invalid_data_types(column):
    """Count the entries of ``column`` that are not an int, float or str.

    Args:
        column: pandas Series to inspect.

    Returns:
        The number of entries whose value fails the ``isinstance`` check
        against ``(int, float, str)``. NaN is a float and therefore passes;
        ``None`` does not.
    """
    accepted_types = (int, float, str)

    def _is_unexpected(value):
        # True when the value is none of the accepted scalar types
        return not isinstance(value, accepted_types)

    flags = column.apply(_is_unexpected)
    return flags.sum()

def detect_zeroes(column):
    """Count the entries of ``column`` that are equal to 0.

    The count is taken from the ``column`` argument itself. The earlier
    version ignored the argument and read the module-level ``df[col]``, so it
    only gave the right answer when called from a loop that happened to bind
    those globals.

    Args:
        column: pandas Series to inspect.

    Returns:
        The number of entries that compare equal to 0.
    """
    zero_mask = (column == 0)
    return zero_mask.sum()


# Gather the per-column issue counts, keyed by column name
issues = {}
for col in df.columns:
    series = df[col]
    issues[col] = {
        'missing_values': detect_missing_values(series),
        'duplicates': detect_duplicates(series),
        'invalid_data_types': detect_invalid_data_types(series),
        'irrev_zeroes': detect_zeroes(series),
    }

# One table row per column: the column name followed by its counts,
# in the order the keys were inserted above
table = [[name, *counts.values()] for name, counts in issues.items()]

# Render the summary as a grid-formatted text table
summary_headers = ['Column', 'Missing', 'Duplicates', 'Invalid DTypes', 'irrev_zeroes']
print(tabulate.tabulate(table, headers=summary_headers, tablefmt='grid'))

+---------------------------------------------------+-----------+--------------+------------------+----------------+
| Column                                            |   Missing |   Duplicates |   Invalid DTypes |   irrev_zeroes |
| VIN (1-10)                                        |         0 |         8218 |                0 |              0 |
+---------------------------------------------------+-----------+--------------+------------------+----------------+
| County                                            |         0 |        11944 |                0 |              0 |
+---------------------------------------------------+-----------+--------------+------------------+----------------+
| City                                              |         0 |        11670 |                0 |              0 |
+---------------------------------------------------+-----------+--------------+------------------+----------------+
| State                                             |         0 