In [None]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px 


## Data Loading

In [None]:
# Location of the raw Electric Vehicle Population CSV export.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists — consider a path relative to the notebook.
EV_CSV_PATH = r"C:\Users\amarj\OneDrive\Desktop\end-to-end-data-analyst-project\Electric_trend_data_analysis\Electric_Vehicle_Population_Data.csv"
ev_data = pd.read_csv(EV_CSV_PATH)

In [None]:
# Preview the first rows. (A DataFrame is not callable, so `ev_data()` raised TypeError.)
ev_data.head()

## EDA : Exploratory Data Analysis

In [None]:
# Column names, non-null counts and dtypes of the frame as loaded.
ev_data.info()

### Data Overview: 
- The dataset contains information about the population of electric vehicles.

### Here the dataset includes following columns:

1. VIN_(1-10): The first 10 characters of the VIN (the full VIN is a set of 17 characters that is unique to the vehicle, like a vehicle's unique identity code)
2. County: County name
3. City: City name
4. State: State abbreviation
5. Postal_Code: Postal code
6. Model_Year: Year of the vehicle model
7. Make: Manufacturer of the vehicle
8. Model: Vehicle model
9. Clean_Alternative_Fuel_Vehicle_(CAFV)_Eligibility: Eligibility status for clean alternative fuel vehicle
10. Electric_Range: Electric range of the vehicle
11. Base_MSRP: Base manufacturer's suggested retail price
12. Legislative_District: Legislative district number
13. DOL_Vehicle_ID: Department of Licensing vehicle ID
14. Vehicle_Location: Location coordinates
15. Electric_Utility: Electric utility provider
16. 2020_Census_Tract: Census tract for the year 2020
17. Electric_Vehicle_Type: Type of electric vehicle (e.g., PHEV, BEV)

## Data Cleaning

In [None]:
# Number of distinct values in the 'VIN (1-10)' column.
ev_data[['VIN (1-10)']].nunique()

In [None]:
# How many rows share each 'VIN (1-10)' value.
ev_data[['VIN (1-10)']].value_counts()

In [None]:
# (rows, columns) of the frame before cleaning.
ev_data.shape

In [None]:
# Number of rows that are exact duplicates of an earlier row.
# A result of 0 means no row is repeated across all columns.
ev_data.duplicated().sum()

In [None]:
# Missing-value count per column.
ev_data.isnull().sum()

In [None]:
# Drop every row that has at least one missing value.
# Reassignment instead of `inplace=True`: same result, but the step is an
# explicit expression and can be chained or inspected.
ev_data = ev_data.dropna()

In [None]:
# Non-null count of every other column for each ('VIN (1-10)', 'County') pair.
ev_data.groupby(by = ['VIN (1-10)' , 'County']).count()

In [None]:
# Re-check row count, non-null counts and dtypes after dropping rows with missing values.
ev_data.info()

## Data Visualization

In [None]:
# Distribution of Model Year

# Explicit figure/axes so every label is set on the Axes that seaborn draws on.
fig, ax = plt.subplots(figsize=(6, 6))
sns.histplot(ev_data['Model Year'], bins=20, kde=True, ax=ax)
# The plotted column is 'Model Year'; the labels previously read "Modal Year".
ax.set_title("Distribution By Model Year")
ax.set_xlabel('Model Year')
ax.set_ylabel("EV's Frequency")
plt.show()



In [None]:
# Distribution of the electric range of the vehicles
fig, ax = plt.subplots(figsize=(6, 6))
sns.histplot(ev_data['Electric Range'], bins=20, kde=True, ax=ax)
# Title spelling fixed ("Vehical" -> "Vehicles").
ax.set_title("Distribution of Electric Vehicles by Range")
ax.set_xlabel("Electric Range ")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Count of Clean Alternative Fuel Vehicle (CAFV) eligibility categories
cafv_col = 'Clean Alternative Fuel Vehicle (CAFV) Eligibility'
fig, ax = plt.subplots(figsize=(6, 6))
# seaborn deprecates `palette` without `hue`; mapping `hue` to the same column
# keeps one colour per category, and the redundant legend is switched off.
# NOTE(review): `legend=False` on countplot needs seaborn >= 0.13 — confirm the environment.
sns.countplot(
    data=ev_data,
    y=cafv_col,
    hue=cafv_col,
    palette=['#FF6347', '#4682B4', '#32CD32'],
    legend=False,
    ax=ax,
)
ax.set_title("Clean Alternative Fuel Vehicle (CAFV) Eligibility")
ax.set_ylabel('CAFV Eligibility')
plt.show()

In [None]:
# Count of electric vehicle types
fig, ax = plt.subplots(figsize=(6, 6))
# seaborn deprecates `palette` without `hue`; mapping `hue` to the same column
# keeps one colour per category, and the redundant legend is switched off.
# NOTE(review): `legend=False` on countplot needs seaborn >= 0.13 — confirm the environment.
sns.countplot(
    data=ev_data,
    y='Electric Vehicle Type',
    hue='Electric Vehicle Type',
    palette=['#FF6347', '#4682B4'],
    legend=False,
    ax=ax,
)
# Title spelling fixed ("Vehical" -> "Vehicle").
ax.set_title("Count of Electric Vehicle Type")
plt.show()

In [None]:
# Group by county and count the electric vehicle entries, largest first.
# Only the 'City' column is counted (its non-null entries), instead of counting
# every column and then discarding all but one.
county_counts = (
    ev_data.groupby('County')['City']
    .count()
    .sort_values(ascending=False)
)

# Extract the top 10 counties and their counts
top_ten_county = county_counts.index[:10]
top_ten_values = county_counts.values[:10]

fig = px.bar(
    x=top_ten_county,
    y=top_ten_values,
    labels={'x': 'County Name', 'y': 'Number of Electric Vehicles'},
    color=top_ten_values,
    title='Top 10 Counties with Most Electric Vehicles ',
)
fig.show()



In [None]:
# Group by make and count the electric vehicle entries, largest first.
# Only the 'City' column is counted (its non-null entries).
make_counts = (
    ev_data.groupby('Make')['City']
    .count()
    .sort_values(ascending=False)
)

# Extract the top 10 makes and their counts
top_ten_company = make_counts.index[:10]
top_ten_values = make_counts.values[:10]

fig = px.bar(
    x=top_ten_company,
    y=top_ten_values,
    labels={'x': 'Company Name', 'y': 'Number of Electric Vehicles'},
    color=top_ten_values,
    title='Top 10 Company with Most Electric Vehicles ',
)
fig.show()

In [None]:
# Ten cities with the most entries, as a two-column frame ('City', 'count').
# The column names are set explicitly so they do not depend on how the installed
# pandas version names the output of value_counts().reset_index().
car_count_city = (
    ev_data['City']
    .value_counts()
    .nlargest(10)
    .rename_axis('City')
    .reset_index(name='count')
)

# With a data frame supplied, plotly labels are keyed by column name.
fig_carper_city = px.bar(
    car_count_city,
    x='City',
    y='count',
    color='count',
    labels={'City': 'City Name', 'count': 'Number of Cars'},
    title="Top 10 Cities by Electric Vehicles",
)

In [None]:

# Display the top-10-cities bar chart built in the previous cell.
fig_carper_city

In [None]:
# Display the table behind the top-10-cities chart.
car_count_city

In [None]:
# Share of each electric vehicle type (BEV vs PHEV) over the whole dataset.
# No per-county grouping is performed here, so the title no longer claims one.
vehical_type = (
    ev_data['Electric Vehicle Type']
    .value_counts(normalize=True)
    .rename_axis('Electric Vehicle Type')
    # Name the value column explicitly rather than relying on the pandas default.
    .reset_index(name='proportion')
)
px.pie(
    data_frame=vehical_type,
    names='Electric Vehicle Type',
    values='proportion',
    # Colour sectors by vehicle type rather than by their numeric share.
    color='Electric Vehicle Type',
    title="Percentage of BEV vs PHEV",
)

In [None]:
# Top electric vehicle make/model pairs by electric range
def top_car_modal_byrange(data=None, n=5):
    """Print the ``n`` distinct (Make, Model, Electric Range) rows with the largest range.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame containing 'Make', 'Model' and 'Electric Range' columns.
        Defaults to the notebook-level ``ev_data``.
    n : int, default 5
        Number of rows to print.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    if data is None:
        data = ev_data

    # Sort first, then de-duplicate, so the first occurrence kept for each
    # (Make, Model, Electric Range) combination follows the descending order.
    range_order = data.sort_values(by='Electric Range', ascending=False).reset_index(drop=True)
    range_order = range_order[['Make', 'Model', 'Electric Range']]

    top_n = range_order.drop_duplicates().reset_index(drop=True).head(n)
    # The heading previously said "TOP 10" although five rows were printed.
    print(f"TOP {n} ELECTRIC CAR COMPANY BY ELECTRIC RANGE : \n ")
    print(top_n)
    
# Print the highest-range make/model rows for the notebook's data.
top_car_modal_byrange()

In [None]:
def bottom_car_modal_byrange(data=None, n=5):
    """Print the ``n`` distinct (Make, Model, Electric Range) rows with the smallest range.

    Rows are listed in descending order of range, so the last printed row has
    the lowest value.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame containing 'Make', 'Model' and 'Electric Range' columns.
        Defaults to the notebook-level ``ev_data``.
    n : int, default 5
        Number of rows to print.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    if data is None:
        data = ev_data

    range_order = data.sort_values(by='Electric Range', ascending=False).reset_index(drop=True)
    range_order = range_order[['Make', 'Model', 'Electric Range']]

    # tail() of the descending, de-duplicated table gives the lowest ranges.
    bottom_n = range_order.drop_duplicates().reset_index(drop=True).tail(n)
    # The heading previously said "Bottom 10" although five rows were printed.
    print(f"Bottom {n} ELECTRIC CAR COMPANY BY ELECTRIC RANGE : \n ")
    print(bottom_n)
# Print the lowest-range make/model rows for the notebook's data.
bottom_car_modal_byrange() 

In [None]:
# In which model years does the number of electric cars increase?
fig, ax = plt.subplots()
# With `data=` supplied, the column is referenced by name instead of passing the Series again.
sns.histplot(data=ev_data, x='Model Year', bins=20, ax=ax)
ax.set_title("Number of Electric Vehicles by Model Year")
plt.show()

In [None]:
# Number of electric cars per model year
cars_num_year_wise = ev_data.groupby('Model Year')['VIN (1-10)'].count().reset_index()
# When a data frame and column names are passed, plotly matches `labels` keys
# against column names; the previous key 'y' was ignored and the axis showed "VIN (1-10)".
px.line(
    cars_num_year_wise,
    x='Model Year',
    y='VIN (1-10)',
    markers=True,
    labels={'VIN (1-10)': 'Nums_Cars'},
    title='Year Wise Electric Car Purchase ',
)

Taking a step further in data cleaning and data modeling, I am going to drop some fields (columns) which, in my observation, do not help in producing meaningful insights.

#### Here. 
1. VIN (1-10): It is a unique identifier for each vehicle but does not contribute to understanding trends or making predictions.

2. DOL Vehicle ID: This is another unique identifier with high cardinality and likely does not add value to the analysis. A unique identification number for each vehicle present in Transactions dataset. Transactions done on the same vehicle will have the same DOL Vehicle ID.

3. Vehicle Location: Removing this column simplifies the dataset and focuses the analysis on more relevant features. Other columns like "County" and "City" provide sufficient geographical context for non-geospatial analysis.

4. Legislative District : Similar information is captured by "County" and "City". Dropping it reduces the complexity of the dataset and simplifies the analysis.

5. Electric Utility : There might be high variability and inconsistency here. Hence dropping it can simplify the dataset.

6. 2020 Census Tract : The census tract data might not add significant value. Hence, removing this column simplifies the dataset.

7. Base MSRP : The MSRP may vary significantly by market and does not necessarily reflect the actual purchase price or incentives, potentially complicating the analysis without adding clear value.


#### So, I will drop these seven columns: 'VIN_(1-10)', 'DOL_Vehicle_ID', 'Vehicle_Location', 'Legislative_District', 'Electric_Utility', '2020_Census_Tract' and 'Base_MSRP'.

In [None]:
# List the current column names before choosing which ones to drop.
ev_data.columns

In [None]:
# Keep an independent copy of the frame before any columns are dropped.
df1 = ev_data.copy()

In [None]:
# Columns removed for the remaining analysis.
# NOTE(review): the markdown above lists 'Vehicle Location' among the columns to
# drop, but this list drops 'Postal Code' instead — confirm which one is intended.
COLUMNS_TO_DROP = [
    'VIN (1-10)',
    'Postal Code',
    'Base MSRP',
    'Legislative District',
    'DOL Vehicle ID',
    'Electric Utility',
    '2020 Census Tract',
]
# Derive the reduced frame from the `df1` snapshot instead of dropping in place:
# re-running this cell then gives the same result rather than raising KeyError
# because the columns are already gone.
ev_data = df1.drop(columns=COLUMNS_TO_DROP)

In [None]:
# (rows, columns) after the column drop.
ev_data.shape

In [None]:
# Remaining columns, their non-null counts and dtypes.
ev_data.info()

In [None]:
# `df` is a second name for the same DataFrame object; no copy is made,
# so changes made through either name are visible through both.
df = ev_data

In [None]:
# (rows, columns) of the final frame.
df.shape

#### Conclusion : 
This dataset describes the electric vehicle population and was obtained from the "us.gov" website. <br>
Converting and cleaning the dataset was challenging. <br>
However, with the help of EDA it was easy to get insights from this dataset.
<br>
#### Observations:
- We made the following observations:
1. Distribution of Electric Cars over  Model Year.
2. Distribution of Electric Range Of the Vehicle.
3. Count of Clean Alternative Fuel Vehicle (CAFV) Eligibility.
4. Count of Electric Vehicle Type.
5. Percentage of BEV vs PHEV by Country.
6. Year wise number of cars.
<br>
- As well as we have answered some questions to generate insights from the dataset.
1. What is the Year Wise Cars sales growth?
2. What are the Top 10 count of cars per Country?
3. What is the Top 10 count of cars per city?
4. What is the Top 10 count of cars per county?
5. How do the top 5 and bottom 5 models by electric range compare?
6. In which years did the number of electric vehicles increase?
7. Which are the top 10 companies making electric vehicles?
<br>
<br>
#### Conclusion:
- All the top 5 electric vehicles produced by Tesla company
- Electric car sales shot up by 400 % between 2020 and 2023.
- Washington had the most cars registered by state, followed by California and Virginia.
- Seattle is the top city followed by Bellevue in top 10 with electric Cars.
- King County is the top in top 10 county with more electric Vehicles followed by Snohomish and Pierce.
- Tesla is the top company among the top 10 making the most electric vehicles, followed by Nissan, Chevrolet and Ford.
- Among all the Electric vehicle sold till 2024 : 21.8 % are Battery Electric Vehicle (BEV) , 78.2 % are plug-in Hybrid Electric Vehicle (PHEV) 
- 98052 postal code contains the high electric cars.
- Jaguar has a higher electric range compared to other makes.
- From 2008 to 2022 the number of electric vehicles increased.
- Tesla is the most popular electric car make in Washington state, followed by Nissan and Chevrolet.
- Tesla is also the most popular make in Seattle, followed by Nissan, Chevrolet, and BMW.
- Tesla, Nissan, Ford and BMW are among the top 5 makes, whereas Think and Bentley models are among the bottom 5.
