# Spatial Preprocessing of the Datasets

### 1.1 Importing the necessary libraries and modules


In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

### 1.2 Load the dataset

In [3]:
# Load the processed EV table from the project's data directory.
input_csv = "../data/processed/EV_spatial.csv"
df = pd.read_csv(input_csv)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,country,city,postal_code,model_year,model,electric_vehicle_type,cafv_eligibility,electric_range,legislative_district,vehicle_location,...,make_RIVIAN,make_ROLLS ROYCE,make_SMART,make_SUBARU,make_TESLA,make_TH!NK,make_TOYOTA,make_VOLKSWAGEN,make_VOLVO,make_WHEEGO ELECTRIC CARS
0,0.950721,2.185227,-0.020657,-0.172448,1.443448,0,0,2.523975,0.528607,POINT (-122.30839 47.610365),...,0,0,0,0,1,0,0,0,0,0
1,-0.777479,-0.144236,-0.062009,0.831106,1.443448,0,1,-0.639721,-1.891247,POINT (-122.179458 47.802589),...,0,0,0,0,1,0,0,0,0,0
2,0.950721,2.185227,-0.025979,-0.506966,-0.501228,0,0,2.295667,0.461389,POINT (-122.34848 47.632405),...,0,0,0,0,1,0,0,0,0,0
3,0.950721,-0.450334,-0.059553,-1.51052,-0.501228,0,0,1.643359,-1.622374,POINT (-122.03646 47.534065),...,0,0,0,0,1,0,0,0,0,0
4,-1.142709,-0.716527,0.08989,0.16207,1.443448,0,1,-0.639721,-0.412447,POINT (-122.55717 47.733415),...,0,0,0,0,1,0,0,0,0,0


In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(177866, 98)

In [6]:
# List the column labels to see which features are present.
df.columns

Index(['country', 'city', 'postal_code', 'model_year', 'model',
       'electric_vehicle_type', 'cafv_eligibility', 'electric_range',
       'legislative_district', 'vehicle_location', 'electric_utility',
       '2020_census_tract', 'state_AE', 'state_AK', 'state_AL', 'state_AP',
       'state_AR', 'state_AZ', 'state_BC', 'state_CA', 'state_CO', 'state_CT',
       'state_DC', 'state_DE', 'state_FL', 'state_GA', 'state_HI', 'state_IA',
       'state_ID', 'state_IL', 'state_IN', 'state_KS', 'state_KY', 'state_LA',
       'state_MA', 'state_MD', 'state_MI', 'state_MN', 'state_MO', 'state_NC',
       'state_NE', 'state_NH', 'state_NJ', 'state_NM', 'state_NV', 'state_NY',
       'state_OH', 'state_OK', 'state_OR', 'state_PA', 'state_RI', 'state_SC',
       'state_TN', 'state_TX', 'state_UT', 'state_VA', 'state_WA', 'state_WY',
       'make_ALFA ROMEO', 'make_AUDI', 'make_AZURE DYNAMICS', 'make_BENTLEY',
       'make_BMW', 'make_CADILLAC', 'make_CHEVROLET', 'make_CHRYSLER',
       'make_DODG

### 1.3 Checking NaN values

In [7]:
# Total number of missing cells across the whole frame
# (per-column counts first, then summed into one number).
df.isna().sum().sum()

np.int64(0)

### 1.4 Extracting "Latitude" & "Longitude" from the Vehicle Location column

In [8]:
# Regex with two capture groups for the two numbers inside "POINT (<a> <b>)".
pattern = r'POINT\s*\(([-\d\.]+)\s+([-\d\.]+)\)'

# Run the regex over the whole column at once. Rows that do not match yield
# NaN in both groups. The first captured number is stored as longitude and
# the second as latitude.
coords = df['vehicle_location'].str.extract(pattern)
df['longitude'] = coords[0].astype(float)
df['latitude'] = coords[1].astype(float)

In [9]:
# Round both coordinates to whole numbers and store them in the nullable
# Int64 dtype, which keeps missing entries as <NA> instead of failing the cast.
for axis_name in ('latitude', 'longitude'):
    df[axis_name] = df[axis_name].round(0).astype('Int64')

In [10]:
# Impute coordinates that are still missing with the column median.
#
# The median of an integer column can land on a half (e.g. 47.5), and filling
# missing entries of a nullable Int64 column with a non-integer value raises
# TypeError. For integer-typed columns the median is therefore rounded to a
# whole number first (round-half-to-even, the same rule Series.round uses).
# A column with no valid values has no median and is left untouched.
for coord in ('latitude', 'longitude'):
    fill_value = df[coord].median()
    if pd.isna(fill_value):
        continue
    if pd.api.types.is_integer_dtype(df[coord].dtype):
        fill_value = int(round(float(fill_value)))
    df[coord] = df[coord].fillna(fill_value)

In [11]:
# Inspect the distinct latitude values present in the column.
df['latitude'].unique()

<IntegerArray>
[48, 47, 49, 46, 39, 36, 35, 38, 34, 40, 37, 33, 42, 28, 41, 29, 43, 31, 32,
 21, 30, 44, 22, 45, 61, 26, 27, 20]
Length: 28, dtype: Int64

### 1.5 Dropping the Vehicle Location column now that latitude and longitude have been extracted

In [12]:
# The raw location string is no longer needed once longitude/latitude exist.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError on the already-removed column.
df = df.drop(columns=['vehicle_location'], errors='ignore')

In [13]:
# Preview the first five rows to check the current column layout.
df.head(n=5)

Unnamed: 0,country,city,postal_code,model_year,model,electric_vehicle_type,cafv_eligibility,electric_range,legislative_district,electric_utility,...,make_SMART,make_SUBARU,make_TESLA,make_TH!NK,make_TOYOTA,make_VOLKSWAGEN,make_VOLVO,make_WHEEGO ELECTRIC CARS,longitude,latitude
0,0.950721,2.185227,-0.020657,-0.172448,1.443448,0,0,2.523975,0.528607,-0.290798,...,0,0,1,0,0,0,0,0,-122,48
1,-0.777479,-0.144236,-0.062009,0.831106,1.443448,0,1,-0.639721,-1.891247,-0.104446,...,0,0,1,0,0,0,0,0,-122,48
2,0.950721,2.185227,-0.025979,-0.506966,-0.501228,0,0,2.295667,0.461389,-0.290798,...,0,0,1,0,0,0,0,0,-122,48
3,0.950721,-0.450334,-0.059553,-1.51052,-0.501228,0,0,1.643359,-1.622374,1.142097,...,0,0,1,0,0,0,0,0,-122,48
4,-1.142709,-0.716527,0.08989,0.16207,1.443448,0,1,-0.639721,-0.412447,-0.104446,...,0,0,1,0,0,0,0,0,-123,48


### 1.6 Saving the Dataset

In [14]:
import os

# Write the result as a gzip-compressed CSV into the same directory as the
# CSV path below; the row index is not written.
output_path = '../data/processed/EV_spatial.csv'
output_dir = os.path.dirname(output_path)
compressed_output = os.path.join(output_dir, "ev_spatial_preprocessed.csv.gz")
df.to_csv(compressed_output, index=False, compression="gzip")