# **Imports and DataFrame cleaning**

In [1]:
#LIBRARIES
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#Graphics
import matplotlib.pyplot as plt
import seaborn as sns
#KNeighbors
from sklearn.neighbors import NearestNeighbors
#Scalers
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
#Data selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
#METRICS
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score,accuracy_score
from sklearn.metrics import mean_squared_error
#RANDOM
import random as rd

In [2]:
df = pd.read_csv("Beev Electric Vehicle Specs Data.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 70 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Full Name                               150 non-null    object 
 1   Model                                   150 non-null    object 
 2   Brand                                   150 non-null    object 
 3   Acceleration (0-100 km/h)               142 non-null    float64
 4   Range (km)                              149 non-null    float64
 5   Average Winter Range                    136 non-null    float64
 6   Highway Winter Range                    135 non-null    float64
 7   City Winter Range                       135 non-null    float64
 8   Average Summer Range                    136 non-null    float64
 9   Highway Summer Range                    135 non-null    float64
 10  City Summer Range                       134 non-null    float6

In [4]:
df.columns

Index(['Full Name', 'Model', 'Brand', 'Acceleration (0-100 km/h)',
       'Range (km)', 'Average Winter Range', 'Highway Winter Range',
       'City Winter Range', 'Average Summer Range', 'Highway Summer Range',
       'City Summer Range', 'Optimal Slow Charge', 'Optimal Fast Charge',
       'Battery Capacity (kW)', 'Useable Battery Capacity', 'Category',
       'Government Incentive Category for Help', 'Vehicle Consumption (Wh)',
       'Average Winter Consumption', 'Highway Winter Consumption',
       'City Winter Consumption', 'Average Summer Consumption',
       'Highway Summer Consumption', 'City Summer Consumption', 'Torque',
       'Availability - deleted', 'Distance', 'Main Finishes', 'Finishes',
       'Height (mm)', 'Main Image', 'Product Image', 'Images', 'Width',
       'Last Edition of Finishes Date', 'Length (mm)', 'Label',
       'Average Range', 'Seats', 'Empty Weight (kg)', 'Max Price', 'Min Price',
       'Slow Charge Port', 'Fast Charge Port', 'Main Price',
       'M

## Checking and filling for Nan Values

In [5]:
nan_values = df.isna().sum()

In [6]:
df['Full Name'].isna().sum()

0

In [7]:
df[df['Main Price'].isna()]   #Mercedes EQB 350 4MATIC around 62000 euros
df['Main Price'].fillna(62000.0, inplace = True)

In [8]:
df['Category'].isna().sum()

0

In [9]:
df['Range (km)'].isna().sum() #the car missing the range is Lexus UX 300e. Online it says 165-300km with average of 235km
df['Range (km)'].fillna(235.0, inplace = True)

# **Model for prediction**

## Creating a new DataFrame

In [10]:
#New Dataframe with just columns we discussed for display
#However, I already put on trello a question so we know which information to display at the end - answered
df_new = pd.DataFrame(zip(df['Full Name'],df['Main Price'],df['Range (km)'], df['Category']))

In [11]:
#Renaming the columns to the correct name
df_new.rename(columns = {0:'Full Name', 1:'Price',2:'Range',3:'Category'}, inplace = True)

In [12]:
df_new['Category'].value_counts(dropna = False)

SUV           68
Utilitaire    26
Citadine      21
Berline       16
Van           11
Compacte       8
Name: Category, dtype: int64

In [13]:
df_new.columns                                        

Index(['Full Name', 'Price', 'Range', 'Category'], dtype='object')

In [14]:
df_new.head()

Unnamed: 0,Full Name,Price,Range,Category
0,Aiways U5,39300.0,410.0,SUV
1,Audi e-tron 50 quattro,71900.0,280.0,SUV
2,Audi e-tron 55 quattro,84200.0,437.0,SUV
3,Audi e-tron GT quattro,103040.0,487.0,Berline
4,Audi e-tron GT RS,142830.0,472.0,Berline


In [15]:
categories = list(df['Category'].unique())
categories

['SUV', 'Berline', 'Citadine', 'Utilitaire', 'Compacte', 'Van']

## Initiating the model

In [16]:
df_model = df_new
df_model[df_model['Category'] == 'SUV']

Unnamed: 0,Full Name,Price,Range,Category
0,Aiways U5,39300.0,410.0,SUV
1,Audi e-tron 50 quattro,71900.0,280.0,SUV
2,Audi e-tron 55 quattro,84200.0,437.0,SUV
5,Audi e-tron S,96600.0,374.0,SUV
6,Audi e-tron S Sportback 55 Quattro,99200.0,380.0,SUV
...,...,...,...,...
144,Volkswagen ID.5 Pro Performance,53050.0,410.0,SUV
145,Volkswagen ID.5GTX,57950.0,410.0,SUV
147,Volvo XC40 Recharge Twin,56150.0,400.0,SUV
148,Volvo C40 Recharge Twin,59760.0,420.0,SUV


In [17]:
X = df_model[df_model['Category'] == 'SUV'][['Price', 'Range']]     #We also need the Category and TCO. Category we will filter as discussed. Need to check on TCO
distanceKNN = NearestNeighbors(n_neighbors=5).fit(X)

In [18]:
distanceKNN.kneighbors([[60000, 350]], 5, return_distance = False)



array([[66, 36, 47, 37, 64]])

In [19]:
df_new[['Full Name', 'Price', 'Range', 'Category']].iloc[[66, 36, 47, 37, 64]]

Unnamed: 0,Full Name,Price,Range,Category
66,MG Marvel R Performance,48990.0,370.0,SUV
36,Ford Mustang Mach-E ER RWD,56500.0,600.0,SUV
47,KIA e-Niro 39 kWh,37100.0,289.0,SUV
37,Ford Mustang Mach-E SR AWD,56500.0,420.0,SUV
64,MG Marvel R,39990.0,402.0,SUV


In [21]:
df_model[['Price','Range']]

Unnamed: 0,Price,Range
0,39300.0,410.0
1,71900.0,280.0
2,84200.0,437.0
3,103040.0,487.0
4,142830.0,472.0
...,...,...
145,57950.0,410.0
146,45590.0,138.0
147,56150.0,400.0
148,59760.0,420.0
