In [1]:
# Pickle read at the start of the notebook and pickle written at the end.
DATA_PATH = "../data/processed/00_preprocessed.pkl"
EXPORT_PATH = "../data/processed/01_OHE_data.pkl"

# Columns removed from the DataFrame before any encoding takes place.
DROP_COLS = [
    "model",
    "engine_fuel_type",
    "driven_wheels",
    "number_of_doors",
    "market_category",
    "vehicle_size",
    "vehicle_style",
    "popularity",
]

# Integer code substituted for each transmission type label.
TRANSMISSION_DICT = {
    'manual': 1,
    'automatic': 2,
    'automated_manual': 3,
    'direct_drive': 4,
}



In [2]:
# Load packages
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder

___
## Reading data:

In [3]:
# Load the pickled DataFrame stored at DATA_PATH.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(DATA_PATH)

In [4]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
0,bmw,1_series_m,2011,premium_unleaded_(required),335.0,6.0,manual,rear_wheel_drive,2.0,"factory_tuner,luxury,high-performance",compact,coupe,26,19,3916,46135
1,bmw,1_series,2011,premium_unleaded_(required),300.0,6.0,manual,rear_wheel_drive,2.0,"luxury,performance",compact,convertible,28,19,3916,40650
2,bmw,1_series,2011,premium_unleaded_(required),300.0,6.0,manual,rear_wheel_drive,2.0,"luxury,high-performance",compact,coupe,28,20,3916,36350
3,bmw,1_series,2011,premium_unleaded_(required),230.0,6.0,manual,rear_wheel_drive,2.0,"luxury,performance",compact,coupe,28,18,3916,29450
4,bmw,1_series,2011,premium_unleaded_(required),230.0,6.0,manual,rear_wheel_drive,2.0,luxury,compact,convertible,28,18,3916,34500


___
### Build an ordinal mapping for the `year` column:

In [5]:
# Distinct model years in ascending order (plain Python ints).
lst = sorted(df['year'].unique().tolist())

# Map each year to its 1-based position in that ordering.
YEAR_DICT = {year: position for position, year in enumerate(lst, start=1)}
YEAR_DICT

{1990: 1,
 1991: 2,
 1992: 3,
 1993: 4,
 1994: 5,
 1995: 6,
 1996: 7,
 1997: 8,
 1998: 9,
 1999: 10,
 2000: 11,
 2001: 12,
 2002: 13,
 2003: 14,
 2004: 15,
 2005: 16,
 2006: 17,
 2007: 18,
 2008: 19,
 2009: 20,
 2010: 21,
 2011: 22,
 2012: 23,
 2013: 24,
 2014: 25,
 2015: 26,
 2016: 27,
 2017: 28}

___
## Dropping irrelevant features:

In [6]:
# Remove every column listed in DROP_COLS, then preview what is left.
df.drop(columns=DROP_COLS, inplace=True)
df.head()

Unnamed: 0,make,year,engine_hp,engine_cylinders,transmission_type,highway_mpg,city_mpg,msrp
0,bmw,2011,335.0,6.0,manual,26,19,46135
1,bmw,2011,300.0,6.0,manual,28,19,40650
2,bmw,2011,300.0,6.0,manual,28,20,36350
3,bmw,2011,230.0,6.0,manual,28,18,29450
4,bmw,2011,230.0,6.0,manual,28,18,34500


___
### Replace values in the transmission type and year columns:

In [7]:
# Encode transmission labels as integers in the 'transmission_type' column only.
# A flat dict passed to DataFrame.replace is applied to every column, so the
# substitution is scoped to the one column it is meant for. Labels that are not
# keys of TRANSMISSION_DICT are left unchanged.
df['transmission_type'] = df['transmission_type'].replace(TRANSMISSION_DICT)

In [8]:
# Encode the model year as its ordinal code in the 'year' column only.
# A flat dict passed to DataFrame.replace is applied to every column, which
# would also rewrite any other numeric cell whose value happens to equal a year
# key (e.g. an msrp of 2000). Scoping the replacement to 'year' avoids
# corrupting those columns.
df['year'] = df['year'].replace(YEAR_DICT)

**Note: some rows have the value 'unknown' in the transmission type column, so we will need to remove them in the cleaning phase.**

___
## Transforming columns:
**Combine the highway_mpg and city_mpg columns into one column that represents the mean of both (stored as `speed`), and then apply a MinMaxScaler to it.**

In [9]:
# Mean of the highway and city mpg columns, stored under the name 'speed'.
df['speed'] = df['highway_mpg'].add(df['city_mpg']).div(2)

___
## OneHotEncoding the 'make' column:

In [10]:
# Encoder for the 'make' column; handle_unknown='ignore' makes categories
# unseen during fitting encode as all zeros instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore')

# Learn the categories and encode the column in a single call
transformed = ohe.fit_transform(df[['make']])

# Add one indicator column per category, named after that category
df[ohe.categories_[0]] = transformed.toarray()

In [11]:
# Densify only the first encoded row and flatten it to 1-D for display.
transformed[0].toarray().ravel()

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [12]:
# Show the category labels learned by the encoder; these are the names of the
# indicator columns added to the DataFrame.
print(ohe.categories_[0])

['acura' 'alfa_romeo' 'aston_martin' 'audi' 'bentley' 'bmw' 'bugatti'
 'buick' 'cadillac' 'chevrolet' 'chrysler' 'dodge' 'ferrari' 'fiat' 'ford'
 'genesis' 'gmc' 'honda' 'hummer' 'hyundai' 'infiniti' 'kia' 'lamborghini'
 'land_rover' 'lexus' 'lincoln' 'lotus' 'maserati' 'maybach' 'mazda'
 'mclaren' 'mercedes-benz' 'mitsubishi' 'nissan' 'oldsmobile' 'plymouth'
 'pontiac' 'porsche' 'rolls-royce' 'saab' 'scion' 'spyker' 'subaru'
 'suzuki' 'tesla' 'toyota' 'volkswagen' 'volvo']


___
## Showing a sample row from the DataFrame after transformation and feature engineering:

In [13]:
# Display one randomly chosen row as a Series; no random_state is passed, so
# the row shown differs between runs.
df.sample(1).iloc[0]

make                 nissan
year                     26
engine_hp             261.0
engine_cylinders        6.0
transmission_type         2
highway_mpg              22
city_mpg                 16
msrp                  25670
speed                  19.0
acura                   0.0
alfa_romeo              0.0
aston_martin            0.0
audi                    0.0
bentley                 0.0
bmw                     0.0
bugatti                 0.0
buick                   0.0
cadillac                0.0
chevrolet               0.0
chrysler                0.0
dodge                   0.0
ferrari                 0.0
fiat                    0.0
ford                    0.0
genesis                 0.0
gmc                     0.0
honda                   0.0
hummer                  0.0
hyundai                 0.0
infiniti                0.0
kia                     0.0
lamborghini             0.0
land_rover              0.0
lexus                   0.0
lincoln                 0.0
lotus               

___
## Now dropping columns that won't be used in our model:

In [14]:
# Original columns that are no longer needed once 'speed' and the one-hot
# 'make' indicator columns exist.
DROPINGS = [
    "make",
    "highway_mpg",
    "city_mpg",
]
df.drop(columns=DROPINGS, inplace=True)

In [15]:
# Display one randomly chosen row after the drop; no random_state is passed,
# so the row shown differs between runs.
df.sample(1).iloc[0]

year                    11
engine_hp            170.0
engine_cylinders       6.0
transmission_type        1
msrp                  2234
speed                 22.0
acura                  0.0
alfa_romeo             0.0
aston_martin           0.0
audi                   0.0
bentley                0.0
bmw                    0.0
bugatti                0.0
buick                  0.0
cadillac               0.0
chevrolet              0.0
chrysler               0.0
dodge                  0.0
ferrari                0.0
fiat                   0.0
ford                   0.0
genesis                0.0
gmc                    0.0
honda                  0.0
hummer                 0.0
hyundai                0.0
infiniti               0.0
kia                    0.0
lamborghini            0.0
land_rover             0.0
lexus                  0.0
lincoln                0.0
lotus                  0.0
maserati               0.0
maybach                0.0
mazda                  1.0
mclaren                0.0
m

___
## Export DataFrame:

In [16]:
# Write the transformed DataFrame to EXPORT_PATH as a pickle.
df.to_pickle(EXPORT_PATH)