In [None]:
!pip install dython
import itertools
import pandas as pd
import pandas_profiling
import numpy as np
import scipy.stats
import re
import requests
from bs4 import BeautifulSoup
import math
import matplotlib as mlp
import matplotlib.pyplot as plt
import matplotlib.backends.backend_agg
import matplotlib.figure
# (A stray bare name `s` on this line raised NameError on a fresh kernel and was removed.)
import seaborn as sns
import datetime
import warnings
import dython.nominal as dn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# Read the car listings CSV straight from GitHub and discard the leftover
# positional index column that was saved together with the data.
vehicles_dataset = pd.read_csv(
    'https://raw.githubusercontent.com/AlejandroPenaloza/cars-value-predictor/master/cars-features-dataset.csv'
).drop(columns='Unnamed: 0')
print(vehicles_dataset.shape)
vehicles_dataset.head()

## Exploratory Data Analysis

In [None]:
#pandas_profiling.ProfileReport(vehicles_dataset)

In [None]:
print(vehicles_dataset.info())
print(vehicles_dataset['Year'].describe())
vehicles_dataset.drop(columns='Year').describe()

Or...

In [None]:
vehicles_dataset.dtypes

In [None]:
vehicles_dataset.isnull().sum(axis=0)

In [None]:
# the following is an array with columns sorted by nulls count within them.
columns_by_nulls = vehicles_dataset.isnull().sum().sort_values(ascending=True)
print(columns_by_nulls)
sorted_columns = columns_by_nulls.index.to_numpy()
sorted_columns

This high quantity of allegedly null values is due to lack of information when inputting specifications in each car listing from its owner or merchant.
Some of them will be dropped but most of them will be labeled as another data category.

Starting off with 'Make' and 'Model'

In [None]:
vehicles_dataset.dropna(subset=['Make', 'Model'], inplace=True)
sorted_columns = np.setdiff1d(sorted_columns, np.array(['Make', 'Model']), assume_unique=True)
vehicles_dataset.shape

Make and model are too important to be missing from any row, so the rows where 'Make' or 'Model' is missing were dropped.

In [None]:
sorted_columns[0]

Now, 'Drive Type' will be analyzed.

In [None]:
vehicles_dataset['Drive Type'].value_counts()

In [None]:
vehicles_dataset['Drive Type'].isnull().sum()

This variable has just 23 null values. In addition to that there is one category called 'Unknown', which has only 8 occurrences.
The rows presenting these values are going to be eliminated.

In [None]:
# Turn the placeholder category 'Unknown' into a missing value (Series.mask
# writes NaN wherever the condition holds), then drop every row without a
# drive type. This avoids the `np.NaN` alias, which NumPy 2.0 removed.
vehicles_dataset["Drive Type"] = vehicles_dataset["Drive Type"].mask(
    vehicles_dataset["Drive Type"] == 'Unknown')
vehicles_dataset.dropna(subset=["Drive Type"], inplace=True)

In [None]:
# Function to display next feature to analyze and edited
# dataframe shape, after being done with previous one
def display_next_feature(n=1):
  """Advance the feature queue by ``n`` and print the next feature to analyze.

  Drops the first ``n`` entries of the global ``sorted_columns`` array, then
  prints the name now at the front together with the current shape of the
  global ``vehicles_dataset``.

  Parameters
  ----------
  n : int, default 1
      Number of already-processed features to remove from the front.

  Returns
  -------
  None
      The function only prints; it mutates the global ``sorted_columns``.
  """
  global sorted_columns
  sorted_columns = sorted_columns[n:]
  if len(sorted_columns) == 0:
    # Queue exhausted: report it instead of raising IndexError on [0].
    print("No features left to analyze", "\n", vehicles_dataset.shape)
    return None
  print(sorted_columns[0], "\n", vehicles_dataset.shape)
  return None
               
  
display_next_feature()

The next variable with the least amount of null values is 'Fuel Type'.

In [None]:
print(vehicles_dataset['Fuel Type'].value_counts())
vehicles_dataset['Fuel Type'].isnull().sum()

'CNG' and 'Hydrogen' will be dismissed, since they have only 5 occurrences.

In [None]:
# Mark the very rare fuel types as missing (Series.mask writes NaN where the
# condition holds) and drop them together with the rows that were already
# null. This avoids the `np.NaN` alias, which NumPy 2.0 removed.
vehicles_dataset['Fuel Type'] = vehicles_dataset['Fuel Type'].mask(
    vehicles_dataset['Fuel Type'].isin(['Hydrogen', 'CNG']))
vehicles_dataset.dropna(subset=['Fuel Type'], inplace=True)

In [None]:
display_next_feature()

'Mileage' column comes on.

In [None]:
print(vehicles_dataset['Mileage'].value_counts())
vehicles_dataset['Mileage'].isnull().sum()

In [None]:
vehicles_dataset.dropna(subset=['Mileage'], inplace=True)

In [None]:
# Mileage arrives as text with thousands separators (e.g. "12,345");
# strip the commas and store the column as 64-bit integers.
mileage_without_commas = vehicles_dataset['Mileage'].map(
    lambda raw_mileage: raw_mileage.replace(',', ''))
vehicles_dataset['Mileage'] = mileage_without_commas.astype('int64')

In [None]:
mileage_median = np.median(vehicles_dataset['Mileage'])
print("Median of 'Mileage' data is", mileage_median)
scipy.stats.describe(vehicles_dataset['Mileage'])

In [None]:
# Plot how often each distinct mileage value occurs.
# value_counts() is indexed by the mileage value itself, so both axes are
# taken from that one Series; pairing it with the raw 'Mileage' column
# (indexed by row label) would match unrelated entries.
mileage_counts = vehicles_dataset['Mileage'].value_counts()
ax = sns.scatterplot(x=mileage_counts.to_numpy(), y=mileage_counts.index.to_numpy())
ax.set_xlabel('Occurrences')
ax.set_ylabel('Mileage')

In [None]:
fig, (ax_box, ax_hist) = plt.subplots(2, sharex=True, figsize=(14, 10))
fig.suptitle('Mileage distribution', fontsize=20)
sns.boxplot(orient='h', data=vehicles_dataset['Mileage'], ax=ax_box)
sns.distplot(vehicles_dataset['Mileage'], ax=ax_hist)

It is clear that many outliers are present here. They are to be dealt with by using the 1.5 x Interquartile Range Rule and Z-Score measurements.

In [None]:
# absolute value of z-score for every mileage
mileage_std = np.std(vehicles_dataset['Mileage'])
print("Threshold is to be", 3*mileage_std)
z_score = np.abs(scipy.stats.zscore(vehicles_dataset['Mileage']))
z_score

In [None]:
# Quartiles and IQR definition
mileage_iqr = scipy.stats.iqr(vehicles_dataset['Mileage'])

# mileage_iqr = vehicles_dataset['Mileage'].apply(scipy.stats.iqr)
mileage_quartile1 = np.percentile(vehicles_dataset['Mileage'], 25)
mileage_quartile3 = np.percentile(vehicles_dataset['Mileage'], 75)

Z-Score filtering applied by comparing their value with each mileage score, keeping the rows within the +-3 standard deviation range.

In [None]:
# z-score filtered dataframe
z_score_outliers_n = vehicles_dataset.shape[0] - len(vehicles_dataset[z_score <= 3]['Mileage'])
vehicles_dataset = vehicles_dataset[z_score <= 3]
print("New number of rows:", vehicles_dataset.shape[0])
vehicles_dataset['Mileage'].value_counts()

In [None]:
print("So,", z_score_outliers_n, 
      "fields have been declared as outliers, thereby left away.")
max_mileage = vehicles_dataset['Mileage'].max()

Interquartile Range usage for restricting the array values spread.

In [None]:
# Dataset taking away outliers according to 1.5 times the interquartile range.
IQRx1p5_df = vehicles_dataset[(vehicles_dataset['Mileage'] > mileage_quartile1 - 1.5 * mileage_iqr) & (
      vehicles_dataset['Mileage'] < mileage_quartile3 + 1.5 * mileage_iqr)]
IQRx1p5_rows_n = len(vehicles_dataset['Mileage']) - len(IQRx1p5_df['Mileage'])
print('Current number of rows', len(vehicles_dataset['Mileage']), 
      "- number if all rows eliminated for 1.5x IQR", len(IQRx1p5_df['Mileage']),
      "=", IQRx1p5_rows_n)

Since limiting the whisker range to 1.5 IQR takes away over 200 more rows (nearly 500 from the original dataset, which is a lot by this standard), some of those values will be kept and assigned the median value.

In [None]:
fig, (box1, box2) = plt.subplots(ncols=2, sharey=True)
fig.suptitle('Mileage data boxplots')
sns.boxplot(orient='v', data=vehicles_dataset['Mileage'], ax=box1, whis=1.5)
box1.set_title('With whisker = 1.5 (current)')
sns.boxplot(orient='v', data=vehicles_dataset['Mileage'], ax=box2, whis=1.75)
box2.set_title('With whisker = 1.75 (to use)')
plt.show()

In [None]:
# Dataset taking away outliers according to 1.75 times the interquartile range
IQRx1p75_df = vehicles_dataset[(vehicles_dataset['Mileage'] > mileage_quartile1 - 1.75 * mileage_iqr) & (
      vehicles_dataset['Mileage'] < mileage_quartile3 + 1.75 * mileage_iqr)]
IQRx1p75_rows_n = len(vehicles_dataset['Mileage']) - len(IQRx1p75_df['Mileage'])
print("Current number of rows ", len(vehicles_dataset['Mileage']), 
      " - number if all rows eliminated for 1.75x IQR ", len(IQRx1p75_df['Mileage']),
      " = ", IQRx1p75_rows_n, ".", sep="")
print("And the rest (will be changed instead of eliminated): ", IQRx1p5_rows_n - IQRx1p75_rows_n, ".", sep="")

Now, these 70 rows will be dropped and the rest of the outliers (181) will be assigned the mileage median value.

In [None]:
# Continue with the rows inside the 1.75 x IQR fence. An explicit copy is taken
# because IQRx1p75_df is a boolean-mask selection of the previous frame, and
# writing into such a selection triggers SettingWithCopyWarning.
vehicles_dataset = IQRx1p75_df.copy()

# Given that there are no low outliers, we can determine what outliers value to replace by comparing with the new max mileage value
# The result is assigned back instead of using `inplace=True` on the column
# selection, which does not update the frame under pandas Copy-on-Write.
mileage_cap = IQRx1p5_df['Mileage'].max()
vehicles_dataset['Mileage'] = vehicles_dataset['Mileage'].where(
    vehicles_dataset['Mileage'] <= mileage_cap, other=mileage_median)

In [None]:
display_next_feature()

Following into the variables list with the least null values, 'MPG' comes up.

In [None]:
print(vehicles_dataset['MPG'].value_counts())
vehicles_dataset['MPG'].isnull().sum()

This column has only 18 null values, however, they are also represented as True in 'N/A cty / N/A hwy', as seen down below.
Since there are around 500 of these values, they will be categorized as well.

In [None]:
sorted(list(vehicles_dataset.dropna(subset=['MPG'])['MPG'].unique()))

In [None]:
def MPG_categorize(MPG):
    """Bin a raw 'MPG' listing value into a coarse city/highway category.

    Parameters
    ----------
    MPG : str or float
        Text such as ``'25 cty / 32 hwy'``, the placeholder
        ``'N/A cty / N/A hwy'``, or a missing value (NaN).

    Returns
    -------
    str
        * ``'Another'`` for missing values, the N/A placeholder, or text from
          which a city or a highway figure cannot be read;
        * ``'up from 60 cty / up from 50 hwy'`` when the city figure is >= 60;
        * otherwise a label of ten-wide bins, e.g. ``'20-29 cty / 30-39 hwy'``.
    """
    # pd.isna detects any NaN object; `x in [..., np.nan]` only matches by
    # object identity, and `np.NaN` no longer exists in NumPy 2.0.
    if pd.isna(MPG) or MPG == 'N/A cty / N/A hwy':
        return 'Another'

    text = str(MPG)
    cty_match = re.search(r'([0-9]+) c', text)
    hwy_match = re.search(r'/ ([0-9]+)', text)
    if cty_match is None or hwy_match is None:
        # One of the two figures is absent (e.g. 'N/A cty / 30 hwy').
        return 'Another'

    cty = int(cty_match.group(1))
    if cty >= 60:
        return 'up from 60 cty / up from 50 hwy'

    hwy = int(hwy_match.group(1))
    # Integer division gives the correct bin for any number of digits; taking
    # the first character would put 9 into "90-99" and 100 into "10-19".
    cty_low = cty // 10 * 10
    hwy_low = hwy // 10 * 10
    return "{}-{} cty / {}-{} hwy".format(cty_low, cty_low + 9, hwy_low, hwy_low + 9)

vehicles_dataset['MPG'] = vehicles_dataset['MPG'].apply(MPG_categorize)
vehicles_dataset['MPG'].value_counts()

30, 40 and 50 cty categories are being gathered so outliers disappear.

In [None]:
def MPG_categorize2(MPG):
    """Merge the sparse 30/40/50-cty MPG bins into wider combined bins.

    Any label not listed in one of the merge groups is returned unchanged.
    """
    # (labels to merge, merged label) pairs, checked in order
    merge_groups = (
        (['30-39 cty / 20-29 hwy', '30-39 cty / 30-39 hwy'], '30-39 cty / 20-39 hwy'),
        (['50-59 cty / 40-49 hwy', '50-59 cty / 50-59 hwy'], '50-59 cty / 40-59 hwy'),
        (['40-49 cty / 30-39 hwy', '40-49 cty / 40-49 hwy'], '40-49 cty / 30-49 hwy'),
    )
    for members, merged_label in merge_groups:
        if MPG in members:
            return merged_label
    return MPG
    

vehicles_dataset['MPG'] = vehicles_dataset['MPG'].apply(MPG_categorize2)
vehicles_dataset['MPG'].value_counts()

In [None]:
display_next_feature()

'Transmission' feature presents only two true values, as logically thought. We will just get rid of null values then.

In [None]:
print(vehicles_dataset['Transmission'].isnull().sum())
vehicles_dataset['Transmission'].value_counts()

In [None]:
vehicles_dataset.dropna(subset=['Transmission'], inplace=True)

In [None]:
display_next_feature()

The places where the vehicles are located are defined by city and state as two variables: 'Location (City)' and 'Location (State)'.

These variables describing vehicle location will be analyzed later on.

In [None]:
display_next_feature(2)

'Condition (Accidents)' is the next characteristic to check out.

In [None]:
print(vehicles_dataset['Condition (Accidents)'].value_counts())
vehicles_dataset['Condition (Accidents)'].isnull().sum()

There are 148 null values, and they will be replaced by another category ('Unknown'), along with the value '4 reported accidents', as it has very few instances.

In [None]:
# Fold missing values and the very rare '4 reported accidents' level into a
# single 'Unknown' category. Missing values are detected with notna() rather
# than `x in [..., np.NaN]`, which matches NaN only by object identity and
# relies on an alias NumPy 2.0 removed.
vehicles_dataset['Condition (Accidents)'] = vehicles_dataset['Condition (Accidents)'].where(
    vehicles_dataset['Condition (Accidents)'].notna()
    & (vehicles_dataset['Condition (Accidents)'] != '4 reported accidents'),
    other='Unknown')
vehicles_dataset['Condition (Accidents)'].value_counts()

In [None]:
display_next_feature()

Intuitively, the vehicles' model years are represented in the 'Year' column.

In [None]:
print(vehicles_dataset['Year'].isnull().sum())
vehicles_dataset['Year'].value_counts()

In [None]:
# Keep model years after 2000 and label everything else 'Other'.
# A missing year compares False against 2000, so where() already maps it to
# 'Other'; the former chained `fillna(..., inplace=True)` on the column
# selection was redundant (and has no effect under pandas Copy-on-Write).
vehicles_dataset['Year'] = (
    vehicles_dataset['Year']
    .where(vehicles_dataset['Year'] > 2000, other='Other')
    .astype('object')
)
vehicles_dataset['Year'].value_counts()

In [None]:
display_next_feature()

Now, 'Price' feature, which is the variable to predict in the upcoming model.

In [None]:
print(vehicles_dataset['Price'].isnull().sum())
vehicles_dataset['Price'].value_counts()

Since this variable needs to be as reliable and accurate as possible, the null values will not be imputed, so that the data is not altered; they will be dropped instead.

In [None]:
vehicles_dataset.dropna(subset=['Price'], inplace=True)

In [None]:
vehicles_dataset['Price'] = vehicles_dataset['Price'].apply(lambda p: float(str(p).replace(',', '')))
scipy.stats.describe(vehicles_dataset['Price'])

In [None]:
fig, (ax_violin, ax_strip) = plt.subplots(2, sharex=True, figsize=(24, 12))
fig.suptitle('Prices distribution', fontsize=20)
ax_violin.set_title('Prices density', fontdict={'fontsize': 15})
ax_strip.set_title('Prices occurrences', fontdict={'fontsize': 15})
sns.violinplot(orient='h', data=vehicles_dataset['Price'], ax=ax_violin)
sns.stripplot(orient='h', data=vehicles_dataset['Price'], ax=ax_strip, color='red')

Outliers are noticeable, they have to be worked out.
As before, we are to apply Z-Score.

In [None]:
# absolute value of z-score for every price in column
price_std = np.std(vehicles_dataset['Price'])
print("Prices threshold to use is", 3*price_std)
z_score_prices = np.abs(scipy.stats.zscore(vehicles_dataset['Price']))
z_score_prices

In [None]:
# Quartiles and IQR definition
price_iqr = scipy.stats.iqr(vehicles_dataset['Price'])
price_quartile1 = np.percentile(vehicles_dataset['Price'], 25)
price_quartile3 = np.percentile(vehicles_dataset['Price'], 75)

# z-score filtered dataframe
original_rows_number = vehicles_dataset.shape[0]
vehicles_dataset = vehicles_dataset[z_score_prices <= 3]
price_outliers_number = original_rows_number - vehicles_dataset.shape[0]
print("Number of rows taken away:", price_outliers_number)
print("New number of rows:", vehicles_dataset.shape[0])

In [None]:
# Horizontal box plot of prices with whiskers at 1.75 x IQR.
# The series is passed explicitly as `x`: a positional first argument is read
# as `x` by older seaborn releases and as `data` by newer ones.
fig, ax = plt.subplots(figsize=(14,6))
sns.boxplot(x=vehicles_dataset['Price'], whis=1.75, ax=ax)

'Prices' still presents outliers.
The values spread will be restricted to 1.75 Interquartile Range measures, as used with 'Mileage'.

In [None]:
# filtered dataframe taking away values off 1.75 times the interquartile range
filtered_175_df = vehicles_dataset[(vehicles_dataset['Price'] > price_quartile1 - 1.75 * price_iqr) & (
      vehicles_dataset['Price'] < price_quartile3 + 1.75 * price_iqr)]
outliers_number = len(vehicles_dataset['Price']) - len(filtered_175_df['Price'])
print(
    "Number of rows off of threshold:", vehicles_dataset.shape[0], "-", 
    filtered_175_df.shape[0], "=", outliers_number)

Although modifying the data was not desired, it is necessary to apply some procedures to some of these fields so that we do not lose this much information.
The whisker mark will be increased to 2.00 to check on the values.

In [None]:
# filtered dataframe taking away values off 2 times the interquartile range
filtered_200_df = vehicles_dataset[(vehicles_dataset['Price'] > price_quartile1 - 2 * price_iqr) & (
      vehicles_dataset['Price'] < price_quartile3 + 2 * price_iqr)]
print("Number of rows to be eliminated:", vehicles_dataset.shape[0] - filtered_200_df.shape[0])
print("Number of rows to be reassigned:", filtered_200_df.shape[0] - filtered_175_df.shape[0])

In [None]:
# Keep only the rows inside the 2 x IQR fence; copy so that later column
# assignments do not write into a slice of the previous frame.
vehicles_dataset = filtered_200_df.copy()
price_mean = np.mean(vehicles_dataset['Price'])

# Rows that fall between the 1.75 x IQR and 2 x IQR fences are reassigned the
# mean price. Series.where KEEPS values where the condition is True, so the
# condition must describe the prices inside the 1.75 fence (the same bounds
# used for filtered_175_df), and the result must be assigned back: the
# original call used the inverted condition and discarded its result.
price_within_175 = (
    (vehicles_dataset['Price'] > price_quartile1 - 1.75 * price_iqr)
    & (vehicles_dataset['Price'] < price_quartile3 + 1.75 * price_iqr))
vehicles_dataset['Price'] = vehicles_dataset['Price'].where(price_within_175, other=price_mean)
print(vehicles_dataset.shape)
sns.distplot(vehicles_dataset['Price'])

In [None]:
display_next_feature()

'Interior Color' variable is next to evaluate.

In [None]:
vehicles_dataset['Interior Color'].value_counts()

In [None]:
print("'Interior Color' has", vehicles_dataset['Interior Color'].nunique(), "unique values")
print("and 'Exterior Color' has ", vehicles_dataset['Exterior Color'].nunique(), ".", sep="")

Both 'Interior Color' and 'Exterior Color' have a lot of unique values, but they can be regrouped, given that many of these values represent almost the same color in each case, by category.

In [None]:
print("'Interior Color' has", vehicles_dataset['Interior Color'].isnull().sum(), "null values")
print("and 'Exterior Color' has ", vehicles_dataset['Exterior Color'].isnull().sum(), "!", sep="")

In [None]:
sorted_columns = np.setdiff1d(sorted_columns, np.array(['Exterior Color']), assume_unique=True)

We will create a function which re-categorizes color values (including null values, given that there are a lot of them, 763 and 5126 respectively).

The following function will check whether the passed color names belong to the same category.

In [None]:
def color_in(color, *argc):
    """Tell whether any of the given fragments occurs in a colour name.

    ``color`` is converted to ``str`` and lower-cased before the substring
    test, so the fragments in ``argc`` should be given in lower case.
    Returns ``True`` if at least one fragment is contained, else ``False``
    (also ``False`` when no fragments are passed).
    """
    lowered_name = str(color).lower()
    # Every fragment is evaluated (list, not generator), as a plain loop would.
    return any([fragment in lowered_name for fragment in argc])

Now, this function will tell if the conditions are met so that it categorizes the original values to this set-up. 

In [None]:
def colors_categorize(ext_color):
    """Map a free-text colour name onto one of a few colour families.

    Rules are tried from top to bottom and the first hit wins. Each rule
    matches either through a case-insensitive substring test (via
    ``color_in``) or through an exact comparison with a marketing name.
    Anything that matches no rule becomes ``"Other"``.
    """
    # (substring fragments, exact names, resulting family) in priority order
    family_rules = (
        (("black",), ["Obsidian"], "Black"),
        (("white", "glacier"),
         ["Fresh Powder", "Blizzard Pearl", "Starfire Pearl"], "White"),
        (("green",), [], "Green"),
        (("blue", "aqua"), [], "Blue"),
        (("silver", "gray", "magnetic", "pearlcoat", "tungsten",
          "graphite", "gun", "platinum", "titanium", "billet"), [], "Silver or Grey"),
        (("red", "cherry", "ruby", "sun", "rosso", "scarlet"), [], "Red"),
        (("gold",), [], "Gold"),
        (("brown", "beige"), [], "Beige or Brown"),
        (("yellow",), [], "Yellow"),
    )
    for fragments, exact_names, family in family_rules:
        if color_in(ext_color, *fragments) or ext_color in exact_names:
            return family
    return "Other"

In [None]:
vehicles_dataset['Interior Color'] = vehicles_dataset['Interior Color'].apply(colors_categorize)
vehicles_dataset['Interior Color'].value_counts()

In [None]:
# keeping on filtering the values
vehicles_dataset['Interior Color'] = vehicles_dataset['Interior Color'].apply(
    lambda i_color: (i_color, 'Other')[i_color in ['Gold', 'Green', 'Blue']])
vehicles_dataset['Interior Color'].value_counts()

In [None]:
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
# plt.subplots(2) returns an array with two Axes; unpack it so that each
# seaborn call receives a single Axes object rather than the whole array.
# `split=True` was dropped: it only applies to violins with a two-level `hue`,
# and swarmplot replaced `split` with `dodge`.
fig, (ax_violin, ax_swarm) = plt.subplots(2, figsize=(15,10))
sns.violinplot(x='Interior Color', y='Price', data=vehicles_dataset, ax=ax_violin)
sns.swarmplot(x='Interior Color', y='Price', data=vehicles_dataset, ax=ax_swarm)

In [None]:
vehicles_dataset['Exterior Color'].value_counts()

In [None]:
vehicles_dataset['Exterior Color'] = vehicles_dataset['Exterior Color'].apply(colors_categorize)
vehicles_dataset['Exterior Color'].value_counts()

In [None]:
# Fold the rare 'Yellow' level into 'Other'. The result is assigned back to
# the column: calling where(..., inplace=True) on a column selection is chained
# assignment and leaves the frame untouched under pandas Copy-on-Write.
vehicles_dataset['Exterior Color'] = vehicles_dataset['Exterior Color'].where(
    vehicles_dataset['Exterior Color'] != 'Yellow', other='Other')
vehicles_dataset['Exterior Color'].value_counts()

In [None]:
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
# fig, (sw_intc, sw_extc) = plt.subplots(1, 2, sharey=True, figsize=(10, 8))
fig, (sw_intc, sw_extc) = plt.subplots(2, figsize=(10, 8))
fig.suptitle('Colors occurrences regarding vehicle Price')
sns.swarmplot(x='Interior Color', y='Price', data=vehicles_dataset, ax=sw_intc)
sns.swarmplot(x='Exterior Color', y='Price', data=vehicles_dataset, ax=sw_extc)

In [None]:
display_next_feature()

'Style' feature is up.

In [None]:
print(vehicles_dataset['Style'].isnull().sum())
vehicles_dataset['Style'].value_counts()

In [None]:
list(vehicles_dataset['Style'].unique())

In [None]:
styles_url = requests.get('https://en.wikipedia.org/wiki/Car_body_style')
styles_soup = BeautifulSoup(styles_url.content, 'html.parser')
styles_soup

In [None]:
styles_tags = styles_soup.find_all('dt')
styles_tags

In [None]:
# Collect one style name per <dt> tag: take the text captured by the
# 'e="..."' pattern when it is present, otherwise the text found between
# '>' and '<' in the tag's markup.
collected_styles = []

for style_tags in styles_tags:
    tag_markup = str(style_tags)
    quoted_hits = re.findall('e=".+"', tag_markup)
    if quoted_hits:
        style = quoted_hits[0][3:-1]
    else:
        style = re.findall('>.+<', tag_markup)[0][1:-1]
    collected_styles.append(style)

styles = np.array(collected_styles)
styles

In [None]:
# Style names whose scraped text carries a trailing qualifier to strip.
styles_to_check = [
    "Buggy", "Sedan", "Roadster", "Landaulet", "Ute", "Baquet", "Phaeton"]


def style_cleaner(s):
    """Normalise a scraped body-style name.

    * Names containing ``"luxury"`` collapse to ``"Luxury"``.
    * Names containing one of ``styles_to_check`` are cut just before their
      last ``" ("`` or, failing that, their last ``" b"``
      (e.g. ``"Sedan (saloon)"`` -> ``"Sedan"``).
    * Everything else, including a listed name with no such suffix, is
      returned unchanged.
    """
    if "luxury" in s:
        return "Luxury"
    if any(item in s for item in styles_to_check):
        # Raw string: "\(" in a normal string literal is an invalid escape.
        suffix_hits = re.findall(r".+ \(|.+ b", s)
        if suffix_hits:
            # Drop the trailing " (" / " b" that the pattern consumed.
            return suffix_hits[0][:-2]
        # Keyword present but nothing to trim (e.g. plain "Roadster").
        return s
    return s

In [None]:
style_vect_cleaner = np.vectorize(style_cleaner)
style_vect_cleaner(styles)

In [None]:
def style_categorize(style):
    """Group a vehicle 'Style' value into a broader body-style category.

    Matching is case-insensitive. Values that match no rule are returned
    unchanged.
    """
    # str() lets missing values (NaN floats) pass through instead of failing
    # on `.lower()`.
    lowered_style = str(style).lower()
    if 'sedan' in lowered_style:
        return 'Sedan'
    elif 'convertible' in lowered_style:
        return 'Convertible'
    # TODO: the remaining categories were never written (the original cell
    # ended in a dangling `elif`, a SyntaxError); add them here.
    return style


'Bed Length' is next to go through.

In [None]:
print("'Bed Length' column has", vehicles_dataset["Bed Length"].isnull().sum(), "null values.")
print("The other ones (", vehicles_dataset["Bed Length"].notnull().sum(), ") are:", sep="")
vehicles_dataset['Bed Length'].value_counts()

As seen above the bed length is null for almost all cases, so it will be discarded.

In [None]:
vehicles_dataset.drop(columns='Bed Length', inplace=True)
vehicles_dataset.columns

Now, 

Since there are just a few cases where the vehicles are yellow or green, we will get rid of these values.

In [None]:
colors_to_drop = list(vehicles_dataset['Exterior Color'].value_counts().index[-4:])
colors_to_drop

In [None]:
# Gather, colour by colour, the row labels whose exterior colour is in
# colors_to_drop, then remove those rows from the dataset in place.
indexes = [
    row_label
    for color in colors_to_drop
    for row_label in vehicles_dataset[vehicles_dataset["Exterior Color"] == color].index
]

vehicles_dataset.drop(indexes, inplace=True)
vehicles_dataset.shape

In [None]:
ax = sns.countplot(x="Exterior Color", data=vehicles_dataset, palette="Set1")
ax.set_title("Exterior colors by occurrences")

Now, 'Engine' will be analyzed.

In [None]:
vehicles_dataset['Engine'].isnull().sum()

In [None]:
vehicles_dataset['Engine'].value_counts()

We are going to check on all the variants for each 'Inline', 'Flat' and 'Vs' engines, so we can filter them into more crowded categories

In [None]:
sorted(vehicles_dataset['Engine'].value_counts().index.to_list())

In [None]:
for j in ["Inline", "Flat", "V"]:

    for i in vehicles_dataset['Engine'].value_counts().index.to_list():
        if j in i:
            print(i)

In [None]:
def engine_categorize(engine_type):
    """Reduce an engine description to its layout part.

    For V, Inline and Flat engines (tested in that order) the result is the
    text from the first ``V`` / ``I`` / ``F`` character to the end of the
    line, e.g. ``"3.5L V-6 Gas"`` -> ``"V-6 Gas"``. Any other description
    becomes ``"Another"``.
    """
    # (marker that selects the rule, pattern that extracts the label)
    layout_rules = (("V-", "V.+"), ("Inline", "I.+"), ("Flat", "F.+"))
    for marker, pattern in layout_rules:
        if marker in engine_type:
            return re.search(pattern, engine_type).group()
    return "Another"

In [None]:
re.findall("V-[1-8][0-2]?", "5.4L V-10 Gas Supercharged")

In [None]:
vehicles_dataset['Engine'] = vehicles_dataset['Engine'].astype(str)
vehicles_dataset['Engine'].apply(engine_categorize).value_counts()

Let's filter these engine types up a bit more, so outliers stay out.

In [None]:
def engine_categorize(engine_type):
    """Group an engine description into one of a few well-populated categories.

    Rules are checked from most to least specific; the first match wins:

    * ends with ``"V-6 Gas"``            -> ``"V-6 Gas"``
    * any other ``"V-6"``                -> ``"V-6 Gas Turbo/Supercharged"``
    * ``"V-8"``                          -> ``"V-8"``
    * any other ``"V-"``                 -> ``"V-4/10/12"``
    * contains ``"Inline-4 Gas"``        -> ``"Inline-4 Gas"``
    * any other ``"Inline-4"``           -> ``"Inline-4 Plug-In/Hybrid"``
    * ``"Inline-6"``                     -> ``"Inline-6"``
    * any other ``"Inline"``             -> ``"Inline 2/3/5/8"``
    * ``"Flat-4"`` / ``"Flat-6"``        -> ``"Flat-4"`` / ``"Flat-6"``
    * everything else                    -> ``"Another"``
    """
    if engine_type.endswith("V-6 Gas"):
        # Fixed label, so every plain V-6 gas engine lands in the same bucket
        # whatever precedes it in the description.
        return "V-6 Gas"
    elif "V-6" in engine_type:
        return "V-6 Gas Turbo/Supercharged"
    elif "V-8" in engine_type:
        return "V-8"
    elif "V-" in engine_type:
        return "V-4/10/12"
    elif "Inline-4 Gas" in engine_type:
        return "Inline-4 Gas"
    elif "Inline-4" in engine_type:
        return "Inline-4 Plug-In/Hybrid"
    elif "Inline-6" in engine_type:
        return "Inline-6"
    elif "Inline" in engine_type:
        return "Inline 2/3/5/8"
    elif "Flat" in engine_type:
        flat_match = re.search("Flat-[46]", engine_type)
        # Any other Flat layout (e.g. "Flat-12") goes to the catch-all
        # instead of raising IndexError inside `.apply`.
        return flat_match.group() if flat_match else "Another"
    else:
        return "Another"

In [None]:
vehicles_dataset['Engine'] = vehicles_dataset['Engine'].apply(engine_categorize)
vehicles_dataset['Engine'].value_counts()

Now, for the location variables, in order to establish a measure of the strength of association (categorical correlation) between them, the Cramér's V statistic is used, which is based on Pearson's Chi-Square test for categorical data.

In [None]:
print(vehicles_dataset['Location (City)'].isnull().sum())
vehicles_dataset['Location (City)'].value_counts()

In [None]:
print(vehicles_dataset['Location (State)'].isnull().sum())
vehicles_dataset['Location (State)'].unique()

In [None]:
dn.cramers_v(vehicles_dataset['Location (City)'], vehicles_dataset['Location (State)'])

This shows a high association between the location features, which is logical as they represent the same information; 'Location (City)' is just more specific and therefore has more unique values.

However, Cramér's V is symmetrical, which means variable 'x' gives the same information about variable 'y' as vice versa.
But this cannot be the case here, as some valuable data would end up being lost.

This is solved by using the Uncertainty Coefficient, or Theil's U metric, which is an asymmetrical measure, delivering for a given 'x' the possible states for 'y'.

In [None]:
# Theil's U is asymmetric, so it is computed with the arguments in both
# orders. Both arguments must be the actual columns: the first call used to
# receive the literal list ['Location (States)'] instead of the State column.
print(dn.theils_u(vehicles_dataset['Location (City)'], vehicles_dataset['Location (State)']))
dn.theils_u(vehicles_dataset['Location (State)'], vehicles_dataset['Location (City)'])

This is more accurate for estimations.
Associations between features are now to be checked down below.

In [None]:
# 'MPG' was binned into string labels earlier in the notebook, so it is
# treated as categorical here; only these two columns remain numeric.
numerical_variables = ['Mileage', 'Price']
# Every remaining column, in dataset order, is treated as categorical.
categorical_variables = [
    column for column in vehicles_dataset.columns.to_list()
    if column not in numerical_variables]
categorical_variables

In [None]:
categorical_data = pd.DataFrame(data=vehicles_dataset, columns=categorical_variables)

In [None]:
dn.associations(
    categorical_data.dropna(),
    theil_u=True, nan_strategy='drop_samples', figsize=(10,10))['ax']

Then, 'Location (City)' will be eliminated.

In [None]:
vehicles_dataset.drop(columns='Location (City)', inplace=True)

'Location (State)' still has a lot of null values to deal with.
Another category representing them will be created.

In [None]:
vehicles_dataset['Location (State)'].value_counts()

In [None]:
#np.where(vehicles_dataset['Location (State)'] == 'UK', 'Other', vehicles_dataset['Location (State)'])

In [None]:
# Preview of the regrouped states: missing values and the rare 'UK', 'SO' and
# 'WV' codes are shown as 'Other'. The result is only displayed, not stored.
# Missing values are detected with notna() rather than `x in [..., np.NaN]`,
# which matches NaN only by object identity and needs an alias NumPy 2.0 removed.
vehicles_dataset['Location (State)'].where(
    vehicles_dataset['Location (State)'].notna()
    & ~vehicles_dataset['Location (State)'].isin(['UK', 'SO', 'WV']),
    other='Other').value_counts()