In [None]:
import itertools
import pandas as pd
import numpy as np
import scipy.stats
import re
import math
import matplotlib as mlp
import matplotlib.pyplot as plt
import matplotlib.backends.backend_agg
import matplotlib.figure
import seaborn as sns
import datetime
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# Load the car listings and discard the 'Unnamed: 0' column right away.
# NOTE(review): the CSV path is absolute and machine-specific; the notebook
# will not run elsewhere until it is pointed at a local copy of the file.
cars_dataset = pd.read_csv(
    'C:/Users/aleja/Cars-Value-Predictor/cars-features-dataset.csv'
).drop(columns='Unnamed: 0')
print(cars_dataset.shape)
cars_dataset.head()

## Exploratory Data Analysis

In [None]:
# Number of missing values in each column.
cars_dataset.isna().sum()

In [None]:
# Column name -> missing-value count, ordered from most to fewest nulls.
cols_by_nulls = (
    cars_dataset
    .isna()
    .sum()
    .sort_values(ascending=False)
    .to_dict()
)
cols_by_nulls

This large number of apparently null values is due to owners or merchants leaving specifications blank when creating each car listing.
Some of them will be dropped, but most of them will be relabeled with another value.

In [None]:
# Remove, in place, every row where 'Make' or 'Model' is missing,
# then display the resulting (rows, columns) shape.
cars_dataset.dropna(subset=['Make', 'Model'], inplace=True)
cars_dataset.shape

Make and Model are essential identifying information for each row, so the rows where either Make or Model is missing were dropped.

Now, 'Drive Type' will be analyzed. It has far fewer null values, so the rows containing them will be dropped.

In [None]:
# Remove, in place, every row whose 'Drive Type' value is missing.
cars_dataset.dropna(subset=['Drive Type'], inplace=True)

In [None]:
# (rows, columns) remaining after the 'Drive Type' rows were dropped in the previous cell.
cars_dataset.shape

In [None]:
# Report how many 'Bed Length' values are missing versus present, then show
# the frequency of each value that is present.
bed_length = cars_dataset["Bed Length"]
print("'Bed Length' column has", bed_length.isnull().sum(), "null values.")
print("The other ones (", bed_length.notnull().sum(), ") are:", sep="")
bed_length.value_counts()

As seen above, 'Bed Length' is null in almost all cases, so the column will be discarded.

In [None]:
# Discard the 'Bed Length' column, then list the columns that remain.
# errors='ignore' keeps the cell safe to re-run: without it a second
# execution raises KeyError because the column has already been removed.
cars_dataset.drop(columns='Bed Length', inplace=True, errors='ignore')
cars_dataset.columns

In [None]:
# Remove the 'Bed Length' entry from the null-count lookup. The None default
# keeps the cell safe to re-run: a bare pop raises KeyError once the key has
# already been removed.
cols_by_nulls.pop("Bed Length", None)

In [None]:
# Summarize 'Exterior Color': null count, non-null count, number of distinct
# values, and the frequency of each value.
# The null count is taken from the current frame instead of cols_by_nulls,
# because that dict was built before rows were dropped and would report a
# stale number that does not match the non-null count printed next to it.
exterior_color = cars_dataset['Exterior Color']
print("Exterior Color has", exterior_color.isnull().sum(), "null values")
print(exterior_color.notnull().sum(), "not null values and values count:", exterior_color.nunique())
exterior_color.value_counts()

In [None]:
# Distinct exterior color names, in the order value_counts() returns them.
ext_colors = cars_dataset['Exterior Color'].value_counts().index.tolist()
ext_colors

'Exterior Color' has a lot of unique values, but they can be regrouped into a few categories because many of them are just different names for nearly the same color.

The following function checks whether a color name contains all of the given keywords.

In [None]:
def color_in(color, *argc):
    """Return True when every keyword in ``argc`` occurs in ``color``.

    The substring comparison ignores letter case on both sides.

    Parameters
    ----------
    color : str
        Color name to inspect. A non-string value (for example the NaN that
        pandas uses for a missing cell) never matches a keyword.
    *argc : str
        Keywords that must all appear as substrings of ``color``.

    Returns
    -------
    bool
        True if all keywords are found in ``color``. With no keywords the
        result is vacuously True.
    """
    if not argc:
        return True
    if not isinstance(color, str):
        return False
    # Lower-case the color once so each keyword test ignores letter case.
    lowered = color.lower()
    return all(keyword.lower() in lowered for keyword in argc)

The next function maps each original exterior color value to one of a small set of color categories, returning "Unknown" when none of the conditions is met.

In [None]:
def ext_colors_categorize(ext_color):
    """Map a raw exterior color name to a coarse color category.

    Categories are tried in a fixed order and the first match wins:
    Black, White, Green, Blue, Silver or Grey, Red, Gold. A color belongs to
    a category when it contains at least one of that category's keywords;
    the exact value "Obsidian" is also treated as Black.

    Parameters
    ----------
    ext_color : str
        Original 'Exterior Color' value.

    Returns
    -------
    str
        The category name, or "Unknown" when no keyword matches.
    """
    # (category, keywords) pairs in priority order.
    category_keywords = (
        ("Black", ("black",)),
        ("White", ("white",)),
        ("Green", ("green",)),
        ("Blue", ("blue",)),
        ("Silver or Grey", ("silver", "gray", "magnetic", "pearlcoat")),
        ("Red", ("red", "cherry", "ruby")),
        ("Gold", ("gold",)),
    )

    if "Obsidian" == ext_color:
        return "Black"

    for category, keywords in category_keywords:
        # color_in() requires *all* of its keywords to be present, so each
        # keyword is tested on its own to get "any of these" semantics.
        if any(color_in(ext_color, keyword) for keyword in keywords):
            return category
    return "Unknown"

In [None]:
# Preview the category assigned to each 'Exterior Color' value
# (the result is displayed only; the DataFrame is not modified).
cars_dataset['Exterior Color'].map(ext_colors_categorize)

In [None]:
# Show the dtype of each remaining column.
cars_dataset.dtypes