In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
import os

# Where to save the figures: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>, resolved
# against the current working directory because PROJECT_ROOT_DIR is ".".
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the output directory up front; exist_ok=True keeps re-runs of this cell from failing.
os.makedirs(IMAGES_PATH, exist_ok=True)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Global font sizes for axis labels and tick labels on every figure drawn below.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension); also echoed in the progress message.
    tight_layout : bool
        When true, call ``plt.tight_layout()`` before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Passed to ``savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)
    
import seaborn as sns
# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# Load the raw car listings. The relative path presumably follows a Kaggle-style
# "../input/<dataset>/" layout — adjust when running elsewhere (TODO confirm).
data = pd.read_csv("../input/carsforsale/cars_raw.csv")

# 1. DATA

In [None]:
# Preview the listings frame loaded from the CSV
data

# 2. PRICE TO INT
Convert the Price column from strings to numbers

In [None]:
# Inspect the Price column before it is cleaned in the next cell
data.Price

In [None]:
# Drop the rows whose Price is the placeholder "Not Priced". Filtering into a
# new frame (rather than an in-place drop) keeps this cell safe to re-run.
data = data[data["Price"] != "Not Priced"].copy()

# Strip the currency formatting and convert to a number.
#  * regex=False makes "$" a literal dollar sign instead of a regex anchor,
#    independent of the pandas version's default for `regex`.
#  * "," is a thousands separator, so it is removed outright; replacing it with
#    "." would read a value such as "$39,998" as 39.998 and turn prices with
#    two separators into NaN.
#  * astype(str) lets the cell run again after Price has already become numeric.
#  * errors="coerce" turns anything still unparseable into NaN.
data["Price"] = pd.to_numeric(
    data["Price"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False),
    errors="coerce",
)

data.Price


# 3. Used/New
Collapse every certification label (any value containing "Certified") into the single value "Certified"

In [None]:
# Unique values of "Used/New" before normalisation.
# NOTE: this is not the last expression of the cell, so Jupyter does not render its value.
data['Used/New'].unique()

# Row-level helper: collapse any label containing "Certified" to exactly "Certified".
def ToCertified(data):
    """Return the row with its "Used/New" label normalised.

    Parameters
    ----------
    data : pandas.Series (or any mapping with ``.copy()``)
        One row of the listings frame; must have a "Used/New" entry.

    Returns
    -------
    The row itself when no change is needed, otherwise a copy whose
    "Used/New" value is "Certified". The input is never modified, because
    pandas does not support functions that mutate the object handed to
    ``DataFrame.apply``.
    """
    label = data["Used/New"]
    # Missing labels (NaN/None) are not strings; `in` on them would raise TypeError.
    if isinstance(label, str) and "Certified" in label:
        data = data.copy()
        data["Used/New"] = "Certified"
    return data
# Normalise every row in a single pass. A second, throw-away apply used only to
# peek at the result doubles the row-wise work, and its value is never rendered
# because it is not the cell's last expression.
data = data.apply(ToCertified, axis = "columns")

# Check the result: the unique labels after normalisation
data['Used/New'].unique()

# 4. ConsumerRating
Find the variables most strongly correlated with ConsumerRating

In [None]:
# Correlate the numeric columns only: from pandas 2.0 on, DataFrame.corr() no
# longer skips text columns by default and raises when it meets them.
corr_matrix = data.select_dtypes(include=["number", "bool"]).corr()
# Correlation of every numeric column with ConsumerRating, strongest first
# (not the cell's last expression, so Jupyter does not render it).
corr_matrix['ConsumerRating'].sort_values(ascending=False)
# Average consumer rating across all rows; this is the value the cell displays.
mean = data['ConsumerRating'].mean()
mean

In [None]:
# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas
from pandas.plotting import scatter_matrix

# Rating columns to compare pairwise in a scatter-matrix plot
attributes = [
    "ValueForMoneyRating",
    "ReliabilityRating",
    "ComfortRating",
    "PerformanceRating",
]
ratings_subset = data[attributes]
scatter_matrix(ratings_subset, figsize=(12, 8))
save_fig("scatter_matrix_plot")

In [None]:
# Correlate the numeric columns only: from pandas 2.0 on, DataFrame.corr() no
# longer skips text columns by default and raises when it meets them.
corr_matrix_price = data.select_dtypes(include=["number", "bool"]).corr()
# Correlation of every numeric column with ConsumerRating, strongest first
corr_matrix_price['ConsumerRating'].sort_values(ascending=False)


# RESULT
ValueForMoneyRating      0.917873,

ReliabilityRating        0.914597,

ComfortRating            0.860040,

PerformanceRating        0.805849,

are all highly correlated with ConsumerRating.

# 5. All CORRELATIONS
Display all correlations

In [None]:
# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas
from pandas.plotting import scatter_matrix

# Columns handed to the scatter-matrix plot.
# NOTE(review): scatter_matrix appears to plot numeric columns only, so the text
# columns in this list (e.g. Make, Model, VIN) likely have no effect — confirm.
attributes_all = [
    "Year", "Make", "Model", "Used/New", "Price",
    "ConsumerRating", "ConsumerReviews",
    "SellerType", "SellerName", "SellerRating",
    "InteriorColor", "Drivetrain", "MinMPG", "MaxMPG",
    "FuelType", "Transmission", "Engine", "VIN", "Stock#", "Mileage",
    "ValueForMoneyRating", "ReliabilityRating", "ComfortRating",
    "PerformanceRating",
]
all_columns_subset = data[attributes_all]
scatter_matrix(all_columns_subset, figsize=(24, 16))
save_fig("scatter_matrix_plot_all")

# 6. PRICE CORRELATIONS
Display all price correlations

In [None]:
# Correlate the numeric columns only: from pandas 2.0 on, DataFrame.corr() no
# longer skips text columns by default and raises when it meets them.
corr_matrix_price = data.select_dtypes(include=["number", "bool"]).corr()
# Correlation of every numeric column with Price, strongest first
corr_matrix_price['Price'].sort_values(ascending=False)

# Result 
Price is somewhat positively correlated with Year and negatively correlated with Mileage.

In [None]:
# Summary statistics of the frame (by default describe() covers the numeric columns)
data.describe()


# 7. MAKE
Find the most common car make

In [None]:
# Number of rows per make, most frequent first
data['Make'].value_counts()

# 8. HISTOGRAMS
Display histograms of all available numeric columns

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Draw the frame's histograms with 50 bins each on one 20x15 figure
data.hist(bins=50, figsize=(20,15))
# Save the figure to IMAGES_PATH before showing it
save_fig("cars_plot")
plt.show()

# 9. VALUE TYPES
Display types

In [None]:
# Column names, non-null counts and dtypes of the frame
data.info()

# 10. ONE-HOT ENCODING
One-hot encode the Make column for predictive modeling

In [None]:
# One-hot encode the Make column with pandas. drop_first=True leaves out the
# dummy column for the first make, which then acts as the baseline category.

hot_data = pd.get_dummies(
    data,
    columns=['Make'],
    prefix=['Make'],
    drop_first=True,
)
hot_data.head()