In [None]:
# Download the EDA repository; the CSV loaded later ("EDA/CarPrice_Assignment.csv")
# is read from the directory this creates.
# NOTE(review): re-running this cell while ./EDA already exists will presumably
# make git refuse to clone — confirm, or remove the directory first.
!git clone https://github.com/Foursteps-tech/EDA.git

Cloning into 'EDA'...
remote: Enumerating objects: 17, done.
remote: Counting objects: 100% (17/17), done.
remote: Compressing objects: 100% (13/13), done.
remote: Total 17 (delta 6), reused 10 (delta 2), pack-reused 0
Unpacking objects: 100% (17/17), done.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

# Load the car-price dataset from the cloned EDA repository into a DataFrame.
cars = pd.read_csv(filepath_or_buffer="EDA/CarPrice_Assignment.csv")

In [None]:
# Summary of the dataset (expected: 205 rows, 26 columns, no null values).
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
cars.info()

In [None]:
# Preview the first rows of the freshly loaded dataset.
cars.head()

In [None]:
# symboling runs from -2 (least risky) to +3 (most risky).
# Most cars fall in the 0, 1 or 2 ratings.
(
    cars['symboling']
    .astype('category')
    .value_counts()
)

In [None]:
# aspiration: an (internal combustion) engine property telling whether the
# oxygen intake is standard (atmospheric pressure) or turbocharged
# (pressurised oxygen intake).
(
    cars['aspiration']
    .astype('category')
    .value_counts()
)

In [None]:
# drivewheel: front-wheel, rear-wheel or four-wheel drive.
(
    cars['drivewheel']
    .astype('category')
    .value_counts()
)

In [None]:
# Keep only the numeric columns. 'number' matches every numeric dtype, so the
# selection does not hinge on how the 'int' alias is resolved (the original
# ['float64', 'int'] list relied on that resolution).
cars_numeric = cars.select_dtypes(include='number')
cars_numeric.head()

In [None]:
# Remove symboling and car_ID from the numeric subset, then preview the result.
cars_numeric = cars_numeric.drop(columns=['symboling', 'car_ID'])
cars_numeric.head()

In [None]:
# Pairwise correlation matrix of the numeric columns (Pearson, the default).
cor = cars_numeric.corr(method='pearson')
cor

In [None]:
# Draw the correlation matrix as an annotated heatmap on a 16x8 inch figure.
fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(cor, annot=True, cmap="YlGnBu", ax=ax)
plt.show()

In [None]:
# Variable formats: list each column of `cars` with its dtype and non-null count.
cars.info()

In [None]:
# CarName: show the first 30 entries.
cars['CarName'].head(30)

In [None]:
# The text before the first space of CarName is taken as the company name.
carnames = cars['CarName'].map(lambda full_name: full_name.partition(" ")[0])
carnames.head(30)

In [None]:
# Store the leading word of CarName (text before the first space) as car_company.
cars['car_company'] = cars['CarName'].map(
    lambda full_name: full_name.partition(" ")[0]
)

In [None]:
# Frequency of every distinct car_company value (exposes spelling variants).
(
    cars['car_company']
    .astype('category')
    .value_counts()
)

In [None]:
# Normalise misspelled / inconsistently cased car_company names.
# Keys are the variants found in the data; values are the canonical spelling.
company_spelling_fixes = {
    "vw": 'volkswagen',
    "vokswagen": 'volkswagen',
    "porcshce": 'porsche',
    "toyouta": 'toyota',
    "Nissan": 'nissan',
    "maxda": 'mazda',
}

# Exact-match replacement; names not listed above are left untouched.
cars['car_company'] = cars['car_company'].replace(company_spelling_fixes)

In [None]:
# Re-check the car_company frequencies after the spelling corrections.
(
    cars['car_company']
    .astype('category')
    .value_counts()
)

In [None]:
# CarName has been distilled into car_company, so remove the original column.
cars = cars.drop(columns=['CarName'])

In [None]:
# Inspect the column list and dtypes again after replacing CarName with car_company.
cars.info()

In [None]:
# Split into predictors (X) and target (y).
# Predictor columns, in the order they should appear in X.
feature_columns = [
    'symboling', 'fueltype', 'aspiration', 'doornumber',
    'carbody', 'drivewheel', 'enginelocation',
    'wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight',
    'enginetype', 'cylindernumber', 'enginesize', 'fuelsystem',
    'boreratio', 'stroke', 'compressionratio',
    'horsepower', 'peakrpm', 'citympg', 'highwaympg',
    'car_company',
]
X = cars.loc[:, feature_columns]

# Target: the car price.
y = cars['price']

In [None]:
# Dummy-variable creation, step 1: pull out the object-dtype (categorical)
# predictors from X and preview them.
cars_categorical = X.select_dtypes(include='object')
cars_categorical.head()

In [None]:
# Encode the categorical predictors as dummy columns; drop_first=True omits the
# first level of each variable.
cars_dummies = pd.get_dummies(data=cars_categorical, drop_first=True)
cars_dummies.head()

In [None]:
# Remove the raw categorical columns from X.
X = X.drop(columns=cars_categorical.columns)

In [None]:
# Append the dummy columns to the remaining predictors, side by side.
X = pd.concat(objs=[X, cars_dummies], axis='columns')