In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Kaggle "Car Features and MSRP" dataset and preview its first 5 records.
csv_path = "../input/cardataset/data.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
df.shape  # (number of rows, number of columns) of the DataFrame

In [None]:
df.columns  # Index holding the names of all columns

In [None]:
# Shorter column names used by every later cell (e.g. the target 'MSRP' becomes 'Price').
column_aliases = {
    'Transmission Type': 'Transmission',
    'Driven_Wheels': 'Drive Mode',
    'Engine HP': 'HP',
    'Engine Cylinders': 'Cylinders',
    'highway MPG': 'MPG-H',
    'city mpg': 'MPG-C',
    'MSRP': 'Price',
}
df.rename(columns=column_aliases, inplace=True)

In [None]:
df.info() # per-column data type and non-null count
# The "int64" and "float64" dtypes denote numerical data; the "object" dtype
# denotes categorical (string) data.

In [None]:
df.describe() # descriptive statistics; by default only the numeric columns are summarised

In [None]:
df.isnull().sum() # number of missing (null) values in each column

In [None]:
# Build an automated profiling report for the DataFrame and write it to an
# HTML file in the current working directory.
# NOTE(review): the pandas_profiling package has been renamed upstream to
# ydata-profiling — confirm which one the runtime image provides.
import pandas_profiling
profile = pandas_profiling.ProfileReport(df)
profile.to_file("cardata_1.html")

In [None]:
# Render the HTML report written by the previous cell inline in the notebook.
from IPython.display import HTML
HTML(filename='cardata_1.html')

In [None]:
# The profiling report warns that 'Model' has high cardinality and that
# 'Market Category' has high cardinality as well as 31.4% missing values, so
# both columns are dropped. Duplicate rows are removed too.
# Rows with missing values in the remaining columns that contain nulls are
# discarded as well: 'Engine Fuel Type', 'HP', 'Cylinders' and
# 'Number of Doors' ('HP' and 'Cylinders' are the renamed 'Engine HP' and
# 'Engine Cylinders').
#
# The steps are chained and re-assigned instead of using inplace=True, and
# errors='ignore' tolerates already-removed columns, so re-running this cell
# no longer raises KeyError.
df = (
    df.drop(columns=['Model', 'Market Category'], errors='ignore')
      .drop_duplicates()
      .dropna(subset=['Engine Fuel Type', 'HP', 'Cylinders', 'Number of Doors'])
)
df.shape

In [None]:
# Measures of center (mean, median, mode) of the price (the renamed MSRP column).
price = df['Price']
df_mean, df_median, df_mode = price.mean(), price.median(), price.mode()
print('Mean is ', df_mean)
print('Median is ', df_median)
print('Mode is ', df_mode)

In [None]:
# Median price for every combination of the categorical vehicle attributes.
group_keys = ['Make', 'Engine Fuel Type', 'Transmission',
              'Drive Mode', 'Vehicle Size', 'Vehicle Style']
msrp_median = df.groupby(group_keys)['Price'].median()
print(msrp_median)

In [None]:
# Spread of the price. pandas' Series.std() defaults to ddof=1, i.e. the
# sample standard deviation.
df_sd = df['Price'].std()
print('Standard Deviation is ',df_sd)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# sns.set(style="whitegrid")
# plt.figure(figsize=(10,8))
# ax = sns.boxplot(x='Year', y='Price', data=df, orient="v")
# plt.show()
# plt.figure(figsize=(10,8))
# ax = sns.boxplot(x='HP', y='Price', data=df, orient="v")
# plt.show()
# plt.figure(figsize=(10,8))
# ax = sns.boxplot(x='Cylinders', y='Price', data=df, orient="v")
# plt.show()
# plt.figure(figsize=(10,8))
# ax = sns.boxplot(x='Number of Doors', y='Price', data=df, orient="v")
# plt.show()
# plt.figure(figsize=(10,8))
# ax = sns.boxplot(x='MPG-H', y='Price', data=df, orient="v")
# plt.show()
# plt.figure(figsize=(10,8))
# ax = sns.boxplot(x='MPG-C', y='Price', data=df, orient="v")
# plt.show()
# plt.figure(figsize=(10,8))
# ax = sns.boxplot(x='Popularity', y='Price', data=df, orient="v")
# plt.show()
# plt.figure(figsize=(10,8))
# ax = sns.boxplot(x='Price', y='Price', data=df, orient="v")
# plt.show()


In [None]:
# Distribution (histogram + kernel density estimate) of every numeric column,
# one figure per column.
# sns.distplot is deprecated, so the figure-level replacement sns.histplot is
# used; kde=True with stat='density' keeps the density-normalised histogram
# and KDE overlay that distplot drew. The eight copy-pasted plot stanzas are
# folded into a single loop.
numeric_columns = ['Year', 'HP', 'Cylinders', 'Number of Doors',
                   'MPG-H', 'MPG-C', 'Popularity', 'Price']
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(14, 8))
    # A fixed bin count keeps heavy-tailed columns such as 'Price' cheap to draw.
    sns.histplot(df[column], kde=True, stat='density', bins=50, ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()

In [None]:
# Categorical data: how often each category occurs in every categorical column.
cd_make, cd_eft, cd_tt, cd_dw, cd_vsz, cd_vst = (
    df[feature].value_counts()
    for feature in ('Make', 'Engine Fuel Type', 'Transmission',
                    'Drive Mode', 'Vehicle Size', 'Vehicle Style')
)

# Wrap each count Series in a one-column frame named after its feature; the
# pie plots select that column through y=...
df_make = pd.DataFrame({'Make': cd_make})
df_eft = pd.DataFrame({'Engine Fuel Type': cd_eft})
df_tt = pd.DataFrame({'Transmission': cd_tt})
df_dw = pd.DataFrame({'Drive Mode': cd_dw})
df_vsz = pd.DataFrame({'Vehicle Size': cd_vsz})
df_vst = pd.DataFrame({'Vehicle Style': cd_vst})

# Share of each make; autopct labels every wedge with its percentage.
df_make.plot.pie(y='Make', figsize=(15, 15), autopct='%1.1f%%')



In [None]:
# Pie chart of the engine fuel type counts; autopct labels each wedge with its percentage (one decimal).
df_eft.plot.pie(y='Engine Fuel Type', figsize=(15,15), autopct='%1.1f%%')


In [None]:
# Pie chart of the transmission counts; autopct labels each wedge with its percentage (one decimal).
df_tt.plot.pie(y='Transmission', figsize=(15,15), autopct='%1.1f%%')


In [None]:
# Pie chart of the drive mode counts; autopct labels each wedge with its percentage (one decimal).
df_dw.plot.pie(y='Drive Mode', figsize=(15,15), autopct='%1.1f%%')


In [None]:
# Pie chart of the vehicle size counts; autopct labels each wedge with its percentage (one decimal).
df_vsz.plot.pie(y='Vehicle Size', figsize=(15,15), autopct='%1.1f%%')


In [None]:
# Pie chart of the vehicle style counts; autopct labels each wedge with its percentage (one decimal).
df_vst.plot.pie(y='Vehicle Style', figsize=(15,15), autopct='%1.1f%%')

In [None]:
# Work on a copy so that `df` keeps the original categorical values; all
# encoding and replacement happens on `df_replaced`.
df_replaced = df.copy()

# 'Make' has many distinct values, so it is label-encoded: every make gets an
# integer code in order of first appearance.
# The mapping is applied to the 'Make' column only. A frame-wide
# DataFrame.replace(dict) scans every column and would also rewrite any cell
# elsewhere that happens to equal a make name.
unique_Make = list(df.Make.unique())
replace_dict_Make = dict(zip(unique_Make, range(len(unique_Make))))
df_replaced['Make'] = df_replaced['Make'].map(replace_dict_Make)

# One-hot encode the remaining categorical columns in a single call.
# Keys are the source columns, values the prefixes of the generated dummy
# columns; drop_first=True drops the first level of every feature.
dummy_prefixes = {
    'Engine Fuel Type': 'EFT',
    'Transmission': 'Transmission',
    'Drive Mode': 'DM',
    'Vehicle Size': 'VSize',
    'Vehicle Style': 'VStyle',
}
df_replaced = pd.get_dummies(
    df_replaced,
    columns=list(dummy_prefixes),
    prefix=dummy_prefixes,
    drop_first=True,
)

# from sklearn.preprocessing import LabelEncoder
# lb_tr = LabelEncoder()
# df_replaced['Transmission'] = lb_tr.fit_transform(df_replaced['Transmission'])

print(df_replaced.head())

# unique_eft = list(df['Engine Fuel Type'].unique())
# replace_dict_eft = dict(zip(range(len(unique_eft)), unique_eft))
# print(replace_dict_eft)

# df.shape
# df_replaced.boxplot('Price','Make',rot = 30,figsize=(5,6))
# df.boxplot('Price','Transmission',rot = 30,figsize=(5,6))
# df.boxplot('Price','Engine Fuel Type',rot = 30,figsize=(5,6))
# df.boxplot('Price','Drive Mode',rot = 30,figsize=(5,6))
# df.boxplot('Price','Vehicle Size',rot = 30,figsize=(5,6))
# df.boxplot('Price','Vehicle Style',rot = 30,figsize=(5,6))

# **Categorical values to numerical values**

In [None]:
# cat_col = ['Make', 'Engine Fuel Type', 'Transmission', 'Drive Mode', 'Vehicle Size', 'Vehicle Style']
# df[cat_col].head()
# #pd.get_dummies(df[cat_col], drop_first=True)
# for i in range(6):
#     print("\n\nUnique values in ", cat_col[i], df[cat_col[i]].unique())


# **Train & Test**

In [None]:
# Feature matrix (X): every column of the encoded frame except the target 'Price'.
X = df_replaced.drop(columns=['Price'])
X.head()

In [None]:
X.shape  # (number of rows, number of feature columns)

In [None]:
# Response vector (y): the 'Price' column of the encoded frame, i.e. the
# value the model is trained to predict.
y = df_replaced['Price']
y.head()

In [None]:
# A bare `y.shape` in the middle of a cell is evaluated but never displayed
# (only the last expression of a cell is shown), so print it explicitly.
print(y.shape)
# info() prints its summary itself.
df_replaced.info()


In [None]:
import sklearn
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
    shuffle=True,
)

In [None]:
# Report the shape of every piece of the train/test split.
split_parts = (
    ("X_train: ", X_train),
    ("X_test: ", X_test),
    ("y_train: ", y_train),
    ("y_test: ", y_test),
)
for label, part in split_parts:
    print(label, part.shape)


# **Train**

In [None]:
# Descriptive statistics of the encoded frame that feeds the model.
df_replaced.describe()

In [None]:
# Train a gradient-boosted regression model on the training set.
import xgboost as xgb

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_test, label=y_test)

# Both sets are scored every round; when several evaluation sets are given,
# XGBoost uses the last entry ('valid') for early stopping.
# NOTE(review): the held-out test split doubles as the early-stopping
# validation set, so the 'valid' RMSE is an optimistic estimate — consider
# carving a separate validation split out of the training data.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
xgb_params = {'objective': 'reg:squarederror',
              'eval_metric': 'rmse',
              'seed': 5,
              # 'silent' is deprecated in XGBoost; 'verbosity': 0 is its
              # replacement and suppresses library messages the same way.
              'verbosity': 0}
# Keyword arguments make the meaning of each setting explicit: up to 1000
# boosting rounds, progress logged every 10 rounds, and training stops once
# the 'valid' RMSE has not improved for 100 consecutive rounds.
model = xgb.train(
    xgb_params,
    d_train,
    num_boost_round=1000,
    evals=watchlist,
    maximize=False,
    verbose_eval=10,
    early_stopping_rounds=100,
)

# making predictions on the testing set
y_pred = model.predict(d_valid)


In [None]:
# Visual inspection of the trained booster: a feature-importance bar chart,
# then the tree at index 2 drawn twice — once via matplotlib (plot_tree) and
# once as a graphviz object (to_graphviz), which is the cell's displayed value.
# NOTE(review): both tree renderers presumably need the graphviz package —
# confirm it is available in the runtime image.
xgb.plot_importance(model)
xgb.plot_tree(model, num_trees=2)
xgb.to_graphviz(model, num_trees=2)