In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated entries of the form '<directory>:<URL-encoded download URL>'; the loop
# below splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): the URL carries X-Goog-Date=20240224T020407Z and X-Goog-Expires=259200, so
# this signed link has presumably expired — confirm and regenerate before re-running.
DATA_SOURCE_MAPPING = 'used-car-price-prediction-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3742543%2F6478229%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240224%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240224T020407Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D8222b9125d72a6e9ba2fdd3a64b02c10526cd7f94013cc7f524f73e4bf0e07b3a7ad0fa00ced11234dea40fd9347f9ae5bccd8722cdfe4e21441df4671113187b8a8773a7feb729535bfa37ea5ea5d781dffe3b8b81e8e60beb07f4482cc8647e0c914ee706d690720ea541540662063309084665100c3be5edf77ba7cc2edaf3b3efbec6eb84ca3567b6ba800c9b7a674e72cf79a41f6ac43f996017ebd004635ff1cb840c02521d2d450e542c7513789f5bdadf2e9bc08e0dcb9fec7c92228da1baf366efce8830eca9233c6470f9c814333df8f890b0a19a8dbe9fc3cf251e2aafd570a6c5e624363f86f257051b02f58b5003ee1738f05ba0b5e4b1e2e3e'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory> and unpack it.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is '<directory>:<URL-encoded URL>'; split only once so the URL part can
    # never be cut by a stray ':'.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the progress bar simply stays empty
            # instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # tarfile.open() below re-opens the temp file by name, so buffered bytes must
            # be on disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: `as tarfile` would replace the imported
                # tarfile module and break the next non-zip download.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('/kaggle/input/used-car-price-prediction-dataset/used_cars.csv')
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
(df.isnull().sum()/len(df))*100

**Unique car Brands in the Dataset**

In [None]:
# Distinct brand labels in order of first appearance
pd.unique(df['brand'])

**Number of cars in each brand in the dataset**

In [None]:
brand_count = df['brand'].value_counts()
brand_count

In [None]:
brand_count.sum()

In [None]:
brand_count[brand_count > 25].sum()

In [None]:
(brand_count.sum() - brand_count[brand_count > 25].sum())/len(df)

**Filter and find car brands with a sample size of more than 25**

In [None]:
# Per-brand counts restricted to brands with more than 25 listings
brand_count_more_25 = brand_count.loc[brand_count.gt(25)]
brand_count_more_25

**Function to filter the dataset down to brands with more than 25 samples**

In [None]:
def filter_car(x):
    """Return True when brand ``x`` is one of the index labels of ``brand_count_more_25``."""
    return x in brand_count_more_25.index

In [None]:
filter_car('MINI')

In [None]:
filter_car('Suzuki')

In [None]:
df1 = df[df['brand'].apply(filter_car)]
df1.head()

In [None]:
df1.shape

In [None]:
df1.isnull().sum()

In [None]:
df1['model'].nunique()

In [None]:
df1['fuel_type'].unique()

In [None]:
df1['fuel_type'].value_counts()

In [None]:
df1[df1['fuel_type'] == 'not supported']

In [None]:
df1[df1['fuel_type'] == '–']

In [None]:
#replace all '–' with null values

def replace_dash(x):
    """Map the en-dash placeholder '–' to NaN; return every other value unchanged."""
    return np.nan if x == '–' else x

In [None]:
df2 = df1.applymap(replace_dash)

In [None]:
df2.isnull().sum()

In [None]:
#df2.dropna()

In [None]:
df2['fuel_type'].value_counts()

**Remove cars with the "Plug-In Hybrid" and "not supported" fuel types, as their sample sizes are very small**

In [None]:
df3 = df2[df2['fuel_type'].isin(['Gasoline','Hybrid','E85 Flex Fuel','Diesel'])]
df3.head()

In [None]:
df3.groupby('brand')['fuel_type'].value_counts().unstack().fillna(0)

**Fill all the missing values in fuel type with "Gasoline", since Gasoline is the most common fuel type for every brand**

In [None]:
df3['fuel_type'] = df3['fuel_type'].fillna('Gasoline')

In [None]:
df3.isnull().sum()

In [None]:
df3[df3['transmission'].isnull()]

In [None]:
df3['transmission'] = df3['transmission'].fillna('_')

In [None]:
df3['transmission'].unique()

In [None]:
df3['transmission'].nunique()

In [None]:
df3['transmission'].value_counts()

In [None]:
def fix_transmission_category(x):
    """Collapse a transmission description to 'Automatic' or 'Manual' when a marker is found.

    'A/T' or 'Automatic' anywhere in ``x`` gives 'Automatic'; otherwise 'M/T' or 'Manual'
    gives 'Manual'; anything else is returned unchanged.
    """
    # Order matters: automatic markers are checked before manual ones.
    rules = (
        ('Automatic', ('A/T', 'Automatic')),
        ('Manual', ('M/T', 'Manual')),
    )
    for category, markers in rules:
        if any(marker in x for marker in markers):
            return category
    return x

In [None]:
fix_transmission_category('Automatic, 8-Spd PDK Dual-Clutch')

In [None]:
df3.isnull().sum()

In [None]:
df3['transmission'] = df3['transmission'].astype(str).apply(fix_transmission_category)

In [None]:
df3['transmission'].value_counts()

In [None]:
df3['transmission'] = df3['transmission'].apply(lambda x : x if x in ['Automatic','Transmission w/Dual Shift Mode',
                                                                      'Manual' , 'CVT Transmission','_'] else 'Others')

In [None]:
df3['transmission'].value_counts()

In [None]:
df3[df3['transmission'] == '_']

In [None]:
df3.groupby('brand')['transmission'].value_counts().unstack().fillna(0)

In [None]:
df3['transmission'] = df3['transmission'].str.replace('_','Automatic')

In [None]:
df3['transmission'].value_counts()

In [None]:
df3.isnull().sum()

In [None]:
df4 = df3.fillna('_')

In [None]:
(df4 == '_').sum()

In [None]:
df4[df4['ext_col'] == '_']

In [None]:
df4['ext_col'].nunique()

In [None]:
df4['int_col'].nunique()

In [None]:
df4['ext_col'].value_counts().head(10)

In [None]:
df4['ext_col'] = df4['ext_col'].apply(lambda x : x if x in ['Black','White','Gray','Silver','Blue','Red','_'] else 'Others')

In [None]:
df4['ext_col'].value_counts()

In [None]:
df4['int_col'].value_counts().head(10)

In [None]:
df4['int_col'] = df4['int_col'].apply(lambda x : x if x in ['Black','Beige','Gray','Brown','Red','_'] else 'Others')

In [None]:
df4['int_col'].value_counts()

In [None]:
df4.groupby('brand')['int_col'].value_counts().unstack().fillna(0)

In [None]:
df4.groupby('brand')['ext_col'].value_counts().unstack().fillna(0)

In [None]:
df4.groupby('ext_col')['int_col'].value_counts().unstack().fillna(0)

In [None]:
df4['ext_col'] = df4['ext_col'].str.replace('_','Black')
df4['int_col'] = df4['int_col'].str.replace('_','Black')

In [None]:
#We replaced all null values with '_'
(df4 == '_').sum()

In [None]:
df4['accident'].unique()

In [None]:
df4['accident'].value_counts()

In [None]:
df4['accident'] = df4['accident'].str.replace('_','None reported')
df4['accident'] = df4['accident'].str.replace('At least 1 accident or damage reported','reported')

In [None]:
df4['accident'].value_counts()

In [None]:
(df4 == '_').sum()

In [None]:
df4['clean_title'].unique()

In [None]:
df4['clean_title'].value_counts()

In [None]:
#Only 'Yes' was given info in the clean title column so drop it
df5 = df4.drop('clean_title' , axis = 'columns')
df5.head()

In [None]:
df5.head()

In [None]:
df5.nunique()

In [None]:
age = 2023 - df5['model_year']
df5.insert(2,'Age',age)

In [None]:
df5.head()

In [None]:
df6 = df5.drop('model_year' , axis = 'columns')
df6.head()

In [None]:
df6['milage'] = df6['milage'].str.replace(',','')

In [None]:
df6.head()

In [None]:
k = '9835 mi.'
int(k.split(' ')[0])

In [None]:
df6['milage'] = df6['milage'].apply(lambda x : int(x.split(' ')[0]))

In [None]:
df6.head()

In [None]:
df6.dtypes

In [None]:
df6['price'] = df6['price'].str.replace('$','').str.replace(',','').astype(float)

In [None]:
df6.head()

In [None]:
df6.dtypes

In [None]:
df6.nunique()

In [None]:
df6['model'].value_counts()

In [None]:
df6['engine'].value_counts().head(10)

In [None]:
hp = df6['engine'].apply(lambda x : x.split('HP')[0]).apply(pd.to_numeric, errors='coerce')

In [None]:
litres = df6['engine'].apply(lambda x : x.split('L')[0].split(' ')[-1]).apply(pd.to_numeric, errors='coerce')

In [None]:
df6.insert(6,"HP",hp)
df6.insert(7,"Liters",litres)

In [None]:
df6.head()

In [None]:
df7 = df6.drop(['model','engine'], axis = 'columns')
df7.head()

In [None]:
df7.isnull().sum()

In [None]:
hp_mean = round(df7.groupby('brand')['HP'].mean(),2)
hp_mean

In [None]:
hp_mean['Mazda']

In [None]:
#filter the car into two parts- with missing HP & non-missing HP
df7_hp0 = df7[df7['HP'].isnull()]
df7_hp1 = df7[~df7['HP'].isnull()]

In [None]:
df7_hp0['HP'] = df7_hp0['brand'].apply(lambda x: hp_mean[x])

In [None]:
df8 = pd.concat((df7_hp0,df7_hp1),axis='rows',ignore_index = True)
df8.head()

In [None]:
df8.isnull().sum()

In [None]:
liters_mean = round(df8.groupby('brand')['Liters'].mean(),1)
liters_mean

In [None]:
liters_mean['Volvo']

In [None]:
#Seperate data into two parts => with missing values of liters & non missing values of liters
df8_lit0 = df8[df8['Liters'].isnull()]
df8_lit1 = df8[~df8['Liters'].isnull()]

In [None]:
def fix_missing_liters(brand):
    """Return the entry of ``liters_mean`` stored under the label ``brand``."""
    return liters_mean.loc[brand]

In [None]:
fix_missing_liters('Porsche')

In [None]:
df8_lit0.head()

In [None]:
df8_lit0['Liters'] = df8_lit0['brand'].apply(fix_missing_liters)

In [None]:
#df8_lit0['Liters'] = df8_lit0['brand'].apply(lambda x : liters_mean[x])

In [None]:
df9 = pd.concat((df8_lit1,df8_lit0) , axis = 'rows' , ignore_index = True)
df9.head()

**Label Encoding**

In [None]:
df10 = df9.copy()

In [None]:
df10.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df10['brand'] = le.fit_transform(df10['brand'])
df10['fuel_type'] = le.fit_transform(df10['fuel_type'])
df10['transmission'] = le.fit_transform(df10['transmission'])
df10['ext_col'] = le.fit_transform(df10['ext_col'])
df10['int_col'] = le.fit_transform(df10['int_col'])
df10['accident'] = le.fit_transform(df10['accident'])

In [None]:
df10.head()

In [None]:
df10['milage'].hist()

In [None]:
import seaborn as sns
sns.pairplot(df10[['Age','milage','accident','HP','Liters','price']])

In [None]:
df10.corr()

In [None]:
X = df10.drop('price',axis='columns')
Y = df10['price']

In [None]:
Y.hist()

In [None]:
np.log(Y).hist()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Baseline: linear regression on label-encoded features, raw price as target.
# A fixed random_state makes the split — and therefore the metrics below — identical on every run.
xtrain,xtest,ytrain,ytest = train_test_split(X, Y, random_state=42)
model0 = LinearRegression()

model0.fit(xtrain,ytrain)

from sklearn.metrics import mean_absolute_error,r2_score

ytrainP = model0.predict(xtrain)
ytestP = model0.predict(xtest)

# MAE is in the unit of the target (price)
maeTrain = mean_absolute_error(ytrain,ytrainP)
maeTest = mean_absolute_error(ytest,ytestP)

r2Train = r2_score(ytrain,ytrainP)
r2Test = r2_score(ytest,ytestP)

print({"MAE_Train" : maeTrain , "MAE_Test" : maeTest})
print({"R2_Train" : r2Train , "R2_Test" : r2Test})

In [None]:
#Log transformation of target Y => to get a Normal distributed target value
Yt = np.log(Y)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

xtrain,xtest,ytrain,ytest = train_test_split(X,Yt)
model1 = LinearRegression()

model1.fit(xtrain,ytrain)

In [None]:
from sklearn.metrics import mean_absolute_error,r2_score

ytrainP = model1.predict(xtrain)
ytestP = model1.predict(xtest)

maeTrain = mean_absolute_error(ytrain,ytrainP)
maeTest = mean_absolute_error(ytest,ytestP)

r2Train = r2_score(ytrain,ytrainP)
r2Test = r2_score(ytest,ytestP)

print({"MAE_Train" : maeTrain , "MAE_Test" : maeTest})
print({"R2_Train" : r2Train , "R2_Test" : r2Test})

**One Hot Encoding**

In [None]:
# Fresh copy of the cleaned frame (df9) for the one-hot-encoding experiment
df11 = df9.copy()

In [None]:
df11.head()

In [None]:
# Disabled experiment: feature interactions (HP * Liters and Age * milage)
#hpl = df11['HP'] * df11['Liters']
#df11.insert(6,"HPL",hpl)

#aml = df11['Age'] * df11['milage']
#df11.insert(3,"AML",aml)

In [None]:
df11.head()

In [None]:
df11.columns

In [None]:
# Pairwise correlations between the numeric features and the price
df11[['Age','milage','HP', 'Liters','price']].corr()

In [None]:
'''
from sklearn.preprocessing import MinMaxScaler
scl = MinMaxScaler()

scl.fit(df11[['Age','milage','AML','HP', 'Liters', 'HPL']])

df11[['Age','milage','AML','HP', 'Liters', 'HPL']] = pd.DataFrame(scl.transform(df11[['Age','milage','AML',
                                                                                          'HP', 'Liters', 'HPL']]),
                                                                  columns = ['Age','milage','AML','HP', 'Liters', 'HPL'])
df11.head()
'''

In [None]:
# Same correlation table as before; the scaling cell above is disabled (kept as a string
# literal), so df11 is unchanged in between.
df11[['Age','milage','HP', 'Liters','price']].corr()

In [None]:
df12 = pd.get_dummies(df11).applymap(lambda x : float(x))
df12.head()

In [None]:
X = df12.drop('price' , axis = 'columns')
Y = df12['price']

In [None]:
Yt = np.log(Y)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

xtrain,xtest,ytrain,ytest = train_test_split(X,Yt)
model2 = LinearRegression()

model2.fit(xtrain,ytrain)

In [None]:
from sklearn.metrics import mean_absolute_error,r2_score

ytrainP = model2.predict(xtrain)
ytestP = model2.predict(xtest)

maeTrain = mean_absolute_error(ytrain,ytrainP)
maeTest = mean_absolute_error(ytest,ytestP)

r2Train = r2_score(ytrain,ytrainP)
r2Test = r2_score(ytest,ytestP)

print({"MAE_Train" : maeTrain , "MAE_Test" : maeTest})
print({"R2_Train" : r2Train , "R2_Test" : r2Test})

In [None]:
feature_coef = pd.DataFrame(model2.coef_.reshape(1,-1) , columns = xtrain.columns)
feature_coef

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize = (16,8))
plt.bar(height = feature_coef.loc[0] , x = feature_coef.columns)
plt.xticks(rotation = 90)
plt.show()

**Save the model for deployment**

In [None]:
import pickle

In [None]:
with open("carPprice.pkl","wb") as f:
    f.write(pickle.dumps(model2))

In [None]:
'''
with open("carScale.pkl","wb") as f:
    f.write(pickle.dumps(scl))
'''

In [None]:
data_columns = {'columns' : list(xtrain.columns)}
data_columns

In [None]:
import json

with open("data_columns.json","w") as f:
    f.write(json.dumps(data_columns))

**Predictions**

In [None]:
df9.head(2)

In [None]:
df11.head(2)

In [None]:
brand = 'Hyundai'
age = 2
milage = 9835
fueltype = 'Gasoline'
hp = 320
liters = 2
transmission = 'Automatic'
extcol = 'Black'
intcol= 'Black'
accident = 'None reported'

In [None]:
input = np.zeros(len(xtrain.columns))
input

In [None]:
xtrain.columns

In [None]:
input[0] = age
input[1] = milage
input[2] = hp
input[3] = liters


#reshape=>convert into two dimension => (1,-1) => -1 will use higher dimension i.e. 6
#input_scaled = scl.transform(input[0:6].reshape(1,-1))
#input[0:6] = input_scaled


print(input)

In [None]:
list(xtrain.columns).index('brand_'+brand)

In [None]:
input[list(xtrain.columns).index('brand_'+brand)] = 1
input[list(xtrain.columns).index('fuel_type_'+fueltype)] = 1
input[list(xtrain.columns).index('transmission_'+transmission)] = 1
input[list(xtrain.columns).index('ext_col_'+extcol)] = 1
input[list(xtrain.columns).index('int_col_'+intcol)] = 1
input[list(xtrain.columns).index('accident_'+accident)] = 1

print(input)

In [None]:
input = pd.DataFrame(input.reshape(1,-1) , columns = xtrain.columns)
print(model2.predict(input))      #log value of price predicted

In [None]:
np.exp(model2.predict(input))

In [None]:
predicted_price = round(np.exp(model2.predict(input))[0])
print("Predicted Price-",predicted_price)  #take exp value of log value generated to get real predicted price