In [None]:
import re
import sklearn
import  numpy  as  np 
import  pandas  as  pd 
import matplotlib.pyplot as vis

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the car listings from the CSV file (path is relative to the notebook's
# working directory) and preview the first rows.
data = pd.read_csv('CarData.csv')
data.head()

### Prepare the dataset to try to predict the correct price of the car.
The target is: `Current_price`


In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Print the column names, dtypes and non-null counts of the frame.
data.info()

In [None]:
def remove_outliers(data, factor=3.0):
    """Mask values that fall outside the inter-quartile-range fence.

    For every numeric column the fence is ``[Q1 - factor*IQR, Q3 + factor*IQR]``
    where ``Q1``/``Q3`` are the 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Input data. Non-numeric DataFrame columns (e.g. text) are ignored when
        computing the fence and are passed through unchanged.
    factor : float, default 3.0
        Multiplier applied to the IQR on both sides of the fence.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        * DataFrame input: a new frame of the same shape in which out-of-fence
          values are replaced by NaN. No rows are dropped, so the caller is
          expected to impute or drop the resulting gaps.
        * Series input: a new Series with the out-of-fence elements dropped.

    The input object is never modified.
    """
    if isinstance(data, pd.Series):
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        outliers = (data < (q1 - factor * iqr)) | (data > (q3 + factor * iqr))
        return data[~outliers]

    # Quantiles and ordering comparisons are only defined for numeric columns;
    # running them over text columns raises a TypeError on recent pandas.
    numeric = data.select_dtypes(include='number')
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr

    # Build a full-shape boolean mask: non-numeric columns are never outliers.
    outliers = (
        ((numeric < lower) | (numeric > upper))
        .reindex(columns=data.columns, fill_value=False)
        .astype(bool)
    )
    # Keep in-fence values, replace the rest with NaN.
    return data.where(~outliers)

In [None]:
# Mask extreme values (they come back as NaN), then fill the gaps in the
# numeric columns with each column's median.
data = remove_outliers(data)

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# data['col'] is chained assignment: it may act on a temporary copy, emits a
# FutureWarning on pandas 2.x and does nothing under Copy-on-Write.
# NOTE(review): the target column Current_price is imputed as well — confirm
# that replacing target outliers with the median is intended.
for column in ['Current_price', 'Selling_Price', 'Kms_Driven', 'Owner']:
    data[column] = data[column].fillna(data[column].median())

In [None]:
# Re-count missing values per column after the median imputation.
data.isna().sum()

In [None]:
# Binary-encode the two-valued categorical columns:
#   Seller_Type  -> 1 for 'Dealer', 0 for anything else
#   Transmission -> 1 for 'Manual', 0 for anything else
# Fuel_Type is left untouched here; it is one-hot encoded with get_dummies
# in a later cell.
data['Seller_Type'] = (data['Seller_Type'] == 'Dealer').astype('int64')
data['Transmission'] = (data['Transmission'] == 'Manual').astype('int64')

In [None]:
# One-hot encode Fuel_Type. drop_first=True drops the indicator of the first
# category, leaving k-1 columns for k fuel types; later cells select the
# 'Diesel' and 'Petrol' indicators by name.
fuel = pd.get_dummies(data['Fuel_Type'], drop_first=True)
print(fuel.head(2))

In [None]:
# Append the fuel indicator columns to the main frame (column-wise concat,
# aligned on the row index).
# NOTE(review): re-running this cell on its own appends the indicator columns
# a second time.
data = pd.concat([data, fuel], axis=1)

In [None]:
# Drop the columns that are not used as model inputs: the raw Fuel_Type text
# (replaced by its indicator columns), Car_Name and Owner.
data = data.drop(columns=['Fuel_Type', 'Car_Name', 'Owner'])

In [None]:
# Preview the frame after encoding and dropping the unused columns.
data.head()

### Normalization 

In [None]:
# Min-max scale every column of the prepared frame (features and target alike)
# and wrap the resulting array in a DataFrame that reuses the column names.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split below, so test rows influence the scaling — consider fitting on the
# training split only.
scaler = MinMaxScaler()
norm_data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

# Summary statistics of the scaled columns, rounded to two decimals.
norm_data.describe().round(2)

In [None]:
# Show the first row of the min-max scaled frame.
norm_data.head(1)

In [None]:
# Features and target taken from the min-max scaled frame.
feature_columns = ['Year', 'Selling_Price', 'Kms_Driven', 'Diesel', 'Petrol',
                   'Seller_Type', 'Transmission']
X_norm = norm_data[feature_columns]
y_norm = norm_data['Current_price']

# 70/30 train/test split. A fixed random_state makes the shuffle, and with it
# the metrics reported below, reproducible across kernel restarts.
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(
    X_norm, y_norm, test_size=0.3, random_state=42
)

In [None]:
# Fit a linear regression on the normalized training split and evaluate it on
# the held-out split.
lr = LinearRegression().fit(X_train_norm, y_train_norm)
pred = lr.predict(X_test_norm)

# Both metrics are computed on the min-max scaled target, so the MAE is not
# expressed in the original price units.
score = r2_score(y_test_norm, pred)
mae = mean_absolute_error(y_test_norm, pred)

print(f"R2 score:{score}")
print(f'Mean absolute error is {mae}')

In [None]:
# Pairwise correlation heatmap of the normalized columns, with the colour
# scale pinned to the full correlation range [-1, 1].
corr = norm_data.corr()
labels = norm_data.columns
ticks = np.arange(len(labels))

fig, ax = vis.subplots(figsize=(5, 5))
heatmap = ax.matshow(corr, cmap='inferno', vmin=-1, vmax=1)
fig.colorbar(heatmap)

# One tick per column on both axes; x tick labels are rotated to stay legible.
ax.set_xticks(ticks)
vis.xticks(rotation=90)
ax.set_yticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
vis.show()

In [None]:
# Scatter of the actual test targets (x-axis) against the model predictions
# (y-axis), both in min-max scaled units.
vis.scatter(y_test_norm,pred)
vis.show()

### Standardization

In [None]:
# Standardize every column of the prepared frame (features and target alike)
# and wrap the resulting array in a DataFrame that reuses the column names.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split below, so test rows influence the scaling — consider fitting on the
# training split only.
scaler_std = StandardScaler()
std_data = pd.DataFrame(scaler_std.fit_transform(data), columns=data.columns)

# Summary statistics of the standardized columns, rounded to two decimals.
std_data.describe().round(2)

In [None]:
# Features and target taken from the standardized frame.
feature_columns = ['Year', 'Selling_Price', 'Kms_Driven', 'Diesel', 'Petrol',
                   'Seller_Type', 'Transmission']
X_std = std_data[feature_columns]
y_std = std_data['Current_price']

# 70/30 train/test split. A fixed random_state makes the shuffle, and with it
# the metrics reported below, reproducible across kernel restarts.
X_train_std, X_test_std, y_train_std, y_test_std = train_test_split(
    X_std, y_std, test_size=0.3, random_state=42
)

In [None]:
# Fit a linear regression on the standardized training split and evaluate it
# on the held-out split.
lr = LinearRegression().fit(X_train_std, y_train_std)
pred_std = lr.predict(X_test_std)

# Both metrics are computed on the standardized target, so the MAE is not
# expressed in the original price units.
score_std = r2_score(y_test_std, pred_std)
mae_std = mean_absolute_error(y_test_std, pred_std)

print(f"R2 score:{score_std}")
print(f'Mean absolute error is {mae_std}')

In [None]:
# Pairwise correlation heatmap of the standardized columns, with the colour
# scale pinned to the full correlation range [-1, 1].
corr = std_data.corr()
labels = std_data.columns
ticks = np.arange(len(labels))

fig, ax = vis.subplots(figsize=(5, 5))
heatmap = ax.matshow(corr, cmap='inferno', vmin=-1, vmax=1)
fig.colorbar(heatmap)

# One tick per column on both axes; x tick labels are rotated to stay legible.
ax.set_xticks(ticks)
vis.xticks(rotation=90)
ax.set_yticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
vis.show()

In [None]:
# Scatter of the actual test targets (x-axis) against the model predictions
# (y-axis), both in standardized units.
vis.scatter(y_test_std,pred_std)
vis.show()