<a href="https://colab.research.google.com/github/ACExpo/AI/blob/main/AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# imports
# Follow python style guidelines (pep-8) on the order of imports (https://pep8.org/#imports)

import os
import warnings
import zipfile
from io import StringIO
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split


In [None]:
df = pd.read_csv('data.csv')

In [None]:
le = LabelEncoder()
le.fit(df['model'])
le.transform(df['model'])
df['model'] = le.transform(df['model'])

In [None]:
X = df[['age', 'engineSize',
        'Automatic', 'Manual', 'Semi-Auto', 'Diesel',
        'Petrol']]

In [None]:
y = df["price"]

In [None]:
age_base_prediction = df.groupby("age").mean()

In [None]:
age_base_prediction = age_base_prediction["price"]

In [None]:
age_base_prediction

age
0     23800.058239
1     21114.948417
2     15477.629908
3     14039.964181
4     13110.853662
5     11476.681418
6      9851.019645
7      8429.079869
8      7448.929496
9      6447.544681
10     5187.675824
Name: price, dtype: float64

In [None]:
baseline = df.copy()
baseline.head()

Unnamed: 0.1,Unnamed: 0,model,age,engineSize,price,Automatic,Manual,Semi-Auto,Diesel,Petrol
0,0,100,3,1.4,10550,0,1,0,0,1
1,1,41,2,1.0,8200,0,1,0,0,1
2,2,100,1,2.0,15650,1,0,0,1,0
3,3,161,5,2.0,14000,1,0,0,1,0
4,4,132,1,1.5,18350,0,1,0,0,1


In [None]:
baseline = baseline.merge(age_base_prediction, left_on=df.age, right_on=age_base_prediction.index)

In [None]:
baseline

Unnamed: 0.1,key_0,Unnamed: 0,model,age,engineSize,price_x,Automatic,Manual,Semi-Auto,Diesel,Petrol,price_y
0,3,0,100,3,1.4,10550,0,1,0,0,1,14039.964181
1,3,5,161,3,2.0,13250,1,0,0,1,0,14039.964181
2,3,9,100,3,2.0,18990,0,1,0,0,1,14039.964181
3,3,11,100,3,1.6,8695,0,1,0,1,0,14039.964181
4,3,19,50,3,1.0,11290,0,1,0,0,1,14039.964181
...,...,...,...,...,...,...,...,...,...,...,...,...
99940,10,102223,46,10,2.1,8290,1,0,0,1,0,5187.675824
99941,10,102241,27,10,2.1,7490,1,0,0,1,0,5187.675824
99942,10,102299,46,10,3.0,5995,1,0,0,1,0,5187.675824
99943,10,102302,7,10,2.0,1350,0,1,0,1,0,5187.675824


In [None]:
r2_score(baseline.price_x, baseline.price_y)

0.3452811931304316

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [None]:
X.shape, y.shape

((99945, 7), (99945,))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((79956, 7), (19989, 7), (79956,), (19989,))

In [None]:
reg = LinearRegression().fit(X_train, y_train)

In [None]:
y_hat = reg.predict(X_test)

In [None]:
r2_score(y_test.values, y_hat)

0.7386376581670273

In [None]:
reg.coef_

array([-2.19348019e+03,  7.08957801e+03, -8.99151250e+15, -8.99151250e+15,
       -8.99151250e+15,  4.15823939e+16,  4.15823939e+16])

In [None]:
reg.intercept_

-3.259088143093232e+16

In [None]:
X_train.columns

Index(['age', 'engineSize', 'Automatic', 'Manual', 'Semi-Auto', 'Diesel',
       'Petrol'],
      dtype='object')

In [None]:
from sklearn.linear_model import Ridge, Lasso

In [None]:
reg_lasso = Lasso().fit(X_train, y_train)
reg_ridge = Ridge().fit(X_train, y_train)

In [None]:
y_hat_ridge = reg_ridge.predict(X_test)
y_hat_lasso = reg_lasso.predict(X_test)

In [None]:
r2_score(y_test.values, y_hat_ridge)

0.738712664297503

In [None]:
r2_score(y_test.values, y_hat_lasso)

0.7387203345966815

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
rf = RandomForestRegressor().fit(X_train, y_train)

In [None]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [None]:
rf_predictions = rf.predict(X_test)

In [None]:
r2_score(y_test.values, rf_predictions)

0.8164426290367323

In [None]:
# Save the model to file
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import joblib
from sklearn import tree
joblib.dump(rf_predictions,'cars.joblib')

['cars.joblib']