In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.compose import ColumnTransformer
from predict_function import predict

cars = pd.read_csv('CarPrice_Assignment.csv')


X = cars.drop('price', axis=1).copy()
y = cars.price

X.drop(['car_ID', 'symboling', 'stroke', 'compressionratio', 'peakrpm', 'CarName'], axis=1, inplace=True)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)


numerical_features = list(X.select_dtypes(['int', 'float']).columns)
nom_features = ['fueltype', 'aspiration', 'enginelocation', 'cylindernumber', 'fuelsystem']
ord_features = ['doornumber', 'carbody', 'drivewheel', 'enginetype']


num_pipline = Pipeline([
    ('impute', SimpleImputer()),
    ('scale', StandardScaler())
])

ord_pipeline = Pipeline([
    ('ordinal_encoder', OrdinalEncoder())
])

nom_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(sparse_output=False))
])


final_pipeline = ColumnTransformer([
    ('num_pipeline', num_pipline, numerical_features),
    ('ord_pipeline', ord_pipeline, ord_features),
    ('nom_pipeline', nom_pipeline, nom_features)
])


final_pipeline.fit(X_train)
X_train_tr = final_pipeline.transform(X_train)
X_test_tr = final_pipeline.transform(X_test)

models = [
    ('linear_regression', LinearRegression()),
    ('SGD', SGDRegressor()),
    ('Random_forest', RandomForestRegressor()),
    ('Tree', DecisionTreeRegressor()),
    ('neighbores', KNeighborsRegressor()),
    ('SVM', SVR())
]

predict(models, X_train_tr, X_test_tr, y_train, y_test)

linear_regression
Training error: 1890.47
Training accuracy: 0.88
____________________________________________________________________________________________________
Testing error: 1554.82
Testing accuracy: 0.89

SGD
Training error: 2393.25
Training accuracy: 0.84
____________________________________________________________________________________________________
Testing error: 1785.89
Testing accuracy: 0.84

Random_forest
Training error: 625.80
Training accuracy: 0.99
____________________________________________________________________________________________________
Testing error: 1003.06
Testing accuracy: 0.95

Tree
Training error: 70.15
Training accuracy: 1.00
____________________________________________________________________________________________________
Testing error: 1247.05
Testing accuracy: 0.94

neighbores
Training error: 1723.98
Training accuracy: 0.90
____________________________________________________________________________________________________
Testing error: 136

In [2]:
X_train_tr

array([[ 0.84689794,  1.11336524,  0.53279808, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.43806434,  0.9445443 ,  1.08350534, ...,  0.        ,
         0.        ,  0.        ],
       [-0.76238838, -0.75170413, -1.02753916, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.94542567,  0.00397053,  0.02798309, ...,  0.        ,
         0.        ,  0.        ],
       [-1.58345282, -0.48641409, -0.4768319 , ...,  1.        ,
         0.        ,  0.        ],
       [ 2.47260554,  1.92531354,  1.08350534, ...,  1.        ,
         0.        ,  0.        ]])

In [3]:
X_test_tr

array([[-0.46680518, -0.20504587, -0.29326281, ...,  0.        ,
         1.        ,  0.        ],
       [-0.41754131,  0.43004241,  2.04724305, ...,  1.        ,
         0.        ,  0.        ],
       [ 2.47260554,  1.92531354,  1.08350534, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.20646767,  0.65513699,  0.39512127, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.84689794,  1.11336524,  0.53279808, ...,  1.        ,
         0.        ,  0.        ],
       [-0.99228642, -0.65523503, -0.84397007, ...,  0.        ,
         0.        ,  0.        ]])

In [4]:
final_pipeline

In [5]:
X_train_tr

array([[ 0.84689794,  1.11336524,  0.53279808, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.43806434,  0.9445443 ,  1.08350534, ...,  0.        ,
         0.        ,  0.        ],
       [-0.76238838, -0.75170413, -1.02753916, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.94542567,  0.00397053,  0.02798309, ...,  0.        ,
         0.        ,  0.        ],
       [-1.58345282, -0.48641409, -0.4768319 , ...,  1.        ,
         0.        ,  0.        ],
       [ 2.47260554,  1.92531354,  1.08350534, ...,  1.        ,
         0.        ,  0.        ]])

In [6]:
final_pipeline.named_transformers_

{'num_pipeline': Pipeline(steps=[('impute', SimpleImputer()), ('scale', StandardScaler())]),
 'ord_pipeline': Pipeline(steps=[('ordinal_encoder', OrdinalEncoder())]),
 'nom_pipeline': Pipeline(steps=[('one_hot_encoder', OneHotEncoder(sparse_output=False))])}

In [7]:
final_pipeline.get_feature_names_out()


array(['num_pipeline__wheelbase', 'num_pipeline__carlength',
       'num_pipeline__carwidth', 'num_pipeline__carheight',
       'num_pipeline__curbweight', 'num_pipeline__enginesize',
       'num_pipeline__boreratio', 'num_pipeline__horsepower',
       'num_pipeline__citympg', 'num_pipeline__highwaympg',
       'ord_pipeline__doornumber', 'ord_pipeline__carbody',
       'ord_pipeline__drivewheel', 'ord_pipeline__enginetype',
       'nom_pipeline__fueltype_diesel', 'nom_pipeline__fueltype_gas',
       'nom_pipeline__aspiration_std', 'nom_pipeline__aspiration_turbo',
       'nom_pipeline__enginelocation_front',
       'nom_pipeline__enginelocation_rear',
       'nom_pipeline__cylindernumber_eight',
       'nom_pipeline__cylindernumber_five',
       'nom_pipeline__cylindernumber_four',
       'nom_pipeline__cylindernumber_six',
       'nom_pipeline__cylindernumber_three',
       'nom_pipeline__cylindernumber_twelve',
       'nom_pipeline__cylindernumber_two',
       'nom_pipeline__fuelsys

In [8]:
final_pipeline.named_transformers_.nom_pipeline.get_feature_names_out()

array(['fueltype_diesel', 'fueltype_gas', 'aspiration_std',
       'aspiration_turbo', 'enginelocation_front', 'enginelocation_rear',
       'cylindernumber_eight', 'cylindernumber_five',
       'cylindernumber_four', 'cylindernumber_six',
       'cylindernumber_three', 'cylindernumber_twelve',
       'cylindernumber_two', 'fuelsystem_1bbl', 'fuelsystem_2bbl',
       'fuelsystem_4bbl', 'fuelsystem_idi', 'fuelsystem_mfi',
       'fuelsystem_mpfi', 'fuelsystem_spdi', 'fuelsystem_spfi'],
      dtype=object)

In [9]:
cols = numerical_features + ord_features + list(final_pipeline.named_transformers_.nom_pipeline.get_feature_names_out())
print(cols)

['wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight', 'enginesize', 'boreratio', 'horsepower', 'citympg', 'highwaympg', 'doornumber', 'carbody', 'drivewheel', 'enginetype', 'fueltype_diesel', 'fueltype_gas', 'aspiration_std', 'aspiration_turbo', 'enginelocation_front', 'enginelocation_rear', 'cylindernumber_eight', 'cylindernumber_five', 'cylindernumber_four', 'cylindernumber_six', 'cylindernumber_three', 'cylindernumber_twelve', 'cylindernumber_two', 'fuelsystem_1bbl', 'fuelsystem_2bbl', 'fuelsystem_4bbl', 'fuelsystem_idi', 'fuelsystem_mfi', 'fuelsystem_mpfi', 'fuelsystem_spdi', 'fuelsystem_spfi']


In [10]:
pd.DataFrame(X_train_tr, columns=cols)

Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,horsepower,citympg,highwaympg,...,cylindernumber_twelve,cylindernumber_two,fuelsystem_1bbl,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,0.846898,1.113365,0.532798,1.507432,0.852706,0.266108,1.564788,0.152993,-0.099459,-0.330910,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.438064,0.944544,1.083505,1.182732,1.257531,0.522823,1.274447,-0.308057,0.510311,0.402458,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.762388,-0.751704,-1.027539,-0.197246,-1.102019,-0.760755,-0.721648,-0.938968,0.967638,0.989153,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.121958,0.100440,-0.201478,-0.724884,-0.093813,0.382797,0.984106,0.201525,-0.099459,-0.037563,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.302592,-0.261320,-0.247371,0.776856,-0.647073,-0.760755,-1.229745,-1.351487,1.882293,2.309215,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,1.782911,1.772571,2.230812,1.020381,1.745248,1.853077,0.984106,1.803068,-1.471442,-1.504299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
160,-0.663861,-1.008955,-1.027539,-0.197246,-1.140573,-0.760755,-0.721648,-0.938968,0.967638,0.989153,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
161,0.945426,0.003971,0.027983,0.249218,0.193420,0.102743,0.294546,-0.866171,0.967638,1.282500,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
162,-1.583453,-0.486414,-0.476832,-0.887234,0.301373,1.503011,1.419617,2.409713,-1.166557,-0.770931,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
