In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline                               # to build data preprocessing pipeline
from sklearn.impute import SimpleImputer                            # to impute missing values in data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler                    # to scale numerical value
from sklearn.model_selection import train_test_split                # to split data randomly into test and train part
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder     # to convert categorical feature into numerical feature

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, SGDRegressor


def rmse(y, y_hat):
    """Return the root-mean-squared error between ``y`` and ``y_hat``.

    ``y`` holds the true target values and ``y_hat`` the predictions; both are
    handed unchanged to ``mean_squared_error`` and the square root of that
    result is returned.
    """
    mse = mean_squared_error(y, y_hat)
    return np.sqrt(mse)

def model_report(models, X_train, X_test, y_train, y_test):
    """Fit every model and print a train/test report for each one.

    Parameters
    ----------
    models : iterable of (name, estimator) pairs
        Each estimator is fitted in place on the training data.
    X_train, X_test : feature matrices passed to ``fit`` / ``predict``.
    y_train, y_test : matching target values.

    For each model the report shows the RMSE on both splits followed by the
    lines labelled "Accuracy", which are the R^2 scores from ``r2_score``.
    Nothing is returned; the report is written to stdout.
    """
    separator = '_' * 80
    for label, estimator in models:
        estimator.fit(X_train, y_train)
        train_pred = estimator.predict(X_train)
        test_pred = estimator.predict(X_test)

        # Header: rule line plus the model name centred in 80 columns.
        print(separator)
        print(f'Report For {label}'.center(80))
        print()

        # Error section.
        print(f'Training RMSE Error : {rmse(y_train, train_pred):.2f}')
        print(f'Test RMSE Error : {rmse(y_test, test_pred):.2f}')
        print()

        # Goodness-of-fit section (R^2).
        print(f'Training Accuracy :  {r2_score(y_train, train_pred):.2f}')
        print(f'Test Accuracy : {r2_score(y_test, test_pred):.2f}')
        print('\n\n')

def load_data(path='CarPrice_Assignment.csv'):
    """Load the car-price CSV and split it into features and target.

    A derived ``horse_engine`` column (``enginesize + horsepower``) is added
    before the split, so it is part of the returned features.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to ``'CarPrice_Assignment.csv'``
        so that existing ``load_data()`` calls keep reading the same file.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the file except ``price``, plus ``horse_engine``.
    y : pandas.Series
        The ``price`` column.
    """
    car = pd.read_csv(path)
    car['horse_engine'] = car['enginesize'] + car['horsepower']

    # drop() already returns a new DataFrame, so no extra .copy() is needed.
    X = car.drop(columns=['price'])
    y = car['price']
    return X, y


if __name__ == '__main__':

    # Randomly split the data into train & test sets (seeded, so the split
    # itself is the same on every run).
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

    # Column groups, one per preprocessing strategy. Columns of X that are not
    # listed in any group are dropped by the ColumnTransformer below
    # (its default ``remainder`` is 'drop').
    num_features = ['symboling', 'wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight', 'enginesize',
                    'boreratio', 'stroke', 'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
                    'horse_engine']

    ordinal_feature = ['carbody', 'enginetype', 'cylindernumber', 'fuelsystem']
    onehotencoding_features = ['fueltype', 'aspiration', 'doornumber', 'drivewheel', 'enginelocation']

    # Numerical columns: fill missing values with the median, then standardise.
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),   # T1
        ('standard scaler', StandardScaler())            # T2
    ])

    # NOTE(review): with its default settings OrdinalEncoder raises on a
    # category that was not seen during fit; a rare level that lands only in
    # the test split would therefore abort transform().
    ordinal_pipeline = OrdinalEncoder()

    # handle_unknown='ignore' encodes a category unseen during fit as an
    # all-zero row instead of raising when the test set is transformed.
    onehot_pipeline = OneHotEncoder(handle_unknown='ignore')

    final_pipeline = ColumnTransformer([
        ('numerical_pipeline', num_pipeline, num_features),                       # name, Transform, column / feature list
        ('ordinal_pipeline', ordinal_pipeline, ordinal_feature),
        ('onehot_pipeline', onehot_pipeline, onehotencoding_features),
        # ('remove features', 'drop', drop_features),
        # ('pass through', 'passthrough', pass_through)
    ])

    # Fit the preprocessing on the training split only, then reuse the fitted
    # transformers on the test split so no test statistics leak into training.
    X_train_tr = final_pipeline.fit_transform(X_train)

    X_test_tr = final_pipeline.transform(X_test)

    # NOTE: the tree and forest models are created without random_state, so
    # their reported numbers can differ from one run to the next.
    models = [
        #('Linear Regression', LinearRegression()),
        #('SGD Regresser', SGDRegressor(max_iter=10000)),
        ('Decision Tree', DecisionTreeRegressor(max_depth=7, min_samples_split=10)),
        #('Support Vector Machines', SVR(kernel='linear')),
        ('Random Forest', RandomForestRegressor(max_depth=9)),
        ('K-Nearest Neighbors', KNeighborsRegressor(n_neighbors=5))
    ]

    model_report(models, X_train_tr, X_test_tr, y_train, y_test)

________________________________________________________________________________
                            Report For Decision Tree                            

Training RMSE Error : 1351.17
Test RMSE Error : 2005.15

Training Accuracy :  0.97
Test Accuracy : 0.90



________________________________________________________________________________
                            Report For Random Forest                            

Training RMSE Error : 902.98
Test RMSE Error : 1292.03

Training Accuracy :  0.99
Test Accuracy : 0.96



________________________________________________________________________________
                         Report For K-Nearest Neighbors                         

Training RMSE Error : 2724.93
Test RMSE Error : 2799.84

Training Accuracy :  0.89
Test Accuracy : 0.80



