In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from warnings import filterwarnings
filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor
import re

In [None]:
# Load the automobile dataset; literal "?" entries are parsed as missing values (NaN).
data=pd.read_csv("Automobile_data.csv",na_values=["?"])

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
data.info()

In [None]:
# Preview of the first rows.
data.head()

In [None]:
# Independent copy of the loaded frame (`df` is re-assigned again further down).
df=data.copy()
df.info()

In [None]:
# Pairwise relationship plots, with KDE curves on the diagonal and '+' markers.
sns.pairplot(data, diag_kind='kde', markers='+')

In [None]:
# For every numeric column draw three side-by-side views of its distribution:
# histogram, box plot and filled KDE curve.
for i in data._get_numeric_data().columns:
    fig, (hist_ax, box_ax, kde_ax) = plt.subplots(nrows=1, ncols=3, figsize=(20, 4))
    column_values = data[i]

    sns.histplot(column_values, bins=10, ax=hist_ax)
    sns.boxplot(column_values, ax=box_ax)
    sns.kdeplot(column_values, ax=kde_ax, fill=True)

    # Every panel is titled with the column name.
    for panel in (hist_ax, box_ax, kde_ax):
        panel.set_title(i)
    plt.show()

# Null Values

In [None]:
# Rows without a target value cannot be used for training: drop them and
# renumber the index in one re-runnable step. `reset_index(drop=True)` avoids
# creating a temporary "index" column that then has to be deleted again.
data = data.dropna(subset=["price"]).reset_index(drop=True)

In [None]:
# Percentage of missing values per column.
data.isnull().sum()/len(data)*100
# normalized-losses is missing in about 18% of the rows, so the column is dropped in the next cell

In [None]:
# Remove the column with too many missing values.
data.drop("normalized-losses",axis=1,inplace=True)

In [None]:
# Confirm the column is gone and re-check the non-null counts.
data.info()

In [None]:
# Frequency of each door-count label before it is encoded numerically.
data['num-of-doors'].value_counts()

In [None]:
# Encode the door-count labels as numbers. The result is assigned back to the
# column: calling replace(..., inplace=True) on the `data['num-of-doors']`
# selection is chained assignment, which leaves `data` unchanged under pandas
# Copy-on-Write (and emits a warning in pandas 2.x).
data['num-of-doors'] = data['num-of-doors'].replace({"four":4,'two':2})

In [None]:
# Fill the missing door counts with the most frequent value. Assigned back to
# the column instead of fillna(..., inplace=True) on the selection, which is
# chained assignment and does not modify `data` under pandas Copy-on-Write.
data['num-of-doors'] = data['num-of-doors'].fillna(data['num-of-doors'].mode()[0])

In [None]:
# Re-check dtypes and non-null counts after handling num-of-doors.
data.info()

In [None]:
# Preview of the first rows after the changes above.
data.head()

In [None]:
# Impute the remaining gaps: column mean for horsepower / bore / stroke and the
# most frequent value for peak-rpm. Each result is assigned back to its column;
# fillna(..., inplace=True) on a `data[...]` selection is chained assignment and
# does not modify `data` under pandas Copy-on-Write.
data["horsepower"] = data["horsepower"].fillna(data["horsepower"].mean())
data["bore"] = data["bore"].fillna(data["bore"].mean())
data["stroke"] = data["stroke"].fillna(data["stroke"].mean())
data["peak-rpm"] = data["peak-rpm"].fillna(data["peak-rpm"].mode()[0])

In [None]:
# Number of missing values left in each column.
data.isnull().sum()
# Author's observation: no missing values remain after the imputation steps

# Outliers

In [None]:
# Checkpoint copy of the frame before the outlier handling below.
df=data.copy()

In [None]:
# Names of the numeric columns (relies on the private pandas helper `_get_numeric_data`).
num_cols=data._get_numeric_data().columns

In [None]:
# One box plot per numeric column to look for outliers.
for col in num_cols:
    sns.boxplot(data[col])
    plt.show()

In [None]:
# As it seems, there are plenty of outliers
# Two functions for managing outliers
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """Compute the outlier fences of one column.

    The fences sit 1.5 times the inter-quantile range below the `q1`
    quantile and above the `q3` quantile of ``dataframe[col_name]``.

    Parameters
    ----------
    dataframe : object indexable by ``col_name`` whose result has ``.quantile``
    col_name : name of the column to analyse
    q1, q3 : lower and upper quantile levels (defaults 0.25 and 0.75)

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``
    """
    lower_q, upper_q = (dataframe[col_name].quantile(level) for level in (q1, q3))
    spread = upper_q - lower_q
    return lower_q - 1.5 * spread, upper_q + 1.5 * spread

def replace_with_thresholds(dataframe, col_name,up=0.25, low=0.75):
    """Clip the outliers of one column to its fences, modifying ``dataframe``.

    Values below the lower fence are set to the lower fence and values above
    the upper fence are set to the upper fence. Both sides are always clipped;
    previously the lower side was skipped whenever the lower fence was not
    positive, so negative outliers were left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clip; it is modified in place and also returned.
    col_name : str
        Column whose outliers are replaced.
    up : float, default 0.25
        Despite its name this is the LOWER quantile level: it is forwarded to
        ``outlier_thresholds`` as ``q1``. The name is kept so existing keyword
        calls keep working.
    low : float, default 0.75
        Despite its name this is the UPPER quantile level (forwarded as ``q3``).

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name, q1=up, q3=low)
    # Comparisons against a NaN fence are all False, so an empty or all-NaN
    # column is simply left unchanged.
    dataframe.loc[(dataframe[col_name] < low_limit), col_name] = low_limit
    dataframe.loc[(dataframe[col_name] > up_limit), col_name] = up_limit
    return dataframe

In [None]:
# Replace the outliers of every numeric column except the target. 'price' is
# excluded by name instead of relying on it being the last numeric column.
for col in (name for name in num_cols if name != "price"):
    data=replace_with_thresholds(data,col)

In [None]:
# Box plots of the numeric columns again, to inspect the result of the outlier replacement.
for col in num_cols:
    sns.boxplot(data[col])
    plt.show()

In [None]:
# Names of the object-dtype (categorical) columns.
obj_cols=data.select_dtypes("object").columns
obj_cols

In [None]:
# Number of cars per manufacturer.
data["make"].value_counts()

In [None]:
# Frequency of every category, one object column at a time.
for col in obj_cols:
    print(data[col].value_counts())
    print("################")
    
# Author's observation: some categories make up less than 5 percent of their column

# Correlation

In [None]:
# Correlation matrix of the numeric (and boolean) columns of `data`.
# The columns are selected explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises on strings.
corr=data.select_dtypes(include=["number","bool"]).corr()

In [None]:
# Keep only the rows (features) whose absolute correlation with price is above 0.5;
# all columns of the correlation table are kept.
corr=corr[abs(corr['price'])>0.5]
corr

In [None]:
# Heat map of the filtered correlation table (features most correlated with price)
f, ax = plt.subplots(figsize= [20,13])
sns.heatmap(corr, annot=True, fmt=".2f", ax=ax, cmap = "magma" )
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()

# ML

In [None]:
# Checkpoint copy of the cleaned frame before the modelling preparation.
df=data.copy()

In [None]:
# Disabled restore step: un-comment to reset `data` from the checkpoint above.
#data=df.copy()

In [None]:
# Summary statistics, one row per numeric column.
data.describe().T
# Price column has a very large std compared to its mean
# So the next cell clips the price outliers to the IQR fences (no rescaling is applied)

In [None]:
# Clip the outliers of the target column as well.
data=replace_with_thresholds(data,"price")

In [None]:
# Summary statistics again, after clipping price.
data.describe().T

In [None]:
# Checkpoint copy taken before the one-hot encoding.
df=data.copy()

In [None]:
# One-hot encode the categorical columns; `data` is replaced by the encoded frame.
data=pd.get_dummies(data)

# Modeling

In [None]:
from sklearn.metrics import mean_squared_error,r2_score,accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
def reg_modeling(df, Y, algo):
    """Fit one regressor on an 80/20 split and report its hold-out metrics.

    Parameters
    ----------
    df : frame holding the features and the target column
    Y : name of the target column
    algo : estimator exposing ``fit``, ``predict`` and ``score``

    Returns
    -------
    tuple
        ``(estimator class name, RMSE, estimator score, r2_score)`` measured
        on the test split.
    """
    features = df.drop(columns=Y)
    target = df[[Y]]
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, target, test_size=0.20, random_state=42
    )

    fitted = algo.fit(X_train, Y_train)
    predictions = algo.predict(X_test)
    algo_name = type(fitted).__name__

    rmse = np.sqrt(mean_squared_error(Y_test, predictions))
    own_score = algo.score(X_test, Y_test)
    r2 = r2_score(Y_test, predictions)

    print(f'{algo_name}\n---Error: {rmse}\n---Algo_Score: {own_score}\n---R2_Score: {r2}')
    print('----------------------------------------------------')
    return algo_name, rmse, own_score, r2

In [None]:
models=[LinearRegression(),Ridge(),Lasso(),ElasticNet()]
results={ 'model_name':[], 'Error':[],"Model_Score":[],"R2_Score":[]}

In [None]:
for m in models:
    res=reg_modeling(data,"price",m)
    results['model_name'].append(res[0])
    results["Error"].append(res[1])
    results["Model_Score"].append(res[2])
    results["R2_Score"].append(res[3])

In [None]:
# Search space for the grid search: display name -> estimator + parameter grid.
model_parameters={
    'Ridge' : {
        'model': Ridge(),
        'params': {'alpha': [0.1, 0.01, 0.005, 0.05, 0.001 ,0.2, 0.3, 0.5, 0.8, 0.9, 1], 
                   'solver': ['auto', 'svd', 'cholesky']}},
    'Lasso': {
        'model': Lasso(),
        'params': {'selection': ['cyclic', 'random'], 
                  'alpha': [0.1,0.01, 0.005, 0.05, 0.001 ,0.2, 0.3, 0.5, 0.8, 0.9, 1]}},
    'Decision Tree': {
        'model': DecisionTreeRegressor(),
        # 'mse' and 'mae' were renamed to 'squared_error' and 'absolute_error'
        # in scikit-learn 1.0 and the old spellings were removed in 1.2.
        'params': {'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                   'splitter': ['best', 'random']}}, 
    'KNeighbors': {
        'model': KNeighborsRegressor(), 
        'params': {'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 
                  'weights': ['uniform', 'distance'],
                  'n_neighbors': [3, 4, 5]}},
    'ElasticNet': {
        'model': ElasticNet(), 
        'params': {'alpha': [0.1,0.01, 0.005, 0.05, 0.001 ,0.2, 0.3, 0.5, 0.8, 0.9, 1],
                  'selection': ['cyclic', 'random']}}
}

In [None]:
def model_tunings(df,Y, model_params):
    """Grid-search every configured model and report its hold-out metrics.

    The frame is split once (75% train / 25% test, ``random_state=42``). Each
    entry of ``model_params`` is tuned with a 5-fold ``GridSearchCV`` on the
    training part and evaluated on the test part.

    Parameters
    ----------
    df : frame holding the features and the target column
    Y : name of the target column
    model_params : dict
        Maps a display name to ``{'model': estimator, 'params': param_grid}``.

    Returns
    -------
    list of dict
        One dict per model with the keys ``'model'``, ``'Test Error'`` (RMSE),
        ``'Test Score'``, ``'best_score'`` and ``'best_params'``.
    """
    X=df.drop(Y, axis=1)
    Y=df[[Y]]
    results=[]
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
    for model_name, mp in model_params.items():
        # Train-fold scores and train-set predictions were computed before but
        # never reported, so they are no longer requested.
        clf=GridSearchCV(mp['model'], mp['params'], cv=5)
        clf.fit(X_train, Y_train)
        Y_test_pred=clf.predict(X_test)
        test_error=np.sqrt(mean_squared_error(Y_test, Y_test_pred))
        test_score=clf.score(X_test, Y_test)
        r2=r2_score(Y_test,Y_test_pred)

        print(model_name)
        print('Test Rmse : {}'.format(test_error))
        print('Test Score: {}'.format(test_score))
        print("R2_score  : {}".format(r2))
        print('------------------------------------------------------')
        results.append({'model': model_name,
                        'Test Error': test_error,
                        'Test Score': test_score,
                        'best_score': clf.best_score_,
                        'best_params': clf.best_params_})
    return results

In [None]:
# Tune every model in `model_parameters` on the encoded frame; `tunings` holds one result dict per model.
tunings=model_tunings(data,'price',model_parameters)

In [None]:
def light_gbm(df, Y):
    """Fit a default LGBMRegressor on an 80/20 split and print its test metrics.

    Parameters
    ----------
    df : frame holding the features and the target column
    Y : name of the target column

    Returns
    -------
    The predictions for the test split.
    """
    # Column-name sanitising is kept disabled, as before:
    #df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
    features = df.drop(columns=Y)
    target = df[[Y]]
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, target, random_state=42, test_size=0.20
    )

    regressor = LGBMRegressor()
    regressor.fit(X_train, Y_train)
    predictions = regressor.predict(X_test, num_iteration=regressor.best_iteration_)

    print("Light_Score:: ", regressor.score(X_test, Y_test))
    print("Error:: ", np.sqrt(mean_squared_error(Y_test, predictions)))
    print("R2_Score:: ", r2_score(Y_test, predictions))
    return predictions

In [None]:
# Fit the default LightGBM model on the encoded frame and keep its test-split predictions.
Y_pred=light_gbm(data,"price")

In [None]:
def light_gbm_tuning(df, Y):
    """Grid-search LGBMRegressor hyper-parameters with 10-fold cross-validation.

    The search runs on the training part of an 80/20 split
    (``random_state=42``); the test part is not used here.

    Parameters
    ----------
    df : frame holding the features and the target column
    Y : name of the target column

    Returns
    -------
    tuple
        ``(best_params_, best_score_)`` of the fitted grid search.
    """
    # Column-name sanitising is kept disabled, as before:
    #df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
    features = df.drop(columns=Y)
    target = df[[Y]]
    X_train, _, Y_train, _ = train_test_split(
        features, target, random_state=42, test_size=0.20
    )

    search_space = {
        'colsample_bytree': [0.4, 0.5, 0.6, 0.9, 1],
        'learning_rate': [0.01, 0.1, 0.5, 1],
        'n_estimators': [20, 40, 100, 200, 500, 1000],
        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
    }
    search = GridSearchCV(LGBMRegressor(), search_space, cv=10, n_jobs=-1, verbose=2)
    search.fit(X_train, Y_train)

    best_params, best_score = search.best_params_, search.best_score_
    print("Parameters:: ", best_params)
    print("Score::      ", best_score)
    return best_params, best_score

In [None]:
# Run the LightGBM grid search; keeps the best parameter set and the best_score_ it achieved.
params,light_score=light_gbm_tuning(data,"price")

In [None]:
# Display the best grid-search score.
light_score

In [None]:
# Visual separator cell (no code).
##############################

In [None]:
# Visual separator cell (no code).
##############################

Best result so far:

model: ElasticNet (GridSearch CV tuning)    

***Test Rmse : 1531.7349767798682***                                                                                           

***Test Score: 0.9674273751102496***       

***R2_score  : 0.9674273751102496***
