In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('cardekho_imputated.csv')

# Preview the first rows of the loaded frame.
df.head()

## Feature Engineering

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Summarize column dtypes and non-null counts.
df.info()

In [None]:
# Remove columns that are not used as model inputs.
# ('Unnamed: 0' is presumably a row index saved into the CSV -- TODO confirm.)
# Reassigning instead of inplace=True, together with errors='ignore', keeps
# this cell safe to re-run: a second execution no longer raises KeyError.
# The former axis=1 was redundant because columns= already selects the axis.
df = df.drop(columns=['car_name', 'brand', 'Unnamed: 0'], errors='ignore')

In [None]:
# Preview the frame again after the column drop.
df.head()

In [None]:
# List the distinct values of the 'model' column.
df['model'].unique()

In [None]:
# Bucket the columns by dtype and cardinality and report the size of each bucket.

# A numeric column with at most this many distinct values counts as discrete.
DISCRETE_MAX_UNIQUE = 25

# Non-object dtypes are treated as numeric.
num_features = [col for col, dtype in df.dtypes.items() if dtype != 'O']
print(f"Number of numeric features : {len(num_features)}")

# Object-dtype columns are treated as categorical.
cat_features = [col for col, dtype in df.dtypes.items() if dtype == 'O']
print(f"Number of categorical features : {len(cat_features)}")

# len(unique()) also counts NaN as a value, unlike nunique().
disc_features = [col for col in num_features if len(df[col].unique()) <= DISCRETE_MAX_UNIQUE]
print(f"Number of discrete features : {len(disc_features)}")

# Whatever numeric column is not discrete is considered continuous.
conti_features = [col for col in num_features if col not in disc_features]
print(f"Number of continuous features : {len(conti_features)}")


In [None]:
# Independent and Dependent feature
from sklearn.model_selection import train_test_split
# Target is 'selling_price'; every remaining column is a predictor.
y = df['selling_price']
X = df.drop(columns=['selling_price'])

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Learn the mapping from 'model' values to integer codes, then overwrite the
# column with those codes.
label = LabelEncoder().fit(X['model'])

X['model'] = label.transform(X['model'])

In [None]:
# Preview the predictors after 'model' has been label-encoded.
X.head()

In [None]:
from sklearn.compose import ColumnTransformer

# Columns that get one-hot encoded; every non-object column of X is standardized.
onehot_columns = ['seller_type','fuel_type','transmission_type']
num_features = [col for col, dtype in X.dtypes.items() if dtype != 'O']

scaler = StandardScaler()
# drop='first' removes one indicator column per encoded feature.
onehot_encoder = OneHotEncoder(drop='first')

# Output column order follows the transformer order: encoded columns first,
# then scaled columns, then any untouched (passthrough) columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", onehot_encoder, onehot_columns),
        ("StandardScaler", scaler, num_features),
    ],
    remainder='passthrough',
)

In [None]:
# Split BEFORE fitting the preprocessor, so the scaler statistics and the
# one-hot categories are learned from the training rows only. Fitting on the
# full data first leaks information about the test rows into training.
# X itself is left untouched, so this cell can be re-run safely.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train = preprocessor.fit_transform(X_train)
# NOTE: the encoder uses the default handle_unknown='error', so a category
# that occurs only in the test rows makes transform() raise.
X_test = preprocessor.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [None]:
def evaluate_model(true,predicted):
    """Score predictions against the ground-truth values.

    Args:
        true: Observed target values.
        predicted: Model predictions aligned with ``true``.

    Returns:
        Tuple ``(rmse, mae, r2score)``: root mean squared error, mean
        absolute error and R^2 score, in that order.
    """
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    return rmse, mean_absolute_error(true, predicted), r2_score(true, predicted)
    

In [None]:
# Candidate regressors, all with default hyperparameters. The tree-based
# models are seeded so the reported metrics are reproducible across runs.
models = {
    "LinearRegression" : LinearRegression(),
    "Ridge" : Ridge(),
    "Lasso" : Lasso(),
    "RandomForest" : RandomForestRegressor(random_state=42),
    "DecisionTree" : DecisionTreeRegressor(random_state=42),
    "KNN" : KNeighborsRegressor()
}

# Fit every candidate on the training split and report train/test metrics.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on both splits so over-fitting shows up as a train/test gap.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_rmse, model_train_mae, model_train_r2score = evaluate_model(y_train, y_train_pred)
    model_test_rmse, model_test_mae, model_test_r2score = evaluate_model(y_test, y_test_pred)

    print(model_name)

    print("Model Performance for Training Set")
    print("RMSE : {:.4f}".format(model_train_rmse))
    print("MAE : {:.4f}".format(model_train_mae))
    print("R2 Score  : {:.4f}".format(model_train_r2score))

    print("------------------------------------------")

    print("Model Performance for Test Set")
    print("RMSE : {:.4f}".format(model_test_rmse))
    print("MAE : {:.4f}".format(model_test_mae))
    print("R2 Score  : {:.4f}".format(model_test_r2score))

    print("="*50)

    print("\n")



In [None]:
# hyperparameter tuning

# Search space for the random forest.
# - min_samples_split: integer values must be >= 2, so the former 0 and 1
#   (which make the estimator reject the candidate) are removed.
# - max_features: "auto" is deprecated/removed in current scikit-learn; for a
#   regressor it meant "all features", which is spelled 1.0 (a fraction).
# NOTE(review): "absolute_error" is documented as significantly slower to
# train than "squared_error"; consider dropping it if the search takes too long.
rf_params = {
    "n_estimators" : [50,100,200,300,500,1000],
    "criterion" : ["squared_error", "absolute_error", "friedman_mse", "poisson"],
    "min_samples_split" : [2,3,5,10],
    "max_features" :[5,7,1.0,8],
    "max_depth" : [5,10,15,None,8]
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over rf_params: 100 sampled candidates, 5-fold CV each.
# random_state pins which candidates are sampled so the search is repeatable;
# n_jobs=-1 runs the fits in parallel on all available cores.
randomCV = RandomizedSearchCV(
    estimator=models['RandomForest'],
    param_distributions=rf_params,
    cv=5,
    n_iter=100,
    verbose=True,
    random_state=42,
    n_jobs=-1,
)

randomCV.fit(X_train,y_train)


In [None]:
# Re-train the random forest with the hyperparameters selected by the
# randomized search. Previously this built RandomForestRegressor() with
# defaults, so the search result was never used.
models = {
    "RandomForest" : RandomForestRegressor(random_state=42, **randomCV.best_params_),
}

# Fit the tuned model on the training split and report train/test metrics.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on both splits so over-fitting shows up as a train/test gap.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_rmse, model_train_mae, model_train_r2score = evaluate_model(y_train, y_train_pred)
    model_test_rmse, model_test_mae, model_test_r2score = evaluate_model(y_test, y_test_pred)

    print(model_name)

    print("Model Performance for Training Set")
    print("RMSE : {:.4f}".format(model_train_rmse))
    print("MAE : {:.4f}".format(model_train_mae))
    print("R2 Score  : {:.4f}".format(model_train_r2score))

    print("------------------------------------------")

    print("Model Performance for Test Set")
    print("RMSE : {:.4f}".format(model_test_rmse))
    print("MAE : {:.4f}".format(model_test_mae))
    print("R2 Score  : {:.4f}".format(model_test_r2score))

    print("="*50)

    print("\n")

