****
**`Import Necessary Libraries:`**

In [15]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

****
**`Load and Prepare Data:`**

In [37]:
len(data.columns)

39

In [16]:
data = pd.read_csv("data_for_model.csv")

X = data.drop(columns=['price'])  # Drop the target column to get features
y = data['price']  # Select the target column as y

In [17]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4832, 38)
(1209, 38)
(4832,)
(1209,)


****
**`Define Pipeline and Grid Search:`**

In [19]:
from xgboost import XGBRegressor

# Define pipelines for different models
pipelines = {
    'linear_regression': Pipeline([('scaler', StandardScaler()), ('regressor', LinearRegression())]),
    'decision_tree': Pipeline([('scaler', StandardScaler()), ('regressor', DecisionTreeRegressor())]),
    'random_forest': Pipeline([('scaler', StandardScaler()), ('regressor', RandomForestRegressor())]),
    'xgboost': Pipeline([('scaler', StandardScaler()), ('regressor', XGBRegressor())])
}

# Define parameter grids for hyperparameter tuning
param_grids = {
    'linear_regression': {},
    'decision_tree': {'regressor__max_depth': [None, 10, 20, 30]},
    'random_forest': {'regressor__n_estimators': [100, 200, 300], 'regressor__max_depth': [None, 10, 20]},
    'xgboost': {'regressor__n_estimators': [100, 200, 300], 'regressor__max_depth': [3, 5, 7]}
}

# Perform Grid Search and find the best model for each pipeline
best_models = {}
for model_name, pipeline in pipelines.items():
    grid_search = GridSearchCV(pipeline, param_grid=param_grids[model_name], cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_


****
**`Evaluating Model`**

In [20]:
X.shape

(6041, 38)

In [21]:
# Evaluate models on test set
evaluation_results = {}
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    evaluation_results[model_name] = {'Mean Squared Error': mse, 'Mean Absolute Error': mae, 'R^2 Score': r2}

# Print evaluation results
for model_name, metrics in evaluation_results.items():
    print(f"Metrics for {model_name}:")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value}")
    print()


Metrics for linear_regression:
Mean Squared Error: 0.1754374113492733
Mean Absolute Error: 0.33609269006521103
R^2 Score: 0.7519380633786792

Metrics for decision_tree:
Mean Squared Error: 0.19919801489661765
Mean Absolute Error: 0.28249542479773937
R^2 Score: 0.7183414588351291

Metrics for random_forest:
Mean Squared Error: 0.15151079434395062
Mean Absolute Error: 0.2713968926923188
R^2 Score: 0.7857694047413295

Metrics for xgboost:
Mean Squared Error: 0.14579921858617476
Mean Absolute Error: 0.2905660137213301
R^2 Score: 0.7938453591956081



**`Finally Model XG Boost comes up with Best R2 Score of 0.80. And I am going to build model on that`**

In [22]:
import pickle

# Identify the best model and its best parameters
best_model_name = max(evaluation_results, key=lambda k: evaluation_results[k]['R^2 Score'])
best_model = best_models[best_model_name]
best_params = best_model.get_params()

# Save the best model and its best parameters to a pickle file
with open('best_model.pkl', 'wb') as f:
    pickle.dump({'model': best_model, 'params': best_params}, f)

In [23]:
best_models[best_model_name]

In [24]:
best_model.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()),
  ('regressor',
   XGBRegressor(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, device=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=None, grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=None, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=3, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                multi_strategy=None, n_estimators=100, n_jobs=None,
                num_parallel_tree=None, random_state=None, ...))],
 'verbose': False,
 'scaler': StandardScaler(),
 'regressor': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_b

In [25]:
best_model_name

'xgboost'

In [34]:
import pickle
import numpy as np
from scipy.stats import boxcox

# Load the trained model
with open('best_model.pkl', 'rb') as f:
    model_data = pickle.load(f)
    best_model = model_data['model']

# Define input features (example values)
kilo_meter = 120000
owner_no = 3
model_year = 2015
registration_year = 2015
fuel_type = 'Petrol'
seats = 5
engine_displacement = 998
transmission = 'Manual'
year_manufacture = 2015
mileage = 23.1
no_of_cylinder = 3
valves_per_cylinder = 4
max_power = 67.04
max_torque = 90
build_type = 'Hatchback'
oem = 'Maruti'
insurance_validity = 'Third Party insurance'

# Convert categorical features to binary
fuel_type_bin = 1 if fuel_type == 'Diesel' else 0
transmission_bin = 1 if transmission == 'Automatic' else 0

# Perform one-hot encoding for categorical features
build_type_encoded = np.array([1 if build_type == b_type else 0 for b_type in ['Hatchback', 'MUV', 'SUV', 'Sedan']])
oem_encoded = np.array([1 if oem == o else 0 for o in ['Honda', 'Maruti', 'Tata', 'Hyundai', 'Chevrolet', 'Toyota','Ford', 'Skoda', 'Volkswagen', 'Nissan', 'Mahindra', 'Renault','Datsun', 'Jeep', 'Kia', 'MG']])
insurance_validity_encoded = np.array([1 if insurance_validity == i else 0 for i in ['Comprehensive', 'Not Available', 'Third Party insurance', 'Zero Dep']])

# Apply Box-Cox transformation to kilometer
Km_transformed_value = boxcox(kilo_meter + 1, lmbda=0.5577997790766174) # Adding 1 to avoid zero values

# Create features array
features1 = np.array([Km_transformed_value, owner_no, model_year, registration_year, fuel_type_bin, seats, engine_displacement, transmission_bin, year_manufacture, mileage, engine, no_of_cylinder, valves_per_cylinder, max_power, max_torque])
features = np.concatenate([features1, build_type_encoded, oem_encoded, insurance_validity_encoded])

# Perform prediction
predicted_price = best_model.predict(features.reshape(1, -1))

# Display predicted price
print('Predicted Car Price:', predicted_price)



ValueError: X has 39 features, but StandardScaler is expecting 38 features as input.

In [35]:
X.columns

Index(['Kilo Meter', 'ownerNo', 'modelYear', 'Registration Year', 'Fuel Type',
       'Seats', 'Engine Displacement', 'Transmission', 'Year of Manufacture',
       'Mileage', 'No of Cylinder', 'Valves per Cylinder', 'Max Power(bhp)',
       'Max Torque(Nm)', 'Build Type_Hatchback', 'Build Type_MUV',
       'Build Type_SUV', 'Build Type_Sedan', 'OEM_Chevrolet', 'OEM_Datsun',
       'OEM_Ford', 'OEM_Honda', 'OEM_Hyundai', 'OEM_Jeep', 'OEM_Kia', 'OEM_MG',
       'OEM_Mahindra', 'OEM_Maruti', 'OEM_Nissan', 'OEM_Renault', 'OEM_Skoda',
       'OEM_Tata', 'OEM_Toyota', 'OEM_Volkswagen',
       'Insurance Validity_Comprehensive', 'Insurance Validity_Not Available',
       'Insurance Validity_Third Party insurance',
       'Insurance Validity_Zero Dep'],
      dtype='object')

In [29]:
data['Kilo Meter'].max()

1391

In [None]:
a=0.55

In [33]:
from scipy.stats import boxcox

# Single value to transform
value_to_transform = 120000

# Known lambda value
lambda_value = 0.555

# Apply Box-Cox transformation
transformed_value = boxcox(value_to_transform + 1, lmbda=lambda_value)  # Adding 1 to avoid zero values

print('Transformed Value:', transformed_value)


Transformed Value: 1185.7556168580975
