## Data Ingestion

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

%matplotlib inline
# Notebook-wide display/plot configuration: render frames without row or
# column truncation, silence all warnings, and apply the seaborn plot style.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
warnings.simplefilter('ignore')
plt.style.use('seaborn-v0_8')

# Load the car details CSV and print a quick overview of it:
# shape, first rows, column dtypes, and missing values per column.
df_main = pd.read_csv('car details v4.csv')
print("Shape: ", df_main.shape)

print("\nSamples:")
sample_rows = df_main.head()
display(sample_rows)

print("\nDatatype:")
df_main.info()

print("\nNaN count:")
nan_counts = df_main.isna().sum()
display(nan_counts)


Shape:  (2059, 20)

Samples:


Unnamed: 0,Make,Model,Price,Year,Kilometer,Fuel Type,Transmission,Location,Color,Owner,Seller Type,Engine,Max Power,Max Torque,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity
0,Honda,Amaze 1.2 VX i-VTEC,505000,2017,87150,Petrol,Manual,Pune,Grey,First,Corporate,1198 cc,87 bhp @ 6000 rpm,109 Nm @ 4500 rpm,FWD,3990.0,1680.0,1505.0,5.0,35.0
1,Maruti Suzuki,Swift DZire VDI,450000,2014,75000,Diesel,Manual,Ludhiana,White,Second,Individual,1248 cc,74 bhp @ 4000 rpm,190 Nm @ 2000 rpm,FWD,3995.0,1695.0,1555.0,5.0,42.0
2,Hyundai,i10 Magna 1.2 Kappa2,220000,2011,67000,Petrol,Manual,Lucknow,Maroon,First,Individual,1197 cc,79 bhp @ 6000 rpm,112.7619 Nm @ 4000 rpm,FWD,3585.0,1595.0,1550.0,5.0,35.0
3,Toyota,Glanza G,799000,2019,37500,Petrol,Manual,Mangalore,Red,First,Individual,1197 cc,82 bhp @ 6000 rpm,113 Nm @ 4200 rpm,FWD,3995.0,1745.0,1510.0,5.0,37.0
4,Toyota,Innova 2.4 VX 7 STR [2016-2020],1950000,2018,69000,Diesel,Manual,Mumbai,Grey,First,Individual,2393 cc,148 bhp @ 3400 rpm,343 Nm @ 1400 rpm,RWD,4735.0,1830.0,1795.0,7.0,55.0



Datatype:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2059 entries, 0 to 2058
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Make                2059 non-null   object 
 1   Model               2059 non-null   object 
 2   Price               2059 non-null   int64  
 3   Year                2059 non-null   int64  
 4   Kilometer           2059 non-null   int64  
 5   Fuel Type           2059 non-null   object 
 6   Transmission        2059 non-null   object 
 7   Location            2059 non-null   object 
 8   Color               2059 non-null   object 
 9   Owner               2059 non-null   object 
 10  Seller Type         2059 non-null   object 
 11  Engine              1979 non-null   object 
 12  Max Power           1979 non-null   object 
 13  Max Torque          1979 non-null   object 
 14  Drivetrain          1923 non-null   object 
 15  Length              1995 non-null   float64


Make                    0
Model                   0
Price                   0
Year                    0
Kilometer               0
Fuel Type               0
Transmission            0
Location                0
Color                   0
Owner                   0
Seller Type             0
Engine                 80
Max Power              80
Max Torque             80
Drivetrain            136
Length                 64
Width                  64
Height                 64
Seating Capacity       64
Fuel Tank Capacity    113
dtype: int64

## Data Cleaning & Preprocessing

In [2]:
# The free-text model name is not used as a feature.
df_main = df_main.drop(columns=["Model"])

# "1198 cc" -> 1198.0 (first run of digits in the string).
# NOTE: Engine is not imputed here, so its missing values remain NaN.
df_main["Engine"] = df_main["Engine"].str.extract(r'(\d+)', expand=False).astype(float)

# "87 bhp @ 6000 rpm" / "109 Nm @ 4500 rpm" -> a magnitude column and an rpm
# column each. The raw text column is dropped once both parts are extracted,
# and missing values in the new columns are replaced by the column median.
for raw_col, value_col, rpm_col in (
    ("Max Power", "Power_bhp", "Power_rpm"),
    ("Max Torque", "Torque_Nm", "Torque_rpm"),
):
    df_main[value_col] = df_main[raw_col].str.extract(r'(\d+\.?\d*)', expand=False).astype(float)
    df_main[rpm_col] = df_main[raw_col].str.extract(r'@ *(\d+)', expand=False).astype(float)
    df_main = df_main.drop(columns=[raw_col])
    for new_col in (value_col, rpm_col):
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form operates on a temporary object and does not
        # update the frame when pandas Copy-on-Write is active.
        df_main[new_col] = df_main[new_col].fillna(df_main[new_col].median())

display(df_main.head())
# Feature groups used by the preprocessing steps.

# Columns holding string categories.
categorical_cols = [
    "Make",
    "Fuel Type",
    "Transmission",
    "Location",
    "Color",
    "Owner",
    "Seller Type",
    "Drivetrain",
]

# Columns holding numeric measurements (including the extracted
# power/torque magnitude and rpm columns).
numeric_cols = [
    "Year",
    "Kilometer",
    "Engine",
    "Length",
    "Width",
    "Height",
    "Seating Capacity",
    "Fuel Tank Capacity",
    "Power_bhp",
    "Power_rpm",
    "Torque_Nm",
    "Torque_rpm",
]

def remove_outliers_iqr(df, cols, lower_q=0.10, upper_q=0.90, whisker=1.5):
    """Drop rows whose values fall outside a quantile-based fence.

    For each column in ``cols`` (processed in order), the ``lower_q`` and
    ``upper_q`` quantiles are computed on the rows that survived the previous
    columns, and only rows inside
    ``[q_low - whisker * (q_high - q_low), q_high + whisker * (q_high - q_low)]``
    (bounds inclusive) are kept.

    Note that the defaults use the 10th/90th percentiles, not the classic
    25th/75th quartiles of an interquartile range; pass ``lower_q=0.25,
    upper_q=0.75`` for the textbook IQR rule.

    Rows with NaN in any of ``cols`` are removed as well, because a NaN value
    never satisfies the range check.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a filtered frame is returned.
    cols : iterable of str
        Numeric column names to filter on.
    lower_q, upper_q : float, default 0.10 and 0.90
        Quantiles (between 0 and 1) that define the reference spread.
    whisker : float, default 1.5
        Multiple of the spread added beyond each quantile.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass the check for every column.

    Raises
    ------
    ValueError
        If the quantiles are not ordered as ``0 <= lower_q <= upper_q <= 1``.
    """
    if not 0.0 <= lower_q <= upper_q <= 1.0:
        raise ValueError(
            f"expected 0 <= lower_q <= upper_q <= 1, got {lower_q} and {upper_q}"
        )

    for col in cols:
        q_low = df[col].quantile(lower_q)
        q_high = df[col].quantile(upper_q)
        spread = q_high - q_low
        lower = q_low - whisker * spread
        upper = q_high + whisker * spread
        # `between` is inclusive on both ends and evaluates to False for NaN.
        df = df[df[col].between(lower, upper)]
    return df
print("Before:", df_main.shape)
# Take an explicit copy of the filtered rows so the column assignments below
# write to an independent frame rather than to a slice of the original.
df_main = remove_outliers_iqr(df_main, numeric_cols).copy()
print("After removing outliers:", df_main.shape)

# Standardise the numeric features.
# NOTE(review): the scaler is fit on every remaining row, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# influence the scaling. Consider fitting on the training rows only.
scaler = StandardScaler()
df_main[numeric_cols] = scaler.fit_transform(df_main[numeric_cols])

# Integer-encode each categorical column. The fitted encoders are kept (keyed
# by column name) so the codes can be mapped back to the original labels or
# applied to new data.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    # Give missing categories an explicit label instead of relying on how
    # LabelEncoder treats NaN mixed with strings.
    df_main[col] = encoder.fit_transform(df_main[col].fillna("Unknown"))
    label_encoders[col] = encoder

display(df_main.head())

Unnamed: 0,Make,Price,Year,Kilometer,Fuel Type,Transmission,Location,Color,Owner,Seller Type,Engine,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity,Power_bhp,Power_rpm,Torque_Nm,Torque_rpm
0,Honda,505000,2017,87150,Petrol,Manual,Pune,Grey,First,Corporate,1198.0,FWD,3990.0,1680.0,1505.0,5.0,35.0,87.0,6000.0,109.0,4500.0
1,Maruti Suzuki,450000,2014,75000,Diesel,Manual,Ludhiana,White,Second,Individual,1248.0,FWD,3995.0,1695.0,1555.0,5.0,42.0,74.0,4000.0,190.0,2000.0
2,Hyundai,220000,2011,67000,Petrol,Manual,Lucknow,Maroon,First,Individual,1197.0,FWD,3585.0,1595.0,1550.0,5.0,35.0,79.0,6000.0,112.7619,4000.0
3,Toyota,799000,2019,37500,Petrol,Manual,Mangalore,Red,First,Individual,1197.0,FWD,3995.0,1745.0,1510.0,5.0,37.0,82.0,6000.0,113.0,4200.0
4,Toyota,1950000,2018,69000,Diesel,Manual,Mumbai,Grey,First,Individual,2393.0,RWD,4735.0,1830.0,1795.0,7.0,55.0,148.0,3400.0,343.0,1400.0


Categorical Features: ['Make', 'Fuel Type', 'Transmission', 'Location', 'Color', 'Owner', 'Seller Type', 'Drivetrain']
Numeric Features: ['Year', 'Kilometer', 'Engine', 'Length', 'Width', 'Height', 'Seating Capacity', 'Fuel Tank Capacity', 'Power_bhp', 'Power_rpm', 'Torque_Nm', 'Torque_rpm']


## Machine Learning

In [10]:
from sklearn.model_selection import train_test_split
# Separate the regression target (Price) from the feature matrix
# (every remaining column).
y = df_main["Price"]
X = df_main.drop(columns="Price")

for label, part in (("X shape:", X), ("y shape:", y)):
    print(label, part.shape)

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

for label, features, target in (
    ("Training set:", X_train, y_train),
    ("Testing set:", X_test, y_test),
):
    print(label, features.shape, target.shape)


X shape: (1914, 20)
y shape: (1914,)
Training set: (1531, 20) (1531,)
Testing set: (383, 20) (383,)


In [None]:
import mlflow
import mlflow.xgboost
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# Send runs to the local MLflow tracking server under a named experiment.
mlflow.set_tracking_uri("http://127.0.0.1:8080")
mlflow.set_experiment("MLOPS_practice")

# X_train / X_test / y_train / y_test are reused from the train/test split
# cell; repeating the identical split (same inputs, test_size and seed) here
# produced the same partitions.

with mlflow.start_run():
    # Hyperparameters passed to the regressor and logged with the run.
    params = {
        "n_estimators": 200,
        "learning_rate": 0.1,
        "max_depth": 6,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": 42
    }

    # Define the XGBoost regressor
    model = xgb.XGBRegressor(**params)

    # Train
    model.fit(X_train, y_train)

    # Predict on the held-out rows
    y_pred = model.predict(X_test)

    # Metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log hyperparameters and evaluation metrics
    mlflow.log_params(params)
    mlflow.log_metrics({"RMSE": rmse, "MAE": mae, "R2": r2})

    # A single feature row stored with the model as its input example.
    input_example = X_test.iloc[:1]

    mlflow.xgboost.log_model(
        model,
        name="xgboost-model",
        input_example=input_example
    )
print("✅ XGBoost regression model, hyperparameters and metrics logged to MLflow!")


🏃 View run marvelous-elk-760 at: http://127.0.0.1:8080/#/experiments/620030937936320143/runs/dd9931e69a7343cdb81f5bc07c546a81
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/620030937936320143
✅ Regression MLP model with hyperparameters and loss curve logged to MLflow!
