In [1]:
pip install gradio pandas lightgbm catboost scikit-learn plotly

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
import gradio as gr                     # For building the web interface
import pandas as pd                    # For data manipulation
import lightgbm as lgb                 # LightGBM regression model
from catboost import CatBoostRegressor # CatBoost regression model
from sklearn.linear_model import LinearRegression, Ridge  # Linear & Ridge regression models
from sklearn.preprocessing import LabelEncoder            # For encoding categorical variables
from sklearn.metrics import r2_score, mean_absolute_error # For model evaluation metrics
import plotly.graph_objects as go      # For interactive plotting
import re                              # For regex operations

In [4]:
# Load data
# Parse the CSV once. `raw_df` keeps the original, human-readable values (used for
# dropdown choices and display); `df` is an independent copy that later cells encode.
raw_df = pd.read_csv("/content/Clean Data_pakwheels (1).csv")
df = raw_df.copy()

In [5]:
# --- Encode Categorical Features Using LabelEncoder ---

# List of categorical columns to encode
categorical_cols = ['Company Name', 'Model Name', 'Location', 'Engine Type',
                    'Color', 'Assembly', 'Body Type', 'Transmission Type', 'Registration Status']

# Dictionary to store the fitted LabelEncoder of each column (reused at prediction time)
label_encoders = {}

# Fit every encoder on the untouched values in `raw_df` and write the integer codes
# into `df`. Reading from `raw_df` (instead of from the column being overwritten)
# keeps this cell idempotent: re-running it yields the same codes and the same
# encoder classes rather than re-encoding already-encoded integers.
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(raw_df[col].astype(str))  # values are compared as strings
    label_encoders[col] = encoder


In [6]:
# Prepare features and target

# Target (y): the 'Price' column — the value the regressors are fitted to predict
y = df["Price"]

# Features (X): every other column of the encoded frame
X = df.drop(columns=["Price"])

# Replace each run of non-word characters (spaces, punctuation, ...) in the column
# names with a single underscore, so the prediction code can rebuild the same names
sanitized_names = []
for original_name in X.columns:
    sanitized_names.append(re.sub(r'\W+', '_', original_name))
X.columns = sanitized_names


In [7]:
# Train models
# All four regressors are fitted on the complete feature matrix X and target y.

# CatBoost gradient-boosting regressor; verbose=0 suppresses its training log.
# Note: the categorical columns reach it already label-encoded as numbers
# (no cat_features argument is passed).
cat_model = CatBoostRegressor(verbose=0)
cat_model.fit(X, y)

# LightGBM gradient-boosting regressor with default settings
lgb_model = lgb.LGBMRegressor()
lgb_model.fit(X, y)

# Ridge regression: linear model with L2 regularization (default strength)
ridge_model = Ridge()
ridge_model.fit(X, y)

# Ordinary least-squares linear regression as the unregularized baseline
linear_model = LinearRegression()
linear_model.fit(X, y)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 827
[LightGBM] [Info] Number of data points in the train set: 46022, number of used features: 13
[LightGBM] [Info] Start training from score 2014153.231063


In [8]:
# --- Tab 1: Model Comparison ---
def plot_comparison():
    """Plot every model's predictions against the actual prices and score them.

    Predictions are made on the module-level ``X`` and compared with ``y`` — the
    same data the models in this notebook are fitted on, so the reported numbers
    are in-sample scores.

    Returns:
        tuple: the Plotly figure followed by four summary strings (R² and MAE),
        in the order CatBoost, LightGBM, Ridge, Linear Regression.
    """
    regressors = {
        "CatBoost": cat_model,
        "LightGBM": lgb_model,
        "Ridge": ridge_model,
        "Linear Regression": linear_model,
    }
    predictions = {label: estimator.predict(X) for label, estimator in regressors.items()}

    # One line trace per series: the actual prices first, then each model
    fig = go.Figure()
    fig.add_trace(go.Scatter(y=y, mode='lines', name="Actual"))
    for label, predicted in predictions.items():
        fig.add_trace(go.Scatter(y=predicted, mode='lines', name=label))
    fig.update_layout(title="Model Comparison", xaxis_title="Index", yaxis_title="Price")

    # One "<model> R²: ..., MAE: ..." line per model, in registry order
    scores = []
    for label, predicted in predictions.items():
        r2 = r2_score(y, predicted)
        mae = mean_absolute_error(y, predicted)
        scores.append(f"{label} R²: {r2:.4f}, MAE: {mae:,.0f}")

    return (fig, *scores)

# Create the Gradio interface for Tab 1
# plot_comparison returns one figure plus four score strings, so the outputs are
# a plot followed by four text boxes.
score_outputs = ["text"] * 4
tab1 = gr.Interface(
    title="Model Comparison",
    fn=plot_comparison,
    inputs=[],
    outputs=[gr.Plot(), *score_outputs],
)


# --- Tab 2: Prediction Interface ---
def predict_price(company, model, year, location, mileage, engine_type, capacity,
                  color, assembly, body_type, transmission, registration, model_choice):
    """Predict a car's price from the form inputs with the selected regressor.

    Categorical inputs are encoded with the fitted ``label_encoders``; the single
    row is then renamed, padded and ordered to match the training columns
    ``X.columns`` before being handed to the chosen model.

    Args:
        company, model, location, engine_type, color, assembly, body_type,
        transmission, registration: raw category values for the columns listed
            in ``categorical`` below.
        year, mileage, capacity: numeric values passed to the model unchanged.
        model_choice: one of "LightGBM", "CatBoost", "Ridge", "Linear Regression".

    Returns:
        str: "Predicted Price: PKR <amount>" on success; otherwise a message
        naming the inputs that are missing or were never seen by the encoder.
    """
    categorical = {
        'Company Name': company,
        'Model Name': model,
        'Location': location,
        'Engine Type': engine_type,
        'Color': color,
        'Assembly': assembly,
        'Body Type': body_type,
        'Transmission Type': transmission,
        'Registration Status': registration,
    }
    numeric = {
        'Model Year': year,
        'Mileage': mileage,
        'Engine Capacity': capacity,
    }
    regressors = {
        "LightGBM": lgb_model,
        "CatBoost": cat_model,
        "Ridge": ridge_model,
        "Linear Regression": linear_model,
    }

    # Widgets left empty are expected to arrive as None (or "") — TODO confirm for the
    # Gradio version in use. Report every gap at once instead of failing inside transform().
    missing = [field for field, value in {**categorical, **numeric}.items()
               if value is None or value == ""]
    if missing:
        return "Please provide a value for: " + ", ".join(missing)
    if model_choice not in regressors:
        return "Please choose a model before predicting."

    # Encode the categorical values. The encoders were fitted on str-cast columns,
    # so cast here as well; a label the encoder never saw raises ValueError.
    features = dict(numeric)
    for field, value in categorical.items():
        try:
            features[field] = label_encoders[field].transform([str(value)])[0]
        except ValueError:
            return f"Unknown {field}: {value!r}. Please pick one of the listed options."

    input_df = pd.DataFrame([features])

    # Clean column names the same way the training columns were cleaned
    input_df.columns = [re.sub(r'\W+', '_', col) for col in input_df.columns]

    # Match the training layout: same column order, training columns that the
    # form does not supply are filled with 0
    input_df = input_df.reindex(columns=X.columns, fill_value=0)

    # `regressor` rather than `model`, so the `model` argument is not shadowed
    regressor = regressors[model_choice]
    pred = regressor.predict(input_df)[0]
    return f"Predicted Price: PKR {pred:,.0f}"


# Helper to filter model names based on selected company
def filter_models(company_name):
    """Build the dropdown update that lists the models of ``company_name``.

    Args:
        company_name: value to match against ``raw_df['Company Name']``.

    Returns:
        The ``gr.update`` payload carrying the distinct 'Model Name' values of the
        matching rows. The current selection is reset (``value=None``) because a
        model picked for the previous company would not be among the new choices.
    """
    same_company = raw_df['Company Name'] == company_name
    filtered_models = raw_df.loc[same_company, 'Model Name'].unique().tolist()
    return gr.update(choices=filtered_models, value=None)


# Create Tab 2 layout using Gradio Blocks
def _unique_values(column):
    """Return the distinct values of ``raw_df[column]`` as a plain list."""
    return raw_df[column].unique().tolist()


with gr.Blocks(css="button {background-color: #40E0D0 !important;}") as tab2:
    gr.Markdown("### Car Price Predictor")

    # Company and model share one row; the model list stays empty until a company is picked
    with gr.Row():
        company = gr.Dropdown(label="Company Name", choices=_unique_values('Company Name'))
        model = gr.Dropdown(label="Model Name")

    # One input widget per remaining car feature, choices taken from the raw data
    year = gr.Slider(
        minimum=raw_df['Model Year'].min(),
        maximum=raw_df['Model Year'].max(),
        step=1,
        label="Model Year",
    )
    location = gr.Dropdown(label="Location", choices=_unique_values('Location'))
    mileage = gr.Number(label="Mileage (km)")
    engine_type = gr.Dropdown(label="Engine Type", choices=_unique_values('Engine Type'))
    capacity = gr.Number(label="Engine Capacity (cc)")
    color = gr.Dropdown(label="Color", choices=_unique_values('Color'))
    assembly = gr.Dropdown(label="Assembly", choices=_unique_values('Assembly'))
    body_type = gr.Dropdown(label="Body Type", choices=_unique_values('Body Type'))
    transmission = gr.Dropdown(label="Transmission", choices=_unique_values('Transmission Type'))
    registration = gr.Dropdown(label="Registration", choices=_unique_values('Registration Status'))

    # Which trained regressor should produce the estimate
    model_choice = gr.Radio(
        choices=["CatBoost", "LightGBM", "Ridge", "Linear Regression"],
        label="Choose Model",
    )

    predict_btn = gr.Button("Predict")
    output_text = gr.Textbox(label="Output")

    # Inputs are listed in the positional order of predict_price's parameters
    form_inputs = [
        company, model, year, location, mileage, engine_type, capacity,
        color, assembly, body_type, transmission, registration, model_choice,
    ]
    predict_btn.click(fn=predict_price, inputs=form_inputs, outputs=output_text)

    # Refresh the model dropdown whenever the selected company changes
    company.change(fn=filter_models, inputs=company, outputs=model)


# --- Tab 3: A* Search Results ---
# (The earlier placeholder `tab3` interface was removed: it was overwritten by the
# real `tab3` defined below before anything used it.)

# NOTE: despite the "A*" label, no graph search is run here. Each car's score is the
# plain row-wise sum of its processed feature values, and the lowest-scoring row wins.
row_scores = X.sum(axis=1)
df_combined = X.copy()  # Use processed feature set (X) directly
df_combined["Score"] = row_scores
best_index = df_combined["Score"].idxmin()

# Retrieve the best car from the original (un-encoded) data for display purposes
best_car = raw_df.loc[[best_index]].copy()
input_df_astar = X.loc[[best_index]]  # Single-row frame in the model's feature format

# Predict price using CatBoost for the best car and attach its score
best_car["Predicted_Price"] = cat_model.predict(input_df_astar)[0]
best_car["Score"] = df_combined.loc[best_index, "Score"]

# Display A* Search recommendation
def show_astar():
    """Format the module-level ``best_car`` row as Markdown for the A* tab.

    Returns:
        str: a Markdown summary (company, model, year, mileage, predicted price,
        score) of the first row of ``best_car``, or a fallback message when
        ``best_car`` has not been defined in the notebook's globals.
    """
    if 'best_car' not in globals():
        return "A* Search results not available. Please run the A* code first."

    company_name = best_car['Company Name'].values[0]
    model_name = best_car['Model Name'].values[0]
    model_year = int(best_car['Model Year'].values[0])
    mileage = int(best_car['Mileage'].values[0])
    predicted_price = best_car['Predicted_Price'].values[0]
    score = best_car['Score'].values[0]

    fields = [
        "🚘 **Recommended Car Based on A* Search**",
        f"**Company Name:** {company_name}",
        f"**Model Name:** {model_name}",
        f"**Model Year:** {model_year}",
        f"**Mileage:** {mileage:,} km",
        f"**Predicted Price:** PKR {predicted_price:,.0f}",
        f"**A* Score:** {score:.4f}",
    ]
    # Separate fields with blank lines: Markdown folds single newlines into one
    # paragraph, which would render every field on the same line.
    return "\n\n".join(fields)

# Gradio interface for A* Search tab
# The tab takes no inputs; show_astar's Markdown string is rendered in one Markdown component
astar_output = gr.Markdown()
tab3 = gr.Interface(
    title="A* Search Results",
    fn=show_astar,
    inputs=[],
    outputs=astar_output,
)


# --- Tab 4: About ---
# Static Markdown shown on the About tab
ABOUT_TEXT = """### About
This app predicts car prices using various regression models including CatBoost, LightGBM, Ridge, and Linear Regression.
It is designed to help users get an estimated price based on car features such as company, model, engine, mileage, etc.
Models are trained on real-world data from Pakistani automobile listings."""

tab4 = gr.Interface(
    title="About",
    fn=lambda: ABOUT_TEXT,
    inputs=[],
    outputs="markdown",
)


# --- Tab 5: Feedback ---
# Multi-line box for the user's comment; the tab only echoes it back with a thank-you
# (nothing in this cell stores the feedback).
feedback_box = gr.Textbox(
    label="Feedback",
    lines=4,
    placeholder="Enter your feedback here...",
)
tab5 = gr.Interface(
    title="Feedback",
    fn=lambda feedback: f"Thank you for your feedback:\n\n{feedback}",
    inputs=feedback_box,
    outputs="text",
)


# --- Combine All Tabs ---
# Pair every tab label with its interface so the two lists cannot drift apart
tab_specs = [
    ("Model Comparison", tab1),
    ("Predict Price", tab2),
    ("A* Search", tab3),
    ("About", tab4),
    ("Feedback", tab5),
]
app = gr.TabbedInterface(
    [interface for _, interface in tab_specs],
    tab_names=[label for label, _ in tab_specs],
)

# Start the Gradio server for the combined app
app.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ba583ecf34f2a4602e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


