<a href="https://colab.research.google.com/github/DMNDKDasanayaka/Price-Prediction-Model/blob/main/Carrot_Price_Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [37]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [4]:
data=pd.read_csv("carrot_dataset.csv")

In [5]:
data

Unnamed: 0,Date,Average_Carrot_Price,Fuel_Price,Average_Rainfall,Average_Temperature
0,2016.01.01,175.0,109,13.94325,21.275
1,2016.01.02,91.5,109,2.01325,20.875
2,2016.01.03,200.0,109,0.29300,21.900
3,2016.01.04,175.0,109,0.13325,22.150
4,2016.01.05,160.0,109,0.12350,22.375
...,...,...,...,...,...
3648,2025.12.27,215.0,277,0.17500,19.225
3649,2025.12.28,225.0,277,1.20000,19.900
3650,2025.12.29,200.0,277,1.05000,20.500
3651,2025.12.30,190.0,277,1.77500,21.150


In [47]:
# Ensure date column is datetime
data['Date'] = pd.to_datetime(data['Date'])

# Create numeric features from date
data['year'] = data['Date'].dt.year
data['month'] = data['Date'].dt.month
data['day'] = data['Date'].dt.day


data['Average_Carrot_Price'] = data['Average_Carrot_Price'].astype(int)

In [58]:
data.drop(columns=['day_of_week'], inplace=True)
data.drop(columns=['week_of_year'], inplace=True)

In [59]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3653 entries, 0 to 3652
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  3653 non-null   datetime64[ns]
 1   Average_Carrot_Price  3653 non-null   int64         
 2   Fuel_Price            3653 non-null   int64         
 3   Average_Rainfall      3653 non-null   float64       
 4   Average_Temperature   3653 non-null   float64       
 5   year                  3653 non-null   int32         
 6   month                 3653 non-null   int32         
 7   day                   3653 non-null   int32         
dtypes: datetime64[ns](1), float64(2), int32(3), int64(2)
memory usage: 185.6 KB


In [60]:
data

Unnamed: 0,Date,Average_Carrot_Price,Fuel_Price,Average_Rainfall,Average_Temperature,year,month,day
0,2016-01-01,175,109,13.94325,21.275,2016,1,1
1,2016-01-02,91,109,2.01325,20.875,2016,1,2
2,2016-01-03,200,109,0.29300,21.900,2016,1,3
3,2016-01-04,175,109,0.13325,22.150,2016,1,4
4,2016-01-05,160,109,0.12350,22.375,2016,1,5
...,...,...,...,...,...,...,...,...
3648,2025-12-27,215,277,0.17500,19.225,2025,12,27
3649,2025-12-28,225,277,1.20000,19.900,2025,12,28
3650,2025-12-29,200,277,1.05000,20.500,2025,12,29
3651,2025-12-30,190,277,1.77500,21.150,2025,12,30


In [87]:
def preprocess_inputs(df):
    df = df.copy()

# Split df into X and y
    y = df['Average_Carrot_Price']
    X = df.drop(columns=['Date', 'Average_Carrot_Price'], axis=1)

# Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

# Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)


    return X_train, X_test, y_train, y_test


In [86]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [88]:
X_train

Unnamed: 0,Fuel_Price,Average_Rainfall,Average_Temperature,year,month,day
1902,104,0.62500,22.775,2021,3,17
1002,123,65.23950,22.225,2018,9,29
2792,306,0.70000,24.750,2023,8,24
3552,283,1.02500,18.925,2025,9,22
3502,289,24.56600,19.525,2025,8,3
...,...,...,...,...,...,...
2763,308,2.10700,23.950,2023,7,26
905,109,1.07600,23.300,2018,6,24
1096,101,0.10750,20.950,2019,1,1
235,109,12.81675,23.025,2016,8,23


In [89]:
y_train

Unnamed: 0,Average_Carrot_Price
1902,100
1002,105
2792,265
3552,100
3502,310
...,...
2763,365
905,105
1096,102
235,55


**Training**

In [90]:
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(),
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor(),
    "                               XGBoost": XGBRegressor(),
    "                              LightGBM": LGBMRegressor(),
    "                              CatBoost": CatBoostRegressor(verbose=0)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                        Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000578 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 594
[LightGBM] [Info] Number of data points in the train set: 2557, number of used features: 6
[LightGBM] [Info] Start training from score 164.151740
                              LightGBM trained.
                              CatBoost trained.


In [91]:
for name, model in models.items():
    print(name + " R^2 Score: {:.5f}".format(model.score(X_test, y_test)))

                     Linear Regression R^2 Score: 0.36226
 Linear Regression (L2 Regularization) R^2 Score: 0.36226
 Linear Regression (L1 Regularization) R^2 Score: 0.36280
                   K-Nearest Neighbors R^2 Score: 0.70657
                        Neural Network R^2 Score: 0.34869
Support Vector Machine (Linear Kernel) R^2 Score: 0.03990
   Support Vector Machine (RBF Kernel) R^2 Score: -0.04868
                         Decision Tree R^2 Score: 0.79251
                         Random Forest R^2 Score: 0.87315
                     Gradient Boosting R^2 Score: 0.77335
                               XGBoost R^2 Score: 0.86171
                              LightGBM R^2 Score: 0.83694
                              CatBoost R^2 Score: 0.86046


In [92]:
len(y_test)

1096