In [82]:
pip install catboost

Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[K     |████████████████████████████████| 76.1 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.4


In [83]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [86]:
data=pd.read_csv("/content/Vegetable_market.csv")

In [87]:
data

Unnamed: 0,Vegetable,Season,Month,Temp,Deasaster Happen in last 3month,Vegetable condition,Price per kg
0,potato,winter,jan,15,no,fresh,20
1,tomato,winter,jan,15,no,fresh,50
2,peas,winter,jan,15,no,fresh,70
3,pumkin,winter,jan,15,no,fresh,25
4,cucumber,winter,jan,15,no,fresh,20
...,...,...,...,...,...,...,...
116,brinjal,winter,jan,15,yes,fresh,33
117,ginger,winter,jan,15,no,fresh,88
118,potato,summer,apr,32,no,fresh,24
119,peas,summer,apr,33,no,fresh,33


In [88]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 7 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Vegetable                        121 non-null    object
 1   Season                           121 non-null    object
 2   Month                            121 non-null    object
 3   Temp                             121 non-null    int64 
 4   Deasaster Happen in last 3month  121 non-null    object
 5   Vegetable condition              121 non-null    object
 6   Price per kg                     121 non-null    int64 
dtypes: int64(2), object(5)
memory usage: 6.7+ KB


In [89]:
def onehot_encode(df, column):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

In [90]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Clean Vegetable condition column
    df['Vegetable condition'] = df['Vegetable condition'].replace({'scarp': 'scrap'})
    
    # Binary encoding
    df['Deasaster Happen in last 3month'] = df['Deasaster Happen in last 3month'].replace({'no': 0, 'yes': 1})
    
    # Ordinal encoding
    df['Month'] = df['Month'].replace({
        'jan': 1,
        'apr': 4,
        'july': 7,
        'sept': 9,
        'oct': 10,
        'dec': 12,
        'may': 5,
        'aug': 8,
        'june': 6,
        ' ': np.NaN,
        'march': 3
    })
    
    # Fill missing month values with column mode
    df['Month'] = df['Month'].fillna(df['Month'].mode()[0])
    
    # One-hot encoding
    for column in ['Vegetable', 'Season', 'Vegetable condition']:
        df = onehot_encode(df, column)
    
    # Split df into X and y
    y = df['Price per kg']
    X = df.drop('Price per kg', axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [91]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [92]:
X_train

Unnamed: 0,Month,Temp,Deasaster Happen in last 3month,Vegetable_Bitter gourd,Vegetable_Raddish,Vegetable_brinjal,Vegetable_cabage,Vegetable_califlower,Vegetable_chilly,Vegetable_cucumber,...,Vegetable_radish,Vegetable_tomato,Season_autumn,Season_monsoon,Season_spring,Season_summer,Season_winter,Vegetable condition_avarage,Vegetable condition_fresh,Vegetable condition_scrap
80,-0.807171,0.135584,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,-0.408248,-0.19245,-0.669534,1.024100,2.236068,-1.452966,-0.427900
38,1.067549,0.569452,1.628550,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,2.449490,-0.19245,-0.669534,-0.976467,-0.447214,0.688247,-0.427900
19,0.130189,0.894854,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,-0.408248,-0.19245,1.493576,-0.976467,-0.447214,0.688247,-0.427900
120,0.130189,0.786387,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,4.472136,-0.346410,-0.156174,-0.408248,-0.19245,1.493576,-0.976467,-0.447214,0.688247,-0.427900
27,0.130189,1.437189,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,-0.408248,-0.19245,1.493576,-0.976467,-0.447214,0.688247,-0.427900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,-0.807171,-1.057554,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,2.886751,-0.156174,-0.408248,-0.19245,-0.669534,1.024100,-0.447214,0.688247,-0.427900
72,2.629816,-1.057554,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,-0.408248,-0.19245,-0.669534,1.024100,-0.447214,-1.452966,2.336993
12,-0.807171,-1.057554,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,3.316625,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,-0.408248,-0.19245,-0.669534,1.024100,-0.447214,0.688247,-0.427900
107,2.629816,-0.406752,-0.614043,3.974921,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,-0.408248,-0.19245,-0.669534,1.024100,-0.447214,-1.452966,2.336993


In [93]:
y_train

80      32
38     250
19     100
120      9
27      20
      ... 
9       45
72      10
12      20
107     32
37      40
Name: Price per kg, Length: 84, dtype: int64

In [94]:
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(),
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor(),
    "                               XGBoost": XGBRegressor(),
    "                              LightGBM": LGBMRegressor(),
    "                              CatBoost": CatBoostRegressor(verbose=0)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                        Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
                              LightGBM trained.
                              CatBoost trained.


In [95]:
##results

In [96]:
for name, model in models.items():
    print(name + " R^2 Score: {:.5f}".format(model.score(X_test, y_test)))

                     Linear Regression R^2 Score: 0.71175
 Linear Regression (L2 Regularization) R^2 Score: 0.71197
 Linear Regression (L1 Regularization) R^2 Score: 0.70461
                   K-Nearest Neighbors R^2 Score: 0.24744
                        Neural Network R^2 Score: -0.35216
Support Vector Machine (Linear Kernel) R^2 Score: 0.41825
   Support Vector Machine (RBF Kernel) R^2 Score: -0.12694
                         Decision Tree R^2 Score: 0.59517
                         Random Forest R^2 Score: 0.64304
                     Gradient Boosting R^2 Score: 0.62804
                               XGBoost R^2 Score: 0.69530
                              LightGBM R^2 Score: 0.15834
                              CatBoost R^2 Score: 0.62010
