### Random Forest Model

#### Note: There are misspellings and mistakes in the csv file, these were originally going to be dealt with separately, but we will just fix them in the dataframe preprocessing section

In [25]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('./Vegetable_market.csv')

### PreProcessing
Look at "Printing How Many Unique Values in each Column" to see why we are doing binary encoding

In [29]:
setOfMonths = {
        'jan': 1,
        'apr': 4,
        'july': 7,
        'sept': 9,
        'oct': 10,
        'dec': 12,
        'may': 5,
        'aug': 8,
        'june': 6,
        ' ': np.NaN,
        'march': 3
}

def onehot(df, column):
    df = df.copy() #So that we don't mess withe OG df
    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df


def preprocess_inputs(df):
    df = df.copy() #Creates a fresh copy of the dataframe
    
    # Ordinal Encoding with the months and Fixing empty months
    df['Month'] = df['Month'].replace(setOfMonths)
    df['Month'] = df['Month'].fillna(df['Month'].mode()[0])    
    
    # Fixing the Vegetable Condition column: "scarp" ->  "scrap"
    df['Vegetable condition'] = df['Vegetable condition'].replace({'scarp': 'scrap'})
    
    #Binary Encoding on Disaster Happen in the last 3 months
    df['Deasaster Happen in last 3month'] = df['Deasaster Happen in last 3month'].replace({'no': 0, 'yes': 1})
    
    # Onehot encoding to create only 1's and 0's for ease of compution of categorical data
    for column in ['Vegetable', 'Season', 'Vegetable condition']:
        df = onehot(df, column)

    Y = df['Price per kg'] # What we're finding
    X = df.drop('Price per kg', axis=1) # Everything that goes into Y

    # Split into Test and Train
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.7, shuffle=True, random_state=1)

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = preprocess_inputs(data)

X_train

Unnamed: 0,Month,Temp,Deasaster Happen in last 3month,Vegetable_Bitter gourd,Vegetable_Raddish,Vegetable_brinjal,Vegetable_cabage,Vegetable_califlower,Vegetable_chilly,Vegetable_cucumber,...,Vegetable_radish,Vegetable_tomato,Season_autumn,Season_monsoon,Season_spring,Season_summer,Season_winter,Vegetable condition_avarage,Vegetable condition_fresh,Vegetable condition_scrap
80,-0.807171,0.135584,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,-0.408248,-0.19245,-0.669534,1.024100,2.236068,-1.452966,-0.427900
38,1.067549,0.569452,1.628550,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,2.449490,-0.19245,-0.669534,-0.976467,-0.447214,0.688247,-0.427900
19,0.130189,0.894854,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,-0.408248,-0.19245,1.493576,-0.976467,-0.447214,0.688247,-0.427900
120,0.130189,0.786387,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,4.472136,-0.346410,-0.156174,-0.408248,-0.19245,1.493576,-0.976467,-0.447214,0.688247,-0.427900
27,0.130189,1.437189,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,-0.408248,-0.19245,1.493576,-0.976467,-0.447214,0.688247,-0.427900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,-0.807171,-1.057554,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,2.886751,-0.156174,-0.408248,-0.19245,-0.669534,1.024100,-0.447214,0.688247,-0.427900
72,2.629816,-1.057554,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,-0.408248,-0.19245,-0.669534,1.024100,-0.447214,-1.452966,2.336993
12,-0.807171,-1.057554,-0.614043,-0.251577,-0.251577,-0.156174,-0.301511,3.316625,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,-0.408248,-0.19245,-0.669534,1.024100,-0.447214,0.688247,-0.427900
107,2.629816,-0.406752,-0.614043,3.974921,-0.251577,-0.156174,-0.301511,-0.301511,-0.156174,-0.156174,...,-0.223607,-0.346410,-0.156174,-0.408248,-0.19245,-0.669534,1.024100,-0.447214,-1.452966,2.336993


#### Binary Encoding: Printing How Many Unique Values in each Column
The reason why we do this is so we can see if we need to do binary encoding if there are 2 unique values.  
"Disaster Happen in the last 3 months" has only two values, yes and no, and we will be encoding these

### OneHot Encoding: Unordered, categorical values
The "Vegetables" column are unordered but categorical, so we send it to onehot encoding.  
We will also use this encoding for the seasons since there isn't any particular spacing.  
We will use the *get_dummies()* function to convert to essentially a sparse matrix with *1* as the indicator of vegetable  

### Ordinal Encoding: Issues with the Months
There are some encoding issues with Months, so we are going to set them to their numerical key value. We will do this by just setting a set to replace the current column.  
There are some empty months (i.e. ' ' instead of an actual month).  
We will fix the empty values is to use the *mode* of the set. This will give us more realistic values.   

### Training the Model

In [46]:
from sklearn.metrics import  mean_squared_error 
models = {
    "Random Forest": RandomForestRegressor(),
}
for name, model in models.items():
    model.fit(X_train, Y_train)
    print(name + " trained.")
for name, model in models.items():
    predictions = model.predict(X_test)
    mse = mean_squared_error(Y_test, predictions)
    rmse = np.sqrt(mse)
    print(name + " RMSE: {:.5f}".format(rmse))

Random Forest trained.
Random Forest RMSE: 28.58190
