# Random Forest Regression

The first random forest algorithm was created by Tin Kam Ho in 1995.

What it is: a supervised machine learning algorithm that uses an ensemble learning method and can do both classification and regression.

It is a bagging technique.

It combines multiple decision trees.

Pros: it is one of the most accurate learning algorithms, runs efficiently on large databases, and has an effective method for estimating missing data.

Cons: it can overfit on some datasets.


In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [2]:
def set_up(csv_path='avocados.csv'):
    """Load the avocado price data and build the model's feature table.

    Reads the CSV, keeps only the rows whose 'geography' is 'Total U.S.'
    and whose 'type' is 'conventional', and derives numeric 'month' and
    'day' columns from the 'date' column. The 'year' column is taken from
    the CSV unchanged.

    Args:
        csv_path: Path of the CSV file to read. Defaults to 'avocados.csv'.

    Returns:
        A DataFrame with the columns 'average_price', 'year', 'month' and
        'day', in that order, indexed like the matching rows of the CSV.
    """
    raw = pd.read_csv(csv_path)
    wanted = (raw['geography'] == 'Total U.S.') & (raw['type'] == 'conventional')

    # Copy the filtered rows so the column assignments below write to an
    # independent frame instead of a slice of the raw data (which would
    # raise pandas' chained-assignment warning).
    df = raw.loc[wanted].copy()

    dates = pd.to_datetime(df['date'])
    df['month'] = dates.dt.month
    df['day'] = dates.dt.day
    return df[['average_price', 'year', 'month', 'day']]

In [3]:
def train(df):
    """Fit a random forest on the price data and plot held-out predictions.

    The 'average_price' column is the regression target; every other
    column of ``df`` is used as a feature. 20% of the rows are held out,
    the model is fitted on the rest, and the held-out actual prices
    (scatter) are shown together with the model's predictions (orange
    line) on one figure.

    Args:
        df: DataFrame containing 'average_price' plus the feature columns.
            The held-out rows are passed to ``nums_to_date``, which reads
            'year', 'month' and 'day'.

    Returns:
        The fitted RandomForestRegressor.
    """
    target = df['average_price']
    predictors = df.drop('average_price', axis=1)

    # Fixed seeds keep both the split and the forest reproducible.
    split = train_test_split(
        predictors, target, test_size=0.2, random_state=42)
    train_features, test_features, train_labels, test_labels = split

    rf = RandomForestRegressor(n_estimators=1000, random_state=42)
    rf.fit(train_features, train_labels)

    # Rebuild a date for each held-out row so results can be plotted
    # against time, then order the rows chronologically for the line trace.
    test_dates = test_features.apply(nums_to_date, axis=1)
    display_df = pd.DataFrame(test_dates, columns=['date'])
    display_df = display_df.assign(
        actual=test_labels,
        predict=rf.predict(test_features),
    )
    display_df = display_df.sort_values(by='date')

    actual_fig = px.scatter(display_df, x='date', y='actual')
    predicted_fig = px.line(display_df, x='date', y='predict')
    predicted_fig.update_traces(line_color='orange')

    # Overlay both traces on a single figure.
    go.Figure(data=actual_fig.data + predicted_fig.data).show()

    return rf

In [4]:
def predict(rf, start='1-16-2021', end='1-16-2029', freq='8D'):
    """Plot the model's price predictions over a range of future dates.

    Builds a regularly spaced date range, splits each date into 'year',
    'month' and 'day' feature columns, asks ``rf`` for a prediction per
    date and shows the result as a line chart.

    Args:
        rf: Fitted model exposing ``predict``; it is called with a
            DataFrame whose columns are 'year', 'month', 'day'.
        start: First date of the range. Defaults to '1-16-2021'.
        end: Upper bound of the range. Defaults to '1-16-2029'.
        freq: Spacing between dates, as a pandas frequency string.
            Defaults to '8D'.

    Returns:
        None. The figure is displayed as a side effect.
    """
    # date_range already yields datetime64 values, so no further
    # conversion is needed before using the .dt accessors.
    predict_df = pd.DataFrame(
        {'date': pd.date_range(start=start, end=end, freq=freq)})

    dates = predict_df['date']
    predict_df['year'] = dates.dt.year
    predict_df['month'] = dates.dt.month
    predict_df['day'] = dates.dt.day

    # Feed the model every column except the raw date.
    features = predict_df[['year', 'month', 'day']]
    predict_df['predict'] = rf.predict(features)

    fig4 = px.line(predict_df, x='date', y='predict')
    fig4.show()

In [5]:
def date_to_year(date):
    """Return the ``year`` attribute of a date-like object."""
    year = date.year
    return year

In [6]:
def date_to_month(date):
    """Return the ``month`` attribute of a date-like object."""
    month = date.month
    return month

In [7]:
def date_to_day(date):
    """Return the ``day`` attribute of a date-like object."""
    day = date.day
    return day

In [8]:
def nums_to_date(row):
    """Combine a row's 'year', 'month' and 'day' values into a Timestamp.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with integer-valued
            'year', 'month' and 'day' entries.

    Returns:
        The pandas Timestamp for that calendar date.

    Raises:
        ValueError: If the values do not form a valid calendar date.
    """
    # Build the date from its numeric parts. Concatenating the unpadded
    # numbers into one string is ambiguous: 2020/1/12 and 2020/11/2 both
    # become "2020112", so the wrong date can be parsed.
    return pd.Timestamp(
        year=int(row['year']),
        month=int(row['month']),
        day=int(row['day']),
    )

In [9]:
def main():
    """Run the pipeline: load the data, train the model, plot the forecast."""
    model = train(set_up())
    predict(model)

In [10]:
# Run the whole pipeline: set_up() -> train() -> predict().
main()