In [1]:
import pandas as pd
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import city_to_state
import us_state_abbrev
from enum import Enum

In [2]:
class AvoType(Enum):
    ORGANIC = 'organic'
    CONVENTIONAL = 'conventional'

In [3]:
class MapType(Enum):
    MONTHLY = 'year_n_month'
    YEARLY = 'year'

In [4]:
city_to_state_dict = city_to_state.city_to_state_dict
us_state_abbrev_dict = us_state_abbrev.us_state_to_abbrev_dict

In [5]:
regions = {
    'West': ['Utah', 'New Mexico'],
    'Plains': ['Oklahoma', 'Kansas', 'Nebraska', 'South Dakota', 'Wyoming', 'Montana', 'North Dakota'],
    'Southeast': ['West Virginia', 'Virginia', 'Mississippi', 'Alabama'],
    'Northeast': ['New Jersey', 'Rhode Island', 'Vermont', 'New Hampshire', 'Maine', 'Delaware'],
    'Northern New England': ['New Jersey'],
    'Great Lakes': ['Wisconsin', 'Iowa']
}

In [6]:
df = pd.DataFrame()

In [7]:
def set_up():
    avocado_df = pd.read_csv('avocados.csv')

    avocado_df = pd.concat(
        [avocado_df,
         get_predicted_prices_from_all(avocado_df)],
        join='inner')

    # float year and month column (year.month)
    avocado_df['year_n_month'] = avocado_df['date'].replace('-', '', regex=True)
    avocado_df['year_n_month'] = pd.to_numeric(avocado_df['year_n_month'])
    avocado_df['year_n_month'] = avocado_df['year_n_month'].apply(to_readable_date)

    # state column from city
    avocado_df['state'] = avocado_df['geography'].apply(to_state)
    # get states not included from regions (plains, northern new england, great lakes)
    avocado_df = pd.concat([avocado_df, get_all_non_included_states_from_regions(avocado_df)], ignore_index=True)
    # code column from state names (for map visualization)
    avocado_df['code'] = avocado_df['state'].apply(to_abbrev)

    # print(avocado_df.loc[(avocado_df['state'] == 'NaC')].geography.unique())
    # print(avocado_df.loc[(avocado_df['code'] == 'NaS')].geography.unique())

    return avocado_df

In [8]:
def get_predicted_prices_from_all(avocado_df):
    from_all_df = avocado_df[0:0]
    count = (len(avocado_df['geography'].unique()))
    for geography in avocado_df['geography'].unique():
        temp = get_predicted_prices_from(avocado_df.loc[(avocado_df['geography'] == geography)].copy(deep=True))
        temp['geography'] = geography
        from_all_df = pd.concat(
            [from_all_df,
             temp],
            join='inner')
        count -= 1
        print(count)
    from_all_df['date'] = pd.to_datetime(from_all_df['date'])
    from_all_df['date'] = from_all_df['date'].dt.date
    from_all_df['date'] = from_all_df['date'].astype(str)
    return from_all_df

In [9]:
def get_predicted_prices_from(region_df):
    res_df = region_df[0:0]
    organic_train_df = region_df.loc[(region_df['type'] == 'organic')]
    conventional_train_df = region_df.loc[(region_df['type'] == 'conventional')]
    organic_train_df = date_to_number_columns(organic_train_df.copy(deep=True))
    conventional_train_df = date_to_number_columns(conventional_train_df.copy(deep=True))
    organic_predict = predict(train(organic_train_df))
    conventional_predict = predict(train(conventional_train_df))
    organic_predict['type'] = 'organic'
    conventional_predict['type'] = 'conventional'
    res_df = pd.concat([res_df, organic_predict], join='inner')
    res_df = pd.concat([res_df, conventional_predict], join='inner')
    return res_df

In [10]:
def train(train_df):
    labels = train_df['average_price']

    features = train_df.drop('average_price', axis=1)

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.2, random_state=42)

    rf = RandomForestRegressor(n_estimators=1000, random_state=42)
    rf.fit(train_features, train_labels)

    return rf

In [11]:
def predict(model):
    predict_df = pd.DataFrame(pd.date_range(start='1-16-2021', end='1-16-2025', freq='8D'), columns=['date'])
    predict_df['year'] = predict_df['date'].apply(date_to_year)
    predict_df['month'] = predict_df['date'].apply(date_to_month)
    predict_df['day'] = predict_df['date'].apply(date_to_day)
    other = predict_df.drop('date', axis=1)
    predict_df['average_price'] = model.predict(other)

    return predict_df

In [12]:
def get_all_non_included_states_from_regions(avocado_df):
    total_df = avocado_df[0:0]
    for region in regions.keys():
        curr_df = avocado_df.loc[(avocado_df['geography'] == region)]
        for x in range(0, len(regions[region])):
            temp = curr_df.assign(state=regions[region][x])
            total_df = pd.concat([total_df, temp], ignore_index=True)
    return total_df

In [13]:
def show_map(map_type, avo_typ):
    show_df = get_from_all(map_type, avo_typ)

    color_scale = 'deep'
    if avo_typ == AvoType.CONVENTIONAL:
        color_scale = 'amp'

    max_val = show_df.loc[(df['type'] == avo_typ.value), :]['average_price'].max()
    min_val = show_df.loc[(df['type'] == avo_typ.value), :]['average_price'].min()

    fig = px.choropleth(
        show_df,
        locations='code',
        locationmode='USA-states',
        hover_name='state',
        color='average_price',
        color_continuous_scale=color_scale,
        range_color=(min_val, max_val),
        animation_frame=map_type.value,
        scope='usa',
        title=f'Prices of {avo_typ.name} Avocados across the U.S. ({map_type.name})',
        labels={'average_price': 'average price', 'year_n_month': 'date'}
    )
    fig.show()

In [14]:
def get_from_all(map_typ, avo_typ):
    chosen_df = df.copy()

    from_all = chosen_df[0:0]
    for x in range(0, len(chosen_df[map_typ.value].unique())):
        from_all = (pd.concat(
            [from_all,
             get_from(map_typ, chosen_df[map_typ.value].unique()[x], avo_typ)],
            ignore_index=True))
    return from_all

In [15]:
def get_from(map_typ, val, avo_typ):
    vals_df = df.loc[((df[map_typ.value] == val) & (df['type'] == avo_typ.value))]
    temp = vals_df.groupby(vals_df.state)['average_price'].mean()

    res = pd.DataFrame(temp.index.values, columns=['state'])

    res['code'] = res['state'].apply(to_abbrev)
    res['average_price'] = temp.values
    res[map_typ.value] = val
    res = res.drop(res[res['state'] == 'NaC'].index)

    return res

In [16]:
def to_abbrev(state):
    if state in us_state_abbrev_dict:
        return us_state_abbrev_dict[state]
    else:
        return 'NaS'

In [17]:
def to_state(city):
    if city in city_to_state_dict:
        return city_to_state_dict[city]

    city = city.split("/")[0]
    if city in city_to_state_dict:
        return city_to_state_dict[city]
    if city in us_state_abbrev_dict:
        return city
    elif city == 'Hartford':
        return city_to_state_dict['West Hartford']
    elif city == 'Roanoke':
        return city_to_state_dict['Roanoke Rapids']
    else:
        return 'NaC'

In [18]:
def date_to_year(date):
    return date.year

In [19]:
def date_to_month(date):
    return date.month

In [20]:
def date_to_day(date):
    return date.day

In [21]:
def date_to_number_columns(chosen_df):
    chosen_df['date'] = pd.to_datetime(chosen_df['date'])
    chosen_df['month'] = chosen_df['date'].apply(date_to_month)
    chosen_df['day'] = chosen_df['date'].apply(date_to_day)
    chosen_df = chosen_df[['average_price', 'year', 'month', 'day']]
    return chosen_df

In [22]:
def to_readable_date(date):
    date = date // 100
    year = date // 100
    month = date % 100
    return year + (month / 100)

In [23]:
def show_scatter():
    us_df = df.loc[(df['geography'] == 'Total U.S.'), :].copy(deep=False)
    us_df['date'] = pd.to_datetime(us_df['date'])
    us_fig = px.scatter(
        us_df,
        x='date',
        y='total_volume',
        color='type',
        size=us_df['average_price'].values,
        title='Total Volume of Avocados Sold in the U.S.',
        trendline="ols",
    )
    us_fig.show()

In [24]:
def main():
    global df
    df = set_up()

    show_map(MapType.YEARLY, AvoType.ORGANIC)
    show_map(MapType.YEARLY, AvoType.CONVENTIONAL)
    show_map(MapType.MONTHLY, AvoType.ORGANIC)
    show_map(MapType.MONTHLY, AvoType.CONVENTIONAL)

    # show_scatter()

In [25]:
main()

53


52


51


50


49


48


47


46


45


44


43


42


41


40


39


38


37


36


35


34


33


32


31


30


29


28


27


26


25


24


23


22


21


20


19


18


17


16


15


14


13


12


11


10


9


8


7


6


5


4


3


2


1


0


ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['date', 'average_price', 'type', 'year', 'geography', 'year_n_month', 'state', 'code'] but received: total_volume