# Avocado Prices Analysis And Forecasting

### Python Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

%matplotlib inline


from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier,RandomForestRegressor,GradientBoostingRegressor,ExtraTreesRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold,KFold,RepeatedStratifiedKFold,train_test_split,learning_curve
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler,RobustScaler
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC

from sklearn.calibration import CalibratedClassifierCV

from sklearn.metrics import accuracy_score,mean_squared_error

from sklearn.tree import DecisionTreeRegressor

import lightgbm as lgbm
from sklearn import utils
from lightgbm.sklearn import LGBMRegressor

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the raw CSV and work on a copy so `avacado` keeps the untouched data.
avacado = pd.read_csv("avocado.csv")
df = avacado.copy()
# drop_duplicates() returns a new frame; without the assignment the call has
# no effect on df.
df = df.drop_duplicates()
# Preview the first rows instead of rendering the whole frame.
df.head()

**Alright, we need some auxiliary functions to keep the code clean and reusable. Let's define them. :=)**

In [3]:
def load(dataset_path):
    """Read the CSV file at ``dataset_path`` and return it as a DataFrame."""
    return pd.read_csv(dataset_path)
def data_sampling(dataframe):
    """Return a sample, drawn with replacement, of one eighth of the rows.

    The sample size is ``len(dataframe) / 8`` truncated to an integer. The same
    number is also used to seed NumPy's global random generator; the draw
    itself is made with ``random_state=1``.
    """
    sample_size = int(len(dataframe) / 8)
    # Side effect: seeds the global NumPy RNG, which later code relying on the
    # global generator will observe.
    np.random.seed(sample_size)
    sampled = dataframe.sample(n=sample_size, replace=True, random_state=1)
    return sampled

def check_df(dataframe, head=5):
    """Print a quick structural overview of ``dataframe``.

    The report contains, in order: shape, column dtypes, the first and last
    ``head`` rows, the null count per column and a table of quantiles
    (0, 5, 50, 95, 99 and 100 percent) for the numeric columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to describe.
    head : int, default 5
        Number of rows shown in the head and tail previews.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)

    print("##################### Types #####################")
    print(dataframe.dtypes)

    print("##################### Head #####################")
    print(dataframe.head(head))

    print("##################### Tail #####################")
    print(dataframe.tail(head))

    print("##################### NA #####################")
    print(dataframe.isnull().sum())

    print("##################### Quantiles #####################")
    # Restrict to numeric columns explicitly: since pandas 2.0 quantile() no
    # longer skips object columns on its own and raises on text data.
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)

def grab_col_names(dataframe, cat_th=12, car_th=20):
    """Split the columns of ``dataframe`` into categorical and numerical groups.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 12
        A non-object column with fewer than ``cat_th`` distinct values is
        treated as categorical ("numeric but categorical").
    car_th : int, default 20
        An object column with more than ``car_th`` distinct values is treated
        as high-cardinality ("categorical but cardinal").

    Returns
    -------
    tuple of list
        ``(cat_cols, num_cols, cat_but_car, num_but_cat)``. ``cat_cols`` holds
        the object columns followed by the numeric-but-categorical ones, minus
        the high-cardinality columns. A short count summary is also printed.
    """
    object_cols, non_object_cols = [], []
    for name in dataframe.columns:
        if dataframe[name].dtypes == "O":
            object_cols.append(name)
        else:
            non_object_cols.append(name)

    num_but_cat = [name for name in non_object_cols if dataframe[name].nunique() < cat_th]
    cat_but_car = [name for name in object_cols if dataframe[name].nunique() > car_th]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    report = [
        ("Observations", dataframe.shape[0]),
        ("Variables", dataframe.shape[1]),
        ("cat_cols", len(cat_cols)),
        ("num_cols", len(num_cols)),
        ("cat_but_car", len(cat_but_car)),
        ("num_but_cat", len(num_but_cat)),
    ]
    for label, value in report:
        print(f"{label}: {value}")
    return cat_cols, num_cols, cat_but_car, num_but_cat

def cat_summary(dataframe, col_name, plot=False):
    """Print the frequency and percentage of every value in ``col_name``.

    When ``plot`` is true a seaborn count plot of the column is shown as well.
    """
    counts = dataframe[col_name].value_counts()
    table = pd.DataFrame({col_name: counts,
                          "Ratio": 100 * counts / len(dataframe)})
    print(table)
    print("##########################################")

    if not plot:
        return
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.show()

def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics of ``numerical_col`` at many percentiles.

    When ``plot`` is true a 20-bin histogram of the column is shown, titled and
    labelled with the column name.
    """
    percentile_grid = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50,
                       0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    stats = dataframe[numerical_col].describe(percentiles=percentile_grid)
    print(stats.T)

    if not plot:
        return
    dataframe[numerical_col].hist(bins=20)
    plt.xlabel(numerical_col)
    plt.title(numerical_col)
    plt.show(block=True)

def target_summary_with_cat(dataframe, target, categorical_col):
    """Print the mean of ``target`` for each level of ``categorical_col``."""
    group_means = dataframe.groupby(categorical_col)[target].mean()
    summary = pd.DataFrame({"TARGET_MEAN": group_means})
    print(summary, end="\n\n\n")

def target_summary_with_num(dataframe, target, numerical_col):
    """Print the mean of ``numerical_col`` within each value of ``target``."""
    per_target_mean = dataframe.groupby(target).agg({numerical_col: "mean"})
    print(per_target_mean, end="\n\n\n")


def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """Return the ``(low_limit, up_limit)`` fences for ``col_name``.

    The fences lie 1.5 times the inter-quantile range below the ``q1``
    quantile and above the ``q3`` quantile of the column.
    """
    lower_q = dataframe[col_name].quantile(q1)
    upper_q = dataframe[col_name].quantile(q3)
    whisker = 1.5 * (upper_q - lower_q)
    return lower_q - whisker, upper_q + whisker

def check_outlier(dataframe, col_name):
    """Report whether ``col_name`` holds any value outside its outlier fences.

    The fences come from ``outlier_thresholds`` with its default quantiles.

    Returns
    -------
    bool
        True when at least one value is above the upper or below the lower
        fence, otherwise False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    outlier_mask = (column > up_limit) | (column < low_limit)
    # Test the mask itself. Reducing the *values* of the matching rows with
    # .any(axis=None) reports False when those rows only hold falsy values
    # (0, False, ""), even though outliers are present.
    return bool(outlier_mask.any())

def grab_outliers(dataframe, col_name, index=False):
    """Print the rows whose ``col_name`` value lies outside the outlier fences.

    If more than 10 rows qualify, only the ``head()`` of them is printed;
    otherwise all of them are. With ``index=True`` the index of all outlier
    rows is returned; otherwise the function returns None.
    """
    low, up = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    outlier_rows = dataframe[(values < low) | (values > up)]

    preview = outlier_rows.head() if outlier_rows.shape[0] > 10 else outlier_rows
    print(preview)

    if index:
        return outlier_rows.index

def missing_values_table(dataframe, na_name=False):
    """Print the count and percentage of missing values per affected column.

    Only columns with at least one null are listed, sorted from most to least
    missing. With ``na_name=True`` the names of those columns are returned (in
    frame order); otherwise the function returns None.
    """
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0]

    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    return na_columns if na_name else None

def replace_with_thresholds(dataframe, variable):
    """Cap the values of ``variable`` at its outlier fences, in place.

    Values below the lower fence are raised to it and values above the upper
    fence are lowered to it; the fences come from ``outlier_thresholds`` with
    its default quantiles. The column of ``dataframe`` is overwritten and
    nothing is returned.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    # clip() caps both tails in one pass and produces a float column when
    # needed. Writing float fences into an integer column through .loc is a
    # silent upcast that pandas has deprecated since 2.1.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)

def label_encoder(dataframe, binary_col):
    """Label-encode the given columns of ``dataframe`` and return the frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are overwritten with their integer codes.
    binary_col : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with the listed columns encoded.
    """
    le = LabelEncoder()
    for col in binary_col:
        # Encode the column of the frame that was passed in. The previous
        # version read from the global `df`, so any other frame was filled
        # with codes from the wrong data.
        dataframe[col] = le.fit_transform(dataframe[col])
    return dataframe

def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    """Return a copy of ``dataframe`` with ``categorical_cols`` one-hot encoded.

    ``drop_first`` is forwarded to ``pandas.get_dummies``.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)

def plot_importance(model, features, num, save=False):
    """Plot the ``num`` largest feature importances of a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    features : pandas.DataFrame
        Frame whose ``columns`` supply the feature names; presumably the frame
        the model was trained on, so that names and importances line up.
    num : int
        Number of top-ranked features to draw.
    save : bool, default False
        When True the figure is also written to ``importances.png``.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('Features')
    plt.tight_layout()
    # Save before show(): once show() has run, the current figure may already
    # be closed, and savefig() would then write an empty image.
    if save:
        plt.savefig('importances.png')
    plt.show()

def showlineplot(style, x, y, dataframe, alhp,rot,color, xlabel, ylabel, title):
    """Draw and show a seaborn line plot of ``y`` against ``x``.

    ``style`` is passed to ``sns.set_style``, ``alhp`` is the line alpha,
    ``rot`` the rotation of the x tick labels and ``color`` the line colour.
    ``xlabel``, ``ylabel`` and ``title`` label the figure.
    """
    sns.set_style(style)
    line_options = {"alpha": alhp, "color": color}
    sns.lineplot(x=x, y=y, data=dataframe, **line_options)

    plt.xticks(rotation=rot)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()

**We have 18249 observations. That is more than we need for a quick experiment and it would slow down the later steps, so we are diving into data sampling. Yaaay!**

## Data Sampling

In [4]:
# Replace df with the smaller frame returned by data_sampling.
# NOTE(review): not idempotent — running this cell again samples the sample.
df=data_sampling(df)

In [None]:
# Print shape, dtypes, head/tail, null counts and quantiles of the sampled frame.
check_df(df)

- Finally, we have reduced the size of our data in order to get results faster.

*One column has no proper name (`Unnamed: 0`). We remove it, together with `Date`, to tidy up our data.*

In [6]:
# Drop the two columns that are not used in the rest of the notebook.
# Re-binding with errors="ignore" (instead of inplace=True) lets the cell be
# re-run without a KeyError once the columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'Date'], errors="ignore")

***Let's examine our data structure***

In [None]:
# Normalise the column labels: lower-case first, then trim surrounding whitespace.
df.columns = df.columns.str.lower().str.strip()
df.head()

In [None]:
# grab_col_names returns (cat_cols, num_cols, cat_but_car, num_but_cat).
# NOTE(review): the fourth value is bound to `num_but_car` here although the
# function calls it `num_but_cat` — consider renaming if nothing else uses it.
cat_cols,num_cols,cat_but_car,num_but_car=grab_col_names(df)
num_but_car

In [None]:
# Object columns with more distinct values than the car_th limit of grab_col_names.
cat_but_car

In [None]:
# Print descriptive statistics and draw a histogram for every numerical column.
for col in num_cols:
    num_summary(df,col,plot=True)

In [None]:
# Average price per year, coloured by region; the trailing ';' hides the repr.
sns.catplot(x="year",y="averageprice",hue="region",data=df);

*Admittedly, this plot looks confusing. So let's examine our data with the groupby function instead.*

In [None]:
# Totals per region and year. numeric_only=True keeps text columns out of the
# sum; since pandas 2.0 they would otherwise be concatenated into long strings.
df.groupby(['region','year']).sum(numeric_only=True)

**As you can see, there are a lot of regions in our data set :(**

# Data Preparation

### Outlier Analysis

**Alright, let's check whether our data contains outliers. We check each numerical column and then look at the graphs.**

In [None]:
# For every numerical column: report whether it has outliers, then print them.
for col in num_cols:
    has_outlier = check_outlier(df, col)
    print(f"{col} :  {has_outlier}")
    grab_outliers(df, col)

## Outlier Processing

In [12]:
# Cap every numerical column at its outlier fences; df is modified in place.
for col in num_cols:
    replace_with_thresholds(df,col)

***Let's check the variables for outliers one more time***

In [None]:
# After capping, every column is expected to report False.
for col in num_cols:
    print(check_outlier(df,col))

In [None]:
# Box plot of averageprice after capping, as a visual check for remaining outliers.
sns.boxplot(x=df.averageprice);

**We have checked the values of averageprice both functionally and visually**

In [None]:
# Summary statistics of the numeric columns after outlier capping.
df.describe()

In [None]:
# Correlation heatmap of the numeric columns. numeric_only=True is required
# because df still holds text columns here, and since pandas 2.0 corr() raises
# on them instead of skipping them.
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(numeric_only=True),annot=True,fmt=".3f",linewidth=.5,cbar=True);

## Feature Engineering

In [17]:
# Ordered labels for the revenue buckets created in the next cell.
# NOTE(review): "Avarage" is misspelled, but the label is data that flows into
# later cells, so it is left unchanged here.
categories=["Very Low","Low","Avarage","High","Very High"]

In [None]:
# Revenue proxy: average price times total volume, then cut into
# len(categories) equal-width bins labelled with `categories`.
# NOTE(review): both new columns are derived from `averageprice`; if that
# column is used as the prediction target, these features leak it.
df["total_income"] = df["averageprice"].mul(df["total volume"])
df["reveneu_status"] = pd.cut(df["total_income"], bins=len(categories), labels=categories)
df.head(30)

In [None]:
# Number of rows that fall into each revenue bucket.
df.reveneu_status.value_counts()

## Encoding

### One hot Encoding

In [None]:
# Column dtypes and non-null counts before encoding.
df.info()

In [21]:
# One-hot encode every column whose number of distinct values lies in [2, 80];
# the first level of each encoded column is dropped.
ohe_cols = [col for col in df.columns if 2 <= df[col].nunique() <= 80]
df = one_hot_encoder(df, ohe_cols, drop_first=True)


In [None]:
# Column dtypes and non-null counts after one-hot encoding.
df.info()

**We need to check what kind of data we have. Apparently, the region variable has many distinct values. Therefore, we set the upper limit on distinct values for one-hot encoding high enough to include region.**

In [None]:
# The columns that were selected for one-hot encoding earlier.
ohe_cols

In [None]:
# Summary statistics after one-hot encoding.
df.describe()

## Feature Scaling

In [None]:
# Scale the numerical columns with RobustScaler (the variable name notwithstanding).
# NOTE(review): `num_cols` was computed before feature engineering, so it
# presumably still contains `averageprice` and does not contain `total_income`.
# The scaler is also fitted on all rows, before the train/test split.
transform_power = RobustScaler()
df[num_cols] = transform_power.fit_transform(df[num_cols])
df.head()

<a id="11"></a>
# Modeling

In [26]:
from sklearn.linear_model import LinearRegression

TARGET = "averageprice"

# `total_income` (= averageprice * total volume) and the `reveneu_status`
# dummies built from it are functions of the target. Leaving them in X lets
# every model reconstruct the price directly, which inflates the scores, so
# they are excluded from the features.
target_derived = [
    col for col in df.columns
    if str(col) == "total_income" or str(col).startswith("reveneu_status")
]

y = df[TARGET]
X = df.drop(columns=[TARGET, *target_derived])

# 70 % of the rows for training, the rest held out; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70, random_state=17)

In [27]:
def train_model(model, x_train, y_train, x_test):
    """Fit ``model`` on the training data and predict for ``x_test``.

    Returns the fitted model and its predictions, in that order.
    """
    model.fit(x_train, y_train)
    return model, model.predict(x_test)

**OK, we are going to use regression models**

In [28]:
# Plain (non-stratified) 10-fold splitter with shuffling; the fixed seed makes
# the fold assignment repeatable across runs.
kfold = KFold(n_splits=10, shuffle=True, random_state=17)

In [29]:
# Fit a linear regression; y_pred holds its test-set predictions and is
# overwritten by each later model cell.
Lin_reg,y_pred=train_model(LinearRegression(),X_train,y_train,X_test)

In [None]:
# Score on the held-out split (R² for scikit-learn regressors).
Lin_reg.score(X_test,y_test)

In [31]:
# Random forest; the fixed random_state makes the fit repeatable regardless of
# the state of the global NumPy generator.
RFM_reg,y_pred=train_model(RandomForestRegressor(random_state=17),X_train,y_train,X_test)

In [None]:
# Score on the held-out split (R² for scikit-learn regressors).
RFM_reg.score(X_test,y_test)

In [33]:
# Gradient boosting; the fixed random_state makes the fit repeatable regardless
# of the state of the global NumPy generator.
Gradient_reg,y_pred=train_model(GradientBoostingRegressor(random_state=17),X_train,y_train,X_test)

In [None]:
# Score on the held-out split (R² for scikit-learn regressors).
Gradient_reg.score(X_test,y_test)

In [35]:
# Extra-trees ensemble; the fixed random_state makes the fit repeatable
# regardless of the state of the global NumPy generator.
Extra_reg,y_pred=train_model(ExtraTreesRegressor(random_state=17),X_train,y_train,X_test)

In [None]:
# Score on the held-out split (R² for scikit-learn regressors).
Extra_reg.score(X_test,y_test)

In [37]:
# Single decision tree; the fixed random_state makes the fit repeatable
# regardless of the state of the global NumPy generator.
Dec_reg,y_pred=train_model(DecisionTreeRegressor(random_state=17),X_train,y_train,X_test)

In [None]:
# Score on the held-out split (R² for scikit-learn regressors).
Dec_reg.score(X_test,y_test)

In [39]:
# AdaBoost; the fixed random_state makes the fit repeatable regardless of the
# state of the global NumPy generator.
Adabost_reg,y_pred=train_model(AdaBoostRegressor(random_state=17),X_train,y_train,X_test)

In [None]:
# Score on the held-out split (R² for scikit-learn regressors).
Adabost_reg.score(X_test,y_test)

In [41]:
# Fit a LightGBM regressor with default settings; y_pred is overwritten again.
Lgbm_reg,y_pred=train_model(LGBMRegressor(),X_train,y_train,X_test)

In [None]:
# Score of the LightGBM model on the held-out split.
Lgbm_reg.score(X_test,y_test)