In [None]:
import pandas as pd # for data wrangling purpose
import numpy as np # Basic computation library
import seaborn as sns # For Visualization 
import matplotlib.pyplot as plt # ploting package
%matplotlib inline
import warnings # Filtering warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the avocado dataset from a CSV file in the working directory.
AV=pd.read_csv('avocado.csv')

In [None]:
# Report the dataset dimensions, then preview the first rows.
n_rows, n_cols = AV.shape
print('No of Rows:', n_rows)
print('No of Columns:', n_cols)
AV.head()

In [None]:
# Show the column labels of the loaded frame.
AV.columns

In [None]:
# Remove the 'Unnamed: 0' column.
# Fixes the unbalanced parenthesis of the original call; errors='ignore' and
# reassignment make the cell safe to re-run after the column is already gone.
AV = AV.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the first rows again to confirm the column was removed.
AV.head()

Statistical Analysis

In [None]:
AV.duplicated().sum()  # Number of rows that repeat an earlier row in every column.

Missing value check

In [None]:
# Null map: the heatmap colours each cell by whether its value is missing.
fig, ax = plt.subplots(figsize=(10,6))
sns.heatmap(AV.isnull(), ax=ax)

In [None]:
# Per-column null counts (largest first) next to their percentage of all rows.
missing_values = AV.isnull().sum().sort_values(ascending=False)
percentage_missing_values = (missing_values / len(AV)) * 100
missing_summary = pd.concat(
    [missing_values, percentage_missing_values],
    axis=1,
    keys=['Missing Values', '% Missing data'],
)
print(missing_summary)

In [None]:
# Print column dtypes, non-null counts and memory usage.
AV.info()

In [None]:
# Parse the Date column into pandas datetimes.
AV['Date'] = pd.to_datetime(AV['Date'])

In [None]:
# Confirm the dtype of the Date column after conversion.
AV['Date'].dtype

Statistical Matrix

In [None]:
# Visualizing the summary statistics of the numeric columns using a heatmap.
# include=[np.number] keeps the datetime `Date` column out of the summary:
# recent pandas versions describe datetime columns by default, and the
# heatmap can only render numeric cells.
plt.figure(figsize=(20,8))
sns.heatmap(AV.describe(include=[np.number]),linewidths = 0.1,fmt='0.1f',annot = True,cmap='PiYG')

In [None]:
# Summary statistics transposed (one row per column), rounded to 3 decimals.
AV.describe().T.round(3)

In [None]:
# Print the lowest, highest and mean average price, with bold ANSI labels.
price = AV['AveragePrice']
price_summary = (
    ('Minimum Price of Avocado :', price.min()),
    ('Maximum Price of avacado :', price.max()),
    ('Average Price of avacado :', price.mean()),
)
for label, value in price_summary:
    print("\033[1m"+label+"\033[0m", value, 'USD')

In [None]:
# Histogram plus density curve of the average price.
fig, ax = plt.subplots(figsize=(12,7))
ax.set_title('Distribution Price')
sns.distplot(AV["AveragePrice"], color='b', ax=ax)

In [None]:
# Number of rows per avocado type.
AV['type'].value_counts()

In [None]:
# Mean average price for each avocado type.
AV.groupby('type')['AveragePrice'].mean()

In [None]:
# Relative price premium of organic over conventional avocados.
# The group means are read from the data instead of being copied in by hand,
# so the figure stays correct if the dataset or the cleaning steps change.
mean_price_by_type = AV.groupby('type')['AveragePrice'].mean()
conventional_mean = mean_price_by_type['conventional']
organic_mean = mean_price_by_type['organic']
print("\033[1m"+'Percentage difference in price of avacado :'+"\033[0m",((organic_mean-conventional_mean)/conventional_mean)*100,"%")

In [None]:
# Horizontal box plots comparing the price spread of each avocado type.
fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(data=AV, x="AveragePrice", y="type", palette='hsv', ax=ax)

In [None]:
# Pie chart of the mean Total Volume per avocado type.
labels = ('Conventional', 'Organic')
mean_volume_by_type = AV.groupby('type')['Total Volume'].mean()
fig, ax = plt.subplots()
ax.pie(
    mean_volume_by_type,
    labels=labels,
    explode=[0.3, 0.2],
    autopct='%2.2f%%',
    radius=2,
    shadow=True,
)
plt.show()

In [None]:
# Mean Total Volume for each avocado type (the numbers behind the pie chart).
AV.groupby('type')['Total Volume'].mean()

In [None]:
# Mean Total Volume per year, split by avocado type, with 68% error bars.
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=AV, x='year', y='Total Volume', hue='type', palette='coolwarm', ci=68, ax=ax)
plt.show()

In [None]:
# Frequency table: how often each average price occurs per (type, year) pair.
# Fixes the misspelled frame name (`Av`) and the missing closing parenthesis.
pd.crosstab([AV['type'], AV['year']], AV['AveragePrice'])

In [None]:
# Lowest, highest, mean and median average price for each (year, type) pair.
price_by_year_type = AV.groupby(['year', 'type'])['AveragePrice']
price_by_year_type.agg(['min', 'max', 'mean', 'median'])

In [None]:
# Totals of volume and bag counts for each (year, type) pair.
# The columns are selected with a list: tuple-style selection on a groupby
# is rejected by current pandas releases.
AV.groupby(['year','type'])[['Total Volume','Total Bags','Small Bags','Large Bags','XLarge Bags']].agg(['sum'])

In [None]:
# Order the rows chronologically for the time-based plots that follow.
AV = AV.sort_values(by='Date', ascending=True)

In [None]:
# Average price of Conventional Avocados over time

mask = AV['type'].eq('conventional')
conventional = AV.loc[mask, ['Date', 'AveragePrice']]
dates = conventional['Date'].tolist()
avgPrices = conventional['AveragePrice'].tolist()

plt.rc('figure', titlesize=20)
fig = plt.figure(figsize=(27, 12))
fig.suptitle('Average Price of Conventional Avocados Over Time', fontsize=25)
ax = fig.add_subplot(111)

# Each point is coloured by its own price.
ax.scatter(dates, avgPrices, c=avgPrices, cmap='plasma')
ax.set_xlabel('Date', fontsize=15)
ax.set_ylabel('Average Price (USD)', fontsize=15)
ax.set_xlim()
ax.tick_params(labelrotation=90)

plt.show()

In [None]:
# Average price of Organic Avocados over time
mask = AV['type'].eq('organic')
organic = AV.loc[mask, ['Date', 'AveragePrice']]
dates = organic['Date'].tolist()
avgPrices = organic['AveragePrice'].tolist()

plt.rc('figure', titlesize=20)
fig = plt.figure(figsize=(27, 12))
fig.suptitle('Average Price of Organic Avocados Over Time', fontsize=25)
ax = fig.add_subplot(111)
fig.subplots_adjust(top=0.93)

# Each point is coloured by its own price.
ax.scatter(dates, avgPrices, c=avgPrices, cmap='plasma')
ax.set_xlabel('Date', fontsize=15)
ax.set_ylabel('Average Price (USD)', fontsize=15)
plt.xlim()
plt.show()

In [None]:
# Per-region sums of Total Volume and AveragePrice.
# The columns are selected with a list: tuple-style selection on a groupby
# is rejected by current pandas releases.
# NOTE(review): a sum of average prices is hard to interpret - confirm that a
# mean was not intended for AveragePrice.
AV.groupby(['region'])[['Total Volume','AveragePrice']].agg(['sum'])

EDA

In [None]:
# Preview the frame before starting the exploratory plots.
AV.head()

In [None]:
# Independent copy of AV, so columns can be dropped without touching AV.
AV2=AV.copy()

In [None]:
# Creating a dataframe that holds only the numeric feature columns.
AV2 = AV2.drop(columns=['Date', 'type', 'region', 'year'])

In [None]:
# Shaded density curve for each column of AV2 on a 3x3 grid
# (at most the first nine columns are drawn).
plt.figure(figsize=(20,25), facecolor='white')
for plotnumber, column in enumerate(AV2.columns[:9], start=1):
    ax = plt.subplot(3, 3, plotnumber)
    sns.distplot(AV2[column], color='r', hist=False, kde_kws={"shade": True})
    plt.xlabel(column, fontsize=20)
plt.show()

In [None]:
# Violin plot for each column of AV2 on a 3x3 grid
# (at most the first nine columns are drawn).
sns.set_palette('gist_rainbow_r')
plt.figure(figsize=(20,20), facecolor='white')
for plotnumber, column in enumerate(AV2.columns[:9], start=1):
    ax = plt.subplot(3, 3, plotnumber)
    # The series is passed as x= so the orientation does not depend on how a
    # given seaborn version interprets a lone positional argument, and it is
    # read from AV2 - the same frame whose columns drive the loop.
    sns.violinplot(x=AV2[column])
    plt.xlabel(column, fontsize=20)
plt.show()

In [None]:
# Reload the CSV, replacing the AV frame that earlier cells modified.
# NOTE(review): the reloaded frame has not had 'Unnamed: 0' dropped or Date
# parsed, so every frame derived from it below (AV3, AV4) carries
# 'Unnamed: 0' along - confirm this column is meant to reach the model.
AV=pd.read_csv('avocado.csv')

In [None]:
# Independent copy of the reloaded dataframe for the date feature engineering.
AV3=AV.copy()

In [None]:
# Split the Date string on '-' into three new columns.
# Assumes Date is formatted as year-month-day - TODO confirm against the CSV.
# expand=True returns one column per part; unpacking the `.str` accessor
# itself (the previous approach) no longer works in current pandas.
date_parts = AV3['Date'].str.split('-', expand=True)
AV3['Year'], AV3['Month'], AV3['Day'] = date_parts[0], date_parts[1], date_parts[2]
AV3 = AV3.drop(columns=['Date','year'])
AV3

In [None]:
# Converting the split date parts from strings to integers.
AV3['Year']=AV3['Year'].astype(int)
AV3['Month']=AV3['Month'].astype(int)
# Fixed NameError: the frame is `AV3`, not `Av3`.
# The integer day is stored under 'Date' (not 'Day') as originally written;
# the cells that follow drop both 'Day' and 'Date', so the day of month does
# not reach the model. NOTE(review): confirm that dropping the day is intended.
AV3['Date']=AV3['Day'].astype(int)

In [None]:
# Remove the string 'Day' column.
AV3 = AV3.drop(columns=['Day'])

In [None]:
# Remove the 'Date' column.
AV3 = AV3.drop(columns=['Date'])

In [None]:
# Preview AV3 after the date columns were reworked.
AV3.head()

In [None]:
# Distinct Year values and the number of rows recorded for each.
AV3['Year'].value_counts()

In [None]:
# Number of rows per year.
# The column is named through x= because current seaborn releases no longer
# accept it as a positional argument.
sns.countplot(x='Year', data=AV3)

In [None]:
# Number of rows per month.
fig, ax = plt.subplots(figsize=(15,7))
sns.countplot(data=AV3, x='Month', palette="spring", ax=ax)

In [None]:
# Row count of the most frequent month.
AV3['Month'].value_counts().max()

In [None]:
# Row count of the least frequent month.
AV3['Month'].value_counts().min()

In [None]:
# Mean average price per month, split by avocado type.
fig, ax = plt.subplots(figsize=(15,7))
sns.barplot(data=AV3, x="Month", y="AveragePrice", hue='type', ax=ax)
plt.show()

In [None]:
# Mean average price per (month, type), laid out as a single wide row.
monthly_price = AV3.groupby(['Month', 'type'])['AveragePrice']
monthly_price.agg(['mean']).T

In [None]:
# Distribution of the average price for each month, drawn as violins.
# catplot is a figure-level function that creates its own figure, so the size
# is given through height/aspect; a preceding plt.figure() call only produced
# an extra empty figure and did not size the plot.
sns.catplot(x = "Month", y = "AveragePrice", kind ='violin', data=AV3, linewidth=2, height=10, aspect=1)
plt.show()

In [None]:
# Mean average price per region.
# Fixed NameError: the columns are taken from AV3 by name instead of from an
# undefined `df` frame.
plt.figure(figsize=(15,6))
sns.barplot(x='region', y='AveragePrice', data=AV3)
plt.title('Bar Plot for regions')
plt.xticks(rotation=90)  # region names are long; rotate so they do not overlap
plt.show()

In [None]:
# Total sales volume per region, largest first, as horizontal bars.
region = (
    AV.groupby('region')['Total Volume']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
plt.figure(figsize=(15,13))
sns.barplot(x=region["Total Volume"], y=region["region"], data=AV, palette="Set1")
plt.show()

In [None]:
# Per-region means of volume and bag counts.
# The columns are selected with a list: tuple-style selection on a groupby
# is rejected by current pandas releases.
AV.groupby(['region'])[['Total Volume','Total Bags','Small Bags','Large Bags','XLarge Bags']].agg(['mean'])

In [None]:
# Yearly mean of the '4046', '4225' and '4770' columns, one bar chart each.
plt.figure(figsize=(20,25),facecolor='white')
y = AV[['4046','4225','4770']]
X = AV['year']
for plotnumber, col in enumerate(y.columns[:9], start=1):
    plt.subplot(3,3,plotnumber)
    # x=/y= keywords: current seaborn releases reject positional data vectors.
    sns.barplot(x=X, y=y[col])
    plt.xlabel('Year',fontsize=20)
    plt.ylabel(col,fontsize=20)
plt.tight_layout()

In [None]:
# Yearly mean of the small, large and extra-large bag counts, one bar chart each.
plt.figure(figsize=(20,25),facecolor='white')
y = AV[['Small Bags','Large Bags','XLarge Bags']]
X = AV['year']
for plotnumber, col in enumerate(y.columns[:9], start=1):
    plt.subplot(3,3,plotnumber)
    # x=/y= keywords: current seaborn releases reject positional data vectors.
    sns.barplot(x=X, y=y[col])
    plt.xlabel('Year',fontsize=20)
    plt.ylabel(col,fontsize=20)
plt.tight_layout()

In [None]:
# Point plot of the average price per region and year, conventional avocados only.
# factorplot was renamed to catplot (whose default kind is no longer 'point')
# and its `size` argument to `height`; kind='point' keeps the original chart.
mask = AV['type']=='conventional'
g = sns.catplot(x='AveragePrice', y='region', data=AV[mask],
                kind='point',
                hue='year',
                height=13,
                aspect=0.8,
                palette='magma',
                join=False,  # markers only, no connecting lines
               )

In [None]:
# Point plot of the average price per region and year, organic avocados only.
# factorplot was renamed to catplot (whose default kind is no longer 'point')
# and its `size` argument to `height`; kind='point' keeps the original chart.
mask = AV['type']=='organic'
g = sns.catplot(x='AveragePrice', y='region', data=AV[mask],
                kind='point',
                hue='year',
                height=13,
                aspect=0.8,
                palette='magma',
                join=False,  # markers only, no connecting lines
               )

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) for AV2.
sns.pairplot(AV2)

Encoding categorical data

In [None]:
# Label-encode the two categorical feature columns, region and type.
from sklearn.preprocessing import LabelEncoder
# One encoder per column: re-fitting a single encoder on the second column
# would discard the region mapping needed to decode the labels later.
region_encoder = LabelEncoder()
AV3['region']=region_encoder.fit_transform(AV3['region'])
# `le` keeps its previous end state: the encoder fitted on the type column.
le = LabelEncoder()
AV3['type']=le.fit_transform(AV3['type'])
AV3.head()

Outliers Detection and Removal

In [None]:
# Box plot for each column of AV2 on a 3x3 grid to expose outliers
# (at most the first nine columns are drawn).
plt.figure(figsize=(18,15), facecolor='white')
for plotnumber, column in enumerate(AV2.columns[:9], start=1):
    ax = plt.subplot(3,3,plotnumber)
    # The series is passed as x= so the orientation does not depend on how a
    # given seaborn version interprets a lone positional argument.
    sns.boxplot(x=AV2[column], palette='hsv')
    plt.xlabel(column,fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
# Remove outliers: keep only the rows whose absolute z-score is below the
# threshold in every column of AV3.
from scipy.stats import zscore
z = np.abs(zscore(AV3))
threshold = 3
# `threshold` is now actually used (the comparison previously hard-coded 3),
# and .copy() makes AV4 an independent frame so the column drops and
# assignments performed on it later do not act on a slice of AV3.
AV4 = AV3[(z < threshold).all(axis=1)].copy()

In [None]:
# Report the (rows, columns) shape of the outlier-filtered frame AV4.
print("\033[1m"+'Shape of dataset after removing outliers :'+"\033[0m",AV4.shape)

In [None]:
# Share of rows removed by the outlier filter, computed from the frames
# themselves instead of hand-copied row counts.
rows_before = len(AV3)
rows_after = len(AV4)
print("\033[1m"+'Percentage Data Loss :'+"\033[0m",((rows_before-rows_after)/rows_before)*100,'%')

Correlation

In [None]:
# Pairwise correlation matrix of the columns of AV4.
AV4.corr()

In [None]:
# Annotated heatmap of the correlation matrix, colour scale fixed to [-1, 1].
plt.figure(figsize=(25,18))
corr_matrix = AV4.corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='0.3f',
    annot_kws={'size':10},
    vmin=-1,
    vmax=1,
    square=True,
    cmap="gist_stern",
)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Bar chart of each feature's correlation with the target, AveragePrice.
plt.figure(figsize=(18,6))
price_corr = AV4.corr()['AveragePrice'].drop(labels=['AveragePrice'])
price_corr.plot.bar(color='c')
plt.title('Correlation of features with Target Variable Average Price',fontsize = 18)
plt.xlabel('Features',fontsize=15)
plt.ylabel('AveragePrice',fontsize=15)
plt.show()

In [None]:
# Preview the outlier-filtered frame before dropping columns.
AV4.head()

In [None]:
# Drop the columns excluded from modelling.
# Fixed KeyError: the column is named 'Total Bags' (with a space), as used
# everywhere else in the notebook, not 'TotalBags'.
AV4 = AV4.drop(columns=['4046','4225','4770','region','Total Bags'])

In [None]:
# Variance inflation factor of every column of AV4, to spot multicollinearity.
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_values = [
    variance_inflation_factor(AV4.values, position)
    for position in range(AV4.shape[1])
]
vif = pd.DataFrame({'VIF': vif_values, 'Features': AV4.columns})
vif

In [None]:
# Remove the Year column from the modelling frame.
AV4 = AV4.drop(columns=['Year'])

In [None]:
# Feature-to-target correlations again, after the column drops above.
plt.figure(figsize=(14,5))
price_corr = AV4.corr()['AveragePrice'].drop(labels=['AveragePrice'])
price_corr.plot.bar(color='c')
plt.title('Correlation of features with Target Variable Average Price',fontsize = 18)
plt.xlabel('Features',fontsize=15)
plt.ylabel('AveragePrice',fontsize=15)
plt.show()

In [None]:
# Skewness of features

In [None]:
# Skewness of every column of AV4.
AV4.skew()

In [None]:
# Apply log1p to every column of AV4 whose skewness exceeds 0.55.
# Fixed NameError: the loop iterates AV4's columns (`df4` was never defined).
# Skewness is computed once up front: transforming one column does not change
# the skewness of any other column, so the selection is unchanged.
# NOTE(review): the loop covers all columns of AV4, which at this point still
# includes the target AveragePrice - confirm the target should be transformed
# when it crosses the cutoff.
skewness = AV4.skew()
for col in AV4.columns:
    if skewness.loc[col]>0.55:
        AV4[col]=np.log1p(AV4[col])

Standard Scaling

In [None]:
# Separate the target (AveragePrice) from the predictor columns.
Y = AV4['AveragePrice']
X = AV4.drop(columns='AveragePrice')

In [None]:
# Standardise every predictor to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling statistics - confirm this is acceptable.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scale = scaler.fit(X).transform(X)

Machine Learning Model Building

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import  GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import  Ridge
from sklearn.linear_model import  Lasso
from xgboost import XGBRegressor

In [None]:
# Hold out 33% of the rows for testing and report the size of each part.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, test_size=.33, random_state=42)
split_report = (
    ('Training feature matrix size:', X_train),
    ('Training target vector size:', Y_train),
    ('Test feature matrix size:', X_test),
    ('Test target vector size:', Y_test),
)
for label, part in split_report:
    print(label, part.shape)

In [None]:
# Try random_state values 1..999 for the split and remember the one whose
# linear regression reaches the highest test R2.
# NOTE(review): picking the split by its test score makes the reported score
# optimistic - treat the result as a best case, not an estimate.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Start from -inf rather than 0: R2 can be negative, and a 0 start would
# report "score 0 on random_state 0" if no split ever scored above zero.
maxR2_score=float('-inf')
maxRS=0
for i in range(1,1000):
    X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, random_state=i, test_size=.33)
    lin_reg=LinearRegression()
    lin_reg.fit(X_train,Y_train)
    y_pred=lin_reg.predict(X_test)
    R2=r2_score(Y_test,y_pred)
    if R2>maxR2_score:
        maxR2_score=R2
        maxRS=i
print('Best R2 Score is', maxR2_score ,'on Random_state', maxRS)

Linear Regression : Base model

In [None]:
# Baseline model: linear regression on the split made with random_state=557.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, random_state=557, test_size=.33)
lin_reg=LinearRegression()
lin_reg.fit(X_train,Y_train)
# The training score was computed but thrown away; print it so it is visible.
print('Training R2 Score:', lin_reg.score(X_train,Y_train))
y_pred=lin_reg.predict(X_test)
# The target is AveragePrice, so the labels say "Prices" (they said "Wins").
print('\033[1m'+'Predicted Prices:'+'\033[0m\n',y_pred)
print('\n')
print('\033[1m'+'Actual Prices:'+'\033[0m\n',Y_test)

Linear Regression Evaluation Metrics

In [None]:
# Test-set error metrics and R2 for the predictions held in y_pred.
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
mae = mean_absolute_error(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
print('\033[1m'+' Error :'+'\033[0m')
print('Mean absolute error :', mae)
print('Mean squared error :', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print('\n')
print('\033[1m'+' R2 Score :'+'\033[0m')
print(r2_score(Y_test,y_pred,multioutput='variance_weighted'))

In [None]:
# 5-fold cross-validation of the linear regression on the full scaled data.
from sklearn.model_selection import cross_val_score
score = cross_val_score(estimator=lin_reg, X=X_scale, y=Y, cv=5)
mean_cv_score = score.mean()
print('\033[1m'+'Cross Validation Score :',lin_reg,":"+'\033[0m\n')
print("Mean CV Score :",mean_cv_score)

Applying Other Regression Models, Evaluation & Cross-Validation

In [None]:
# Fit several regressors on the same train/test split and print the test-set
# errors and R2 of each.
# Every estimator that draws random numbers gets a fixed random_state so the
# printed scores can be reproduced from run to run.
rf = RandomForestRegressor(n_estimators = 70 ,max_depth=25, random_state=42)
dtc = DecisionTreeRegressor(random_state=42)
adb=AdaBoostRegressor(learning_rate=0.1, random_state=42)
gradb=GradientBoostingRegressor(max_depth=25,learning_rate=0.1, random_state=42)
rd=Ridge(alpha=0.01)
xgb=XGBRegressor(random_state=42)
model = [rf,rd,dtc,adb,gradb,xgb]

for m in model:
    m.fit(X_train,Y_train)
    # The training-score call whose result was discarded has been removed.
    y_pred = m.predict(X_test)
    mse = mean_squared_error(Y_test,y_pred)
    print('\n')
    print('\033[1m'+' Error of ', m, ':' +'\033[0m')
    print('Mean absolute error :', mean_absolute_error(Y_test,y_pred))
    print('Mean squared error :', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print('\n')

    print('\033[1m'+' R2 Score :'+'\033[0m')
    print(r2_score(Y_test,y_pred))
    print('==============================================================================================================')

In [None]:
# 5-fold cross-validation of every candidate regressor on the full scaled data.
model = [rf, rd, dtc, adb, gradb, xgb]

for m in model:
    score = cross_val_score(estimator=m, X=X_scale, y=Y, cv=5)
    mean_cv_score = score.mean()
    print('\n')
    print('\033[1m'+'Cross Validation Score :',m,":"+'\033[0m\n')
    print("Mean CV Score :",mean_cv_score)
    print('==============================================================================================================')

Hyper Parameter Tuning : GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Search space for the random-forest grid search.
# NOTE(review): the 'mse'/'mae' criterion names and max_features="auto" are
# spellings from older scikit-learn releases - confirm they are accepted by
# the installed version before running the search.
parameter = dict(
    n_estimators=[30, 60, 80],
    max_depth=[10, 20, 40],
    min_samples_leaf=[5, 10],
    criterion=['mse', 'mae'],
    max_features=["auto", "sqrt", "log2"],
)

In [None]:
# 5-fold grid search over `parameter` for a random forest, run on all
# available cores (n_jobs=-1) with verbose progress output.
GCV = GridSearchCV(RandomForestRegressor(),parameter,cv=5,n_jobs = -1,verbose = 3)

In [None]:
# Run the grid search on the training split.
GCV.fit(X_train,Y_train)

Final Model

In [None]:
# Final model: a random forest with fixed hyperparameters, evaluated on the
# held-out test split.
# NOTE(review): the hyperparameters are hard-coded; confirm they match
# GCV.best_params_ from the grid search.
from sklearn.ensemble import RandomForestRegressor
# criterion='mse' and max_features='auto' are no longer passed: both were the
# regressor's defaults (squared error, all features), and those spellings are
# rejected by current scikit-learn releases. Leaving them out selects the same
# behaviour on old and new versions alike.
Final_mod =  RandomForestRegressor(n_estimators=60, max_depth= 20,
             min_samples_leaf = 5, min_samples_split = 10)
Final_mod.fit(X_train,Y_train)
y_pred=Final_mod.predict(X_test)
mse = mean_squared_error(Y_test,y_pred)
print('\n')
print('\033[1m'+' Error in Final Model :' +'\033[0m')
print('Mean absolute error :', mean_absolute_error(Y_test,y_pred))
print('Mean squared error :', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print('\n')
print('\033[1m'+' R2 Score of Final Model :'+'\033[0m')
print(r2_score(Y_test,y_pred))
print('\n')

In [None]:
# Plot the final model's predictions against the true prices
# (true prices are rounded to 2 decimals to form the x positions).
plt.figure(figsize=(12,7))
y_pred=Final_mod.predict(X_test)
# x=/y= keywords: current seaborn releases reject positional data vectors.
sns.swarmplot(x=Y_test.round(2), y=y_pred)
print('\033[1m'+' True Values Vs Predicted Value plot :' +'\033[0m')
plt.xlabel('True Values' , fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.tight_layout()
Saving model

In [None]:
# Serialise the fitted final model to 'Avacado_Final.pkl' in the working directory.
import joblib
joblib.dump(Final_mod,'Avacado_Final.pkl')