In [1]:
import os
import pandas as pd
import numpy as np
from ipynb.fs.full.Functions import *
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import statsmodels.api as sm
from statsmodels.formula.api import ols
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
# Load the palm FFB dataset from a path relative to the notebook's working directory,
# asking pandas to parse the 'Date' column as datetimes.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, so the
# notebook must be started from a directory that contains data/palm_ffb.csv.
palm_ffb = pd.read_csv('data/palm_ffb.csv',parse_dates=['Date'])
# Preview the first rows as a quick sanity check of the load.
palm_ffb.head()

FileNotFoundError: [Errno 2] File data/palm_ffb.csv does not exist: 'data/palm_ffb.csv'

In [None]:
# Inspect the frame's structure: column names, dtypes and non-null counts per column.
palm_ffb.info()

In [None]:
# Count the missing values in each column (isna is the pandas alias of isnull)
palm_ffb.isna().sum()

In [None]:
# Summary statistics of the dataset's columns.
palm_ffb.describe()

#### Using the "describe" function we can obtain general information about our dataset, such as the mean, std, min and max of each column, which can be useful for further analysis.

In [None]:
# Pairwise correlations between the numeric columns only.
# At this point the frame still holds the parsed datetime 'Date' column. Older pandas
# dropped non-numeric columns from corr() silently, but pandas >= 2.0 defaults to
# numeric_only=False and can raise on such columns. Selecting the numeric (and bool)
# columns explicitly gives the same table on every pandas version.
palm_ffb.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# The numeric (and bool) columns are selected explicitly so the datetime 'Date' column,
# which is still in the frame here, cannot make corr() raise on pandas >= 2.0
# (older versions silently ignored it).
plt.figure(figsize=(16,16))
sns.heatmap(palm_ffb.select_dtypes(include=['number', 'bool']).corr(), annot=True)

The heatmap above shows that "SoilMoisture" has high negative correlations with "Average_Temp" and "HA_Harvested", meaning that if one increases the other will most likely decrease. On the other hand, "SoilMoisture" and "Precipitation" have a high positive correlation, meaning that if "SoilMoisture" increases, "Precipitation" will most likely increase as well.

In [None]:
# Break Date down into three additional columns for further analysis.
# NOTE(review): "Month" is read from the parsed day field and "Day" from the parsed
# month field. This matches the notebook's stated assumption that the raw dates are
# year-day-month, so the parser's day/month are swapped - confirm against the raw CSV.
date_stamps = palm_ffb["Date"]
palm_ffb["Year"] = date_stamps.map(lambda stamp: stamp.year)
palm_ffb["Month"] = date_stamps.map(lambda stamp: stamp.day)
palm_ffb["Day"] = date_stamps.map(lambda stamp: stamp.month)

In [None]:
# Count plots for the derived date-part columns.
# count_plots is not defined in this notebook; it presumably arrives through the
# wildcard import of the project's Functions notebook. The meaning of the positional
# arguments 3 and 'y' cannot be seen from here - TODO confirm against its definition.
cols = ['Year','Month','Day']
count_plots(palm_ffb,cols,3,'y')

In [None]:
# Same count plots, restricted to the rows whose Year is 2018.
# `cols` is reused from the previous cell, so that cell must have been run first.
count_plots(palm_ffb[palm_ffb['Year']==2018],cols,3,'y')

Date format is assumed to be year-day-month for the plot above. 
The given data covers 2008 to 2018, with only data for January. The years 2008 to 2017 contain data for all months of the year, while 2018 contains data for only the first 10 months.

In [None]:
# Drop the raw Date column now that Year/Month/Day have been derived from it.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe to
# re-run: an in-place drop raises KeyError the second time because 'Date' is gone.
palm_ffb = palm_ffb.drop(columns=['Date'], errors='ignore')

From the yearly analysis we can see that for 2018 we only have data until October. Let's look into the working days for each year.

In [None]:
def count_plot_yearly2(df, title=None, figsize=(17,17), suptitle_y=0.95, years=None):
    """Draw one count plot of ``Working_days`` per year on a three-column grid.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``'Year'`` and ``'Working_days'``.
    title : str, optional
        Figure-level title; nothing is drawn when ``None``.
    figsize : tuple
        Size of the whole figure, forwarded to ``plt.subplots``.
    suptitle_y : float
        Vertical position of ``title`` in figure coordinates.
    years : iterable of int, optional
        Years to plot, one panel each. Defaults to 2008-2018, the years the
        previous implementation hard-coded (which yields the same 4x3 grid).

    Returns
    -------
    None
        The figure is left to the notebook's inline backend to display.
    """
    if years is None:
        years = range(2008, 2019)
    years = list(years)

    n_cols = 3
    n_rows = max(1, -(-len(years) // n_cols))  # ceiling division without importing math
    fig, axs = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    panels = axs.ravel()

    for ax, year in zip(panels, years):
        year_rows = df[df['Year'] == year]
        ax.set_title(str(year))
        if year_rows.empty:
            # Nothing to count for this year; keep the empty, titled panel.
            continue
        # Pass the column by NAME. Passing the Series df['Working_days'] makes seaborn
        # plot that full-length vector and ignore the year-filtered `data`, which made
        # every panel show the same whole-dataset distribution.
        sns.countplot(x='Working_days', data=year_rows, ax=ax)

    # Hide grid cells that have no year assigned to them.
    for ax in panels[len(years):]:
        ax.set_visible(False)

    if title is not None:
        fig.suptitle(title, y=suptitle_y)


In [None]:
# Plot the per-year Working_days count plots.
# NOTE(review): `cols` is reassigned here but is not passed to count_plot_yearly2,
# which takes only the frame; the assignment merely overwrites the earlier `cols` list.
cols = ['Working_days']
count_plot_yearly2(palm_ffb)

The pattern of working days is the same for every year. Therefore, it can be concluded that 25 days is the highest number.

In [None]:
# Box plots of every column of the frame, drawn on one shared axis
fig, ax = plt.subplots(figsize=(16, 4))
sns.boxplot(data=palm_ffb, ax=ax)

In [None]:
# Per-column box plots for every column of the frame (palm_ffb.columns is passed in
# place of the explicit list kept below for reference).
# box_plots is not defined in this notebook; it presumably arrives through the wildcard
# import of the project's Functions notebook - the meaning of the positional arguments
# 3 and 'y' is a TODO to confirm against its definition.
# cols = ['SoilMoisture','Average_Temp','Min_Temp','Max_Temp','Precipitation','Working_days','HA_Harvested','FFB_Yield']
box_plots(palm_ffb,palm_ffb.columns,3,'y',title=None,figsize=(16,20))

In [None]:
# FFB_Yield bar plot grouped by number of working days
sns.barplot(data=palm_ffb, x='Working_days', y='FFB_Yield')

Based on the graph above, a higher number of working days does not necessarily contribute to a higher FFB_Yield.

In [None]:
# HA_Harvested bar plot grouped by number of working days
sns.barplot(data=palm_ffb, x='Working_days', y='HA_Harvested')

The HA_Harvested amount is almost the same for different working days.

In [None]:
# Average_Temp bar plot grouped by number of working days
sns.barplot(data=palm_ffb, x='Working_days', y='Average_Temp')

The Average_Temp amount is almost the same for different working days.

In [None]:
# FFB_Yield bar plot grouped by year
sns.barplot(data=palm_ffb, x='Year', y='FFB_Yield')

The figure above shows the yearly FFB_Yield; 2016 has the lowest FFB_Yield.

In [None]:
# FFB_Yield bar plot grouped by the derived Month column
sns.barplot(data=palm_ffb, x='Month', y='FFB_Yield')

The figure above shows the monthly FFB_Yield; the second half of the year has a higher FFB_Yield than the first half.

In [None]:
# Re-inspect columns, dtypes and non-null counts before feature selection.
palm_ffb.info()

## Feature Selections

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_regression

# Separate the target column (FFB_Yield) from the predictor columns

y = palm_ffb['FFB_Yield']
x = palm_ffb.drop('FFB_Yield', axis=1)

In [None]:
# Break the data into train, test and validation sets.
# train_test_split is imported explicitly here: no cell of this notebook imports it,
# so it could otherwise only be reaching the namespace through the wildcard import
# of the Functions notebook, which is fragile and hides the dependency.
from sklearn.model_selection import train_test_split

# 20% of all rows are held out as the test set ...
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=666, test_size=0.20)
# ... and 10% of the remaining training rows become the validation set.
xtrain, xval, ytrain, yval = train_test_split(xtrain, ytrain, random_state=666, test_size=0.10)

In [None]:
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
def randForest(xtrain, ytrain):
    """Fit a random-forest regressor on the training data and return it.

    The hyper-parameters are fixed: 50 trees, unlimited depth, at least 8 samples
    per leaf, at least 2 samples to split, and random_state 666.
    """
    model = RandomForestRegressor(
        n_estimators=50,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=8,
        random_state=666,
    )
    # fit() trains the estimator in place and returns the estimator itself
    model.fit(xtrain, ytrain)
    return model

randforest = randForest(xtrain, ytrain)

In [None]:
def randforestplot(randforest, xval, yval):
    """Plot the fitted forest's feature importances and print validation errors.

    Parameters
    ----------
    randforest : fitted estimator
        Must expose ``predict`` and ``feature_importances_``.
    xval : DataFrame or array-like
        Validation features. When it has ``columns`` they label the bars;
        otherwise the features are labelled by position.
    yval : array-like
        True target values for ``xval``.

    Prints the mean absolute error, mean squared error and root mean squared
    error of the predictions on ``xval``; returns ``None``.
    """
    from sklearn import metrics  # kept local, as in the original cell

    predictions = randforest.predict(xval)

    # Label the importances from the function's own argument instead of the global
    # `xtrain`, so the function no longer depends on notebook-level state.
    feature_names = getattr(xval, 'columns', None)
    if feature_names is None:
        feature_names = range(len(randforest.feature_importances_))

    impact_feature = pd.DataFrame({
        'impact': randforest.feature_importances_,
        'Features': list(feature_names),
    })
    # Ascending order puts the most important feature at the top of the barh plot.
    impact_feature = impact_feature.sort_values(by='impact').set_index('Features', drop=True)
    impact_feature.plot.barh(title='Feature Impact', figsize=(15,8))
    plt.xlabel('Feature Impact Result')
    plt.show()

    mse = metrics.mean_squared_error(yval, predictions)
    print('Mean Absolute Error:', metrics.mean_absolute_error(yval, predictions))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
randforestplot(randforest, xval, yval)

Figure above shows the external factors affecting FFB_Yield and their impact level.