In [1]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score

#avoid warning messages
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset from a Google Drive share link.
# A ".../file/d/<file id>/view" share URL cannot be read directly, so it is
# rewritten into the direct-download form ".../uc?id=<file id>".
link = 'https://drive.google.com/file/d/1LnU6moeljyBUx0YKrk4LzhS7JEdpnL4e/view'
# The file id is the second-to-last path segment of the share link.
# It is stored as `file_id` (not `id`) so the built-in id() is not shadowed
# for the remainder of the kernel session.
file_id = link.split("/")[-2]
new_link = f'https://drive.google.com/uc?id={file_id}'
df = pd.read_csv(new_link)
df

Unnamed: 0,Soil_Type,Sunlight_Hours,Water_Frequency,Fertilizer_Type,Temperature,Humidity,Growth_Milestone
0,loam,5.192294,bi-weekly,chemical,31.719602,61.591861,0
1,sandy,4.033133,weekly,organic,28.919484,52.422276,1
2,loam,8.892769,bi-weekly,none,23.179059,44.660539,0
3,loam,8.241144,bi-weekly,none,18.465886,46.433227,0
4,sandy,8.374043,bi-weekly,organic,,63.625923,0
...,...,...,...,...,...,...,...
188,sandy,5.652000,daily,none,28.000000,70.200000,0
189,clay,7.528000,weekly,chemical,30.500000,60.100000,1
190,loam,4.934000,bi-weekly,none,24.500000,61.700000,0
191,sandy,8.273000,daily,organic,27.900000,69.500000,1


In [3]:
# Print each column's dtype and non-null count to spot columns with missing data.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Soil_Type         193 non-null    object 
 1   Sunlight_Hours    191 non-null    float64
 2   Water_Frequency   192 non-null    object 
 3   Fertilizer_Type   192 non-null    object 
 4   Temperature       183 non-null    float64
 5   Humidity          191 non-null    float64
 6   Growth_Milestone  193 non-null    int64  
dtypes: float64(3), int64(1), object(3)
memory usage: 10.7+ KB


In [4]:
# Summary statistics for every column; include='all' also reports the
# non-numeric columns (count / unique / top / freq).
df.describe(include='all')

Unnamed: 0,Soil_Type,Sunlight_Hours,Water_Frequency,Fertilizer_Type,Temperature,Humidity,Growth_Milestone
count,193,191.0,192,192,183.0,191.0,193.0
unique,3,,3,3,,,
top,clay,,daily,none,,,
freq,67,,73,74,,,
mean,,6.814887,,,25.017731,58.08635,0.497409
std,,1.603823,,,5.363576,12.696306,0.501294
min,,4.033133,,,15.2,30.567682,0.0
25%,,5.464666,,,20.599339,49.05649,0.0
50%,,6.832,,,25.912336,59.182806,0.0
75%,,8.252072,,,29.49667,69.15,1.0


In [5]:
# Number of distinct values in each column.
df.nunique()

Unnamed: 0,0
Soil_Type,3
Sunlight_Hours,188
Water_Frequency,3
Fertilizer_Type,3
Temperature,180
Humidity,189
Growth_Milestone,2


In [6]:
# Count the missing entries in each column.
df.isna().sum()

Unnamed: 0,0
Soil_Type,0
Sunlight_Hours,2
Water_Frequency,1
Fertilizer_Type,1
Temperature,10
Humidity,2
Growth_Milestone,0


In [7]:
# Partition the column names by dtype: object columns are treated as
# categorical, every numeric dtype as numerical.
# Note: the numeric group also contains the integer target Growth_Milestone.
cat_cols = list(df.select_dtypes(include='object').columns)
nums_cols = list(df.select_dtypes(include='number').columns)
print("Categorical", cat_cols, "Numerical", nums_cols, sep="\n")

Categorical
['Soil_Type', 'Water_Frequency', 'Fertilizer_Type']
Numerical
['Sunlight_Hours', 'Temperature', 'Humidity', 'Growth_Milestone']


In [8]:
# Keep only the first occurrence of each fully identical row.
is_repeat = df.duplicated()
df = df[~is_repeat]
df

Unnamed: 0,Soil_Type,Sunlight_Hours,Water_Frequency,Fertilizer_Type,Temperature,Humidity,Growth_Milestone
0,loam,5.192294,bi-weekly,chemical,31.719602,61.591861,0
1,sandy,4.033133,weekly,organic,28.919484,52.422276,1
2,loam,8.892769,bi-weekly,none,23.179059,44.660539,0
3,loam,8.241144,bi-weekly,none,18.465886,46.433227,0
4,sandy,8.374043,bi-weekly,organic,,63.625923,0
...,...,...,...,...,...,...,...
188,sandy,5.652000,daily,none,28.000000,70.200000,0
189,clay,7.528000,weekly,chemical,30.500000,60.100000,1
190,loam,4.934000,bi-weekly,none,24.500000,61.700000,0
191,sandy,8.273000,daily,organic,27.900000,69.500000,1


In [12]:
# Impute missing values: categorical features receive the column's most
# frequent value (mode), numerical features receive the column's median.
#
# The fill values are applied with a single DataFrame.fillna(mapping) call and
# the result is assigned back to `df`. The earlier pattern
# `df[col].fillna(..., inplace=True)` operates on a temporary Series; it is
# deprecated and leaves `df` unchanged when pandas Copy-on-Write is active.
#
# NOTE(review): the mode/median are computed on the full dataset, i.e. before
# the train/test split performed further down, so test rows influence the
# imputation values.
categorical_features = ['Soil_Type', 'Water_Frequency', 'Fertilizer_Type']
numerical_features = ['Sunlight_Hours', 'Temperature', 'Humidity']

fill_values = {col: df[col].mode()[0] for col in categorical_features}
fill_values.update({col: df[col].median() for col in numerical_features})

df = df.fillna(value=fill_values)
df

Unnamed: 0,Soil_Type,Sunlight_Hours,Water_Frequency,Fertilizer_Type,Temperature,Humidity,Growth_Milestone
0,loam,5.192294,bi-weekly,chemical,31.719602,61.591861,0
1,sandy,4.033133,weekly,organic,28.919484,52.422276,1
2,loam,8.892769,bi-weekly,none,23.179059,44.660539,0
3,loam,8.241144,bi-weekly,none,18.465886,46.433227,0
4,sandy,8.374043,bi-weekly,organic,15.200000,63.625923,0
...,...,...,...,...,...,...,...
188,sandy,5.652000,daily,none,28.000000,70.200000,0
189,clay,7.528000,weekly,chemical,30.500000,60.100000,1
190,loam,4.934000,bi-weekly,none,24.500000,61.700000,0
191,sandy,8.273000,daily,organic,27.900000,69.500000,1


In [13]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(193, 7)

In [14]:
# One-hot encode the categorical features: each listed column is replaced by
# one indicator column per category.
one_hot_features = ['Soil_Type', 'Water_Frequency', 'Fertilizer_Type']
df = pd.get_dummies(data=df, columns=one_hot_features)
df.head(5)

Unnamed: 0,Sunlight_Hours,Temperature,Humidity,Growth_Milestone,Soil_Type_clay,Soil_Type_loam,Soil_Type_sandy,Water_Frequency_bi-weekly,Water_Frequency_daily,Water_Frequency_weekly,Fertilizer_Type_chemical,Fertilizer_Type_none,Fertilizer_Type_organic
0,5.192294,31.719602,61.591861,0,False,True,False,True,False,False,True,False,False
1,4.033133,28.919484,52.422276,1,False,False,True,False,False,True,False,False,True
2,8.892769,23.179059,44.660539,0,False,True,False,True,False,False,False,True,False
3,8.241144,18.465886,46.433227,0,False,True,False,True,False,False,False,True,False
4,8.374043,15.2,63.625923,0,False,False,True,True,False,False,False,False,True


In [15]:
# Build the feature matrix and target, then hold out 20% of the rows for
# testing (other ratios work as well). random_state pins the shuffle so the
# split is reproducible.
target_col = 'Growth_Milestone'
X = df.drop(columns=[target_col])
# Recode the target labels from 0/1 to -1/1 (written back into df).
df[target_col] = df[target_col].replace({0: -1})
y = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [16]:
# Report the dimensions of each piece of the train/test split.
split_report = (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
)
for caption, part in split_report:
    print(caption, part.shape)

X_train shape: (154, 12)
X_test shape: (39, 12)
y_train shape: (154,)
y_test shape: (39,)


In [17]:
# Manual grid search over AdaBoost's ensemble size and learning rate: fit one
# classifier per combination and print its accuracy on both splits.
# NOTE(review): the score labelled "validation" is computed on X_test / y_test,
# so the held-out test split is what is being used to compare settings.
n_estimators = [5, 10, 15]
learning_rate = [0.1, 0.3, 0.4]

for i in n_estimators:
    for j in learning_rate:
        abc = AdaBoostClassifier(n_estimators=i, learning_rate=j, random_state=42)
        model = abc.fit(X_train, y_train)
        # After the loop, y_pred holds the predictions of the last combination.
        y_pred = model.predict(X_test)

        train_accuracy = model.score(X_train, y_train)
        test_accuracy = model.score(X_test, y_test)

        print(
            f"For n_estimators = {i} and learning_rate = {j}",
            "Accuracy score (training): {0:.3f}".format(train_accuracy),
            "Accuracy score (validation): {0:.3f}".format(test_accuracy),
            sep="\n",
        )

For n_estimators = 5 and learning_rate = 0.1
Accuracy score (training): 0.682
Accuracy score (validation): 0.590
For n_estimators = 5 and learning_rate = 0.3
Accuracy score (training): 0.649
Accuracy score (validation): 0.615
For n_estimators = 5 and learning_rate = 0.4
Accuracy score (training): 0.675
Accuracy score (validation): 0.590
For n_estimators = 10 and learning_rate = 0.1
Accuracy score (training): 0.682
Accuracy score (validation): 0.590
For n_estimators = 10 and learning_rate = 0.3
Accuracy score (training): 0.695
Accuracy score (validation): 0.590
For n_estimators = 10 and learning_rate = 0.4
Accuracy score (training): 0.708
Accuracy score (validation): 0.590
For n_estimators = 15 and learning_rate = 0.1
Accuracy score (training): 0.656
Accuracy score (validation): 0.590
For n_estimators = 15 and learning_rate = 0.3
Accuracy score (training): 0.701
Accuracy score (validation): 0.615
For n_estimators = 15 and learning_rate = 0.4
Accuracy score (training): 0.727
Accuracy sco