In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import accuracy_score,roc_auc_score,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBRegressor
import warnings

In [2]:
df = pd.read_csv('raw.csv')

In [3]:
X = df.drop(columns=['label'],axis=1)

In [4]:
y = df['label']

In [5]:
X.shape

(2200, 7)

In [6]:
y.shape

(2200,)

In [7]:
num_col= X.select_dtypes(exclude="object").columns
cat_col = X.select_dtypes(include="object").columns

In [32]:
from sklearn.impute import SimpleImputer # for handaling missing values
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder # for scaling values
# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import precision_score,f1_score

In [9]:
num_pipeline = Pipeline(
    steps =[
        ("imputer",SimpleImputer(strategy="median")),
        ("Scaling",StandardScaler()) # for scaling values
    ]
)

In [10]:
preprocessor = ColumnTransformer([
    
    ("Numeric_Pipeline",num_pipeline,num_col),
     
])


In [11]:
preprocessor

In [12]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=40)

In [13]:
preprocessor.fit_transform(X_train)

array([[ 1.1695014 , -0.03253049, -0.19795827, ...,  0.48681131,
         0.76222039,  3.30907872],
       [ 0.67798847,  0.05886077, -0.59312417, ..., -0.39279231,
        -1.01314958, -0.78595148],
       [ 0.26839437,  0.02839701, -0.15844168, ...,  0.80357829,
         0.70869594,  0.82800721],
       ...,
       [-1.34267579, -1.28154435, -0.35602463, ...,  1.11794771,
        -1.18639993,  0.46430146],
       [ 1.85215825, -1.1901531 , -0.05965021, ...,  0.58988395,
         0.07537497, -1.01662534],
       [ 0.45953828, -0.39809553, -0.15844168, ...,  0.75229016,
         1.1035587 ,  0.84807665]])

In [14]:
preprocessor.transform(X_test)

array([[-4.41568754e-01,  1.80715776e-01, -6.32640755e-01, ...,
        -3.07676574e-01,  1.08986748e+00, -7.34173179e-01],
       [ 5.41457105e-01, -2.06674035e-03, -3.98919164e-02, ...,
         9.00966687e-01,  5.68476911e-01,  3.15865106e-01],
       [ 1.82485198e+00, -6.11341795e-01, -4.54816104e-01, ...,
        -6.44169396e-01,  9.47815383e-01,  3.45288732e-01],
       ...,
       [-8.23856588e-01,  4.54889550e-01, -6.52399050e-01, ...,
        -2.17321701e+00, -7.05483429e-01,  4.22573704e-02],
       [ 2.34367118e+00, -5.50414289e-01, -5.53607577e-01, ...,
         6.19433474e-01,  5.76325930e-01, -6.01566334e-01],
       [-7.41937767e-01,  6.37672067e-01, -6.12882461e-01, ...,
        -3.18821133e-01,  9.28904980e-01, -1.01181260e+00]])

In [15]:
preprocessor.get_feature_names_out()

array(['Numeric_Pipeline__N', 'Numeric_Pipeline__P',
       'Numeric_Pipeline__K', 'Numeric_Pipeline__temperature',
       'Numeric_Pipeline__humidity', 'Numeric_Pipeline__ph',
       'Numeric_Pipeline__rainfall'], dtype=object)

In [16]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test =  pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [17]:
X_test

Unnamed: 0,Numeric_Pipeline__N,Numeric_Pipeline__P,Numeric_Pipeline__K,Numeric_Pipeline__temperature,Numeric_Pipeline__humidity,Numeric_Pipeline__ph,Numeric_Pipeline__rainfall
0,-0.441569,0.180716,-0.632641,1.147252,-0.307677,1.089867,-0.734173
1,0.541457,-0.002067,-0.039892,2.802537,0.900967,0.568477,0.315865
2,1.824852,-0.611342,-0.454816,-0.141298,-0.644169,0.947815,0.345289
3,-0.851163,2.008541,2.904094,-1.346886,0.451143,-0.708843,-0.561240
4,-0.987694,-1.098762,-0.356025,0.840903,-0.852439,-1.666083,-0.165726
...,...,...,...,...,...,...,...
721,1.551789,-1.312008,0.118174,0.600355,0.910590,-0.498744,-1.347670
722,-0.086587,0.759527,0.592374,-1.595029,-2.533594,0.168077,-0.711767
723,-0.823857,0.454890,-0.652399,-0.417210,-2.173217,-0.705483,0.042257
724,2.343671,-0.550414,-0.553608,-0.485731,0.619433,0.576326,-0.601566


In [18]:
X_train

Unnamed: 0,Numeric_Pipeline__N,Numeric_Pipeline__P,Numeric_Pipeline__K,Numeric_Pipeline__temperature,Numeric_Pipeline__humidity,Numeric_Pipeline__ph,Numeric_Pipeline__rainfall
0,1.169501,-0.032530,-0.197958,0.272117,0.486811,0.762220,3.309079
1,0.677988,0.058861,-0.593124,-1.216586,-0.392792,-1.013150,-0.785951
2,0.268394,0.028397,-0.158442,-0.110357,0.803578,0.708696,0.828007
3,1.224114,0.637672,-0.059650,0.678000,0.533834,-0.840272,0.070936
4,-0.823857,-0.763661,-0.415300,-0.089803,0.956684,-0.559483,0.939697
...,...,...,...,...,...,...,...
1469,1.579096,-0.915979,-0.336266,-0.281667,-0.670592,-0.061830,0.752823
1470,-1.015001,0.089325,-0.454816,0.543010,0.679886,0.495732,-0.972740
1471,-1.342676,-1.281544,-0.356025,0.435062,1.117948,-1.186400,0.464301
1472,1.852158,-1.190153,-0.059650,-0.264543,0.589884,0.075375,-1.016625


In [35]:
def evaluate_model(true, predicted):
    ac = accuracy_score(true, predicted)
    ps = precision_score(true, predicted,average='macro')
    fs = f1_score(true, predicted,average='macro')
    
    return ac , ps ,fs


In [20]:
models = {
    "Logistic Regressor": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier()
}

In [21]:
trained_model_list = []
model_list = []
ac_list =[]

In [37]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train) # Train model
    
    # Make Prediction
    y_pred = model.predict(X_test)

    # validation score
    ac,ps,fs  = evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("Accuracy Score:",ac)
    print("Precision Score:",ps)
    print("F1 Score:",fs)
    
    

    ac_list.append(ac)
    
    print('='*35)
    print('\n')

Logistic Regressor
Model Training Performance
Accuracy Score: 0.9641873278236914
Precision Score: 0.964117478000767
F1 Score: 0.9596636494052707


K-Neighbors Classifier
Model Training Performance
Accuracy Score: 0.9738292011019284
Precision Score: 0.9740608175090933
F1 Score: 0.9704252625699946


Decision Tree
Model Training Performance
Accuracy Score: 0.9834710743801653
Precision Score: 0.9833992033797372
F1 Score: 0.9831388215585125


Random Forest Classifier
Model Training Performance
Accuracy Score: 0.9972451790633609
Precision Score: 0.9973262032085561
F1 Score: 0.9971074380165288






AdaBoost Classifier
Model Training Performance
Accuracy Score: 0.2327823691460055
Precision Score: 0.1612396694214876
F1 Score: 0.17077020202020202




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
ac_list

[0.9641873278236914,
 0.9738292011019284,
 0.987603305785124,
 0.9986225895316805,
 0.2327823691460055,
 0.9641873278236914,
 0.9738292011019284,
 0.9862258953168044,
 0.9972451790633609,
 0.2327823691460055]