In [153]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBRegressor
import warnings

In [154]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv("raw.csv")

In [155]:
# Feature frame: every column of df except the 'label' target column.
X = df.drop(columns=["label"])

In [156]:
# Target vector: the 'label' column of df.
y = df["label"]

In [157]:
# Dimensions of the feature frame as (rows, columns).
X.shape

(2200, 7)

In [158]:
# Dimensions of the target column; its length should match the row count of X.
y.shape

(2200,)

In [159]:
# Split the feature column names by dtype: object-typed columns are treated as
# categorical, every other column as numeric.
cat_col = X.select_dtypes(include=["object"]).columns
num_col = X.select_dtypes(exclude=["object"]).columns

In [160]:
from sklearn.impute import SimpleImputer # for handling missing values
from sklearn.preprocessing import StandardScaler,OneHotEncoder # for scaling values
# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [161]:
# Numeric branch of the preprocessing: fill missing values with the column
# median, then standardise each column.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("Scaling", StandardScaler()),
])

In [162]:
# Categorical branch of the preprocessing: fill missing values with the most
# frequent category, one-hot encode, then scale the encoded columns.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore" encodes a category that was not seen during
        # fit as an all-zero row instead of raising at transform time, so the
        # test split cannot crash the pipeline on a rare category.
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
        # with_mean=False skips centering, which is required for the scaler to
        # accept the sparse output of the one-hot encoder.
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

In [163]:
# Route each group of columns through its own pipeline: num_col through the
# numeric pipeline and cat_col through the categorical pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("Numeric_Pipeline", num_pipeline, num_col),
        ("Categoric_Pipeline", cat_pipeline, cat_col),
    ]
)


In [164]:
# Display the assembled ColumnTransformer.
preprocessor

In [165]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as a test split; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=40,
)

In [166]:
# Fit the preprocessor on the training split and display the transformed array.
# Fitting here is what allows the later transform() and
# get_feature_names_out() calls to work.
preprocessor.fit_transform(X_train)

array([[ 1.1695014 , -0.03253049, -0.19795827, ...,  0.48681131,
         0.76222039,  3.30907872],
       [ 0.67798847,  0.05886077, -0.59312417, ..., -0.39279231,
        -1.01314958, -0.78595148],
       [ 0.26839437,  0.02839701, -0.15844168, ...,  0.80357829,
         0.70869594,  0.82800721],
       ...,
       [-1.34267579, -1.28154435, -0.35602463, ...,  1.11794771,
        -1.18639993,  0.46430146],
       [ 1.85215825, -1.1901531 , -0.05965021, ...,  0.58988395,
         0.07537497, -1.01662534],
       [ 0.45953828, -0.39809553, -0.15844168, ...,  0.75229016,
         1.1035587 ,  0.84807665]])

In [167]:
# Apply the already-fitted preprocessor to the test split (transform only, no
# refitting) and display the result.
preprocessor.transform(X_test)

array([[-4.41568754e-01,  1.80715776e-01, -6.32640755e-01, ...,
        -3.07676574e-01,  1.08986748e+00, -7.34173179e-01],
       [ 5.41457105e-01, -2.06674035e-03, -3.98919164e-02, ...,
         9.00966687e-01,  5.68476911e-01,  3.15865106e-01],
       [ 1.82485198e+00, -6.11341795e-01, -4.54816104e-01, ...,
        -6.44169396e-01,  9.47815383e-01,  3.45288732e-01],
       ...,
       [-8.23856588e-01,  4.54889550e-01, -6.52399050e-01, ...,
        -2.17321701e+00, -7.05483429e-01,  4.22573704e-02],
       [ 2.34367118e+00, -5.50414289e-01, -5.53607577e-01, ...,
         6.19433474e-01,  5.76325930e-01, -6.01566334e-01],
       [-7.41937767e-01,  6.37672067e-01, -6.12882461e-01, ...,
        -3.18821133e-01,  9.28904980e-01, -1.01181260e+00]])

In [168]:
# Column names of the preprocessor's output, prefixed with the name of the
# transformer that produced each column.
preprocessor.get_feature_names_out()

array(['Numeric_Pipeline__N', 'Numeric_Pipeline__P',
       'Numeric_Pipeline__K', 'Numeric_Pipeline__temperature',
       'Numeric_Pipeline__humidity', 'Numeric_Pipeline__ph',
       'Numeric_Pipeline__rainfall'], dtype=object)

In [169]:
# Wrap the transformed arrays in DataFrames labelled with the preprocessor's
# output feature names. The original row index is passed through so the
# features stay index-aligned with y_train / y_test; without it the frames
# would get a fresh 0..n-1 index while the targets keep the shuffled one, and
# any index-based pandas operation (concat, join, boolean masks) would pair
# the wrong rows.
# NOTE: this overwrites the raw splits, whose column names no longer match
# num_col / cat_col afterwards; re-run the train/test split cell before
# re-running this one.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)

In [170]:
def evaluate_model(true, predicted):
    """Return the accuracy of ``predicted`` measured against ``true``.

    Args:
        true: Ground-truth labels.
        predicted: Labels predicted by a model, in the same order as ``true``.

    Returns:
        The value computed by ``accuracy_score(true, predicted)``.
    """
    return accuracy_score(true, predicted)


In [171]:
# Candidate classifiers, keyed by the display name printed in the results.
# The estimators that draw random numbers (tree, forest, boosting) get a fixed
# random_state so that repeated runs of the notebook report the same scores;
# 40 matches the seed already used for train_test_split.
models = {
    "Logistic Regressor": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=40),
    "Random Forest Classifier": RandomForestClassifier(random_state=40),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=40),
}

In [172]:
# Result accumulators, each starting as its own empty list. model_list and
# ac_list are appended to by the evaluation loop; trained_model_list is only
# initialised here in the visible cells.
trained_model_list, model_list, ac_list = [], [], []

In [173]:
# Fit every candidate model on the training split, score it on the held-out
# test split, and record its display name and accuracy.
# Iterating models.items() yields each name/estimator pair directly instead of
# rebuilding list(models.keys()) / list(models.values()) on every iteration.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make Prediction
    y_pred = model.predict(X_test)

    # The score is computed from X_test / y_test, so it is a test-set
    # accuracy, not a training accuracy.
    ac = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # Label fixed: the original printed "Training Performance" for a score
    # that is measured on the test split.
    print('Model Test Performance')
    print("Accuracy Score:", ac)

    ac_list.append(ac)

    print('=' * 35)
    print('\n')

Logistic Regressor
Model Training Performance
Accuracy Score: 0.9641873278236914


K-Neighbors Classifier
Model Training Performance
Accuracy Score: 0.9738292011019284


Decision Tree
Model Training Performance
Accuracy Score: 0.987603305785124


Random Forest Classifier
Model Training Performance
Accuracy Score: 0.9972451790633609






AdaBoost Classifier
Model Training Performance
Accuracy Score: 0.2327823691460055




In [174]:
# Accuracy scores in the order the models were evaluated (same order as
# model_list).
ac_list

[0.9641873278236914,
 0.9738292011019284,
 0.987603305785124,
 0.9972451790633609,
 0.2327823691460055]