# Imports

In [None]:
# DataFrame Manipulation, Linear Algebra
import pandas as pd
import numpy as np

# Ignore Warnings
# NOTE(review): this silences every warning for the rest of the session,
# which includes any warnings the estimators fitted later in this notebook
# may raise — confirm that blanket suppression is intended.
import warnings
warnings.filterwarnings("ignore")

# Preprocessing

In [None]:
# Install openpyxl without pip's progress output; presumably required so that
# pandas can parse the .xlsx dataset read in the next cell — confirm.
!pip install openpyxl --quiet

In [None]:
# Read the Excel workbook into a DataFrame and display it as the cell output.
DATASET_PATH = "../input/date-fruit-datasets/Date_Fruit_Datasets/Date_Fruit_Datasets.xlsx"
raw_data = pd.read_excel(DATASET_PATH)
raw_data

In [None]:
# Report the dimensions of the loaded table.
rows = len(raw_data.index)
cols = len(raw_data.columns)
print(f"Number of Rows : {rows}\nNumber of Columns : {cols}")

In [None]:
# Print pandas' summary of the DataFrame (columns, non-null counts, dtypes).
raw_data.info()

In [None]:
# Display the column labels of the loaded table.
raw_data.columns

In [None]:
# Every column except the last one is treated as a feature; the final column
# is taken as the label.
features, labels = raw_data.iloc[:, :-1], raw_data.iloc[:, -1]

## Standard Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each feature column and keep the original column names.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so test rows influence the scaling
# statistics — confirm this is acceptable.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(features)
scaled_features = pd.DataFrame(standardized_values, columns=features.columns)
scaled_features

## Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto a fixed number of principal components.
n_components = 15
pca = PCA(n_components=n_components)

# Name the resulting columns PC1 ... PC<n_components>.
component_names = [f"PC{i+1}" for i in range(n_components)]
projected_values = pca.fit_transform(scaled_features)

reduced_features = pd.DataFrame(projected_values, columns=component_names)
reduced_features

# Classification

In [None]:
# Place the label column next to the principal components (column-wise
# concatenation, aligned on the index) and display the result.
full_data = pd.concat(objs=[reduced_features, labels], axis="columns")
full_data

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split on the "Class" column. A fixed seed makes the
# partition (and every score computed from it) reproducible across runs.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so take the only (train, test) pair instead of looping.
train_index, test_index = next(split.split(full_data, full_data["Class"]))

# split() yields positional indices, so select rows by position with .iloc;
# label-based .loc only gives the same rows when the index is 0..n-1.
train = full_data.iloc[train_index]
test = full_data.iloc[test_index]

In [None]:
# Separate predictors from the "Class" target for both partitions.
X_train, y_train = train.drop(columns="Class"), train["Class"]
X_test, y_test = test.drop(columns="Class"), test["Class"]

## CatBoost Classifier

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV

# GPU-backed CatBoost model with training output silenced.
catboost_clf = CatBoostClassifier(task_type="GPU", silent=True)

# Candidate values sampled by the randomized search.
# NOTE(review): early_stopping_rounds is searched here, but no evaluation set
# is passed to fit() below — confirm this parameter has any effect.
param_dict = dict(
    early_stopping_rounds=np.arange(5, 15),
    learning_rate=np.linspace(0.05, 0.15, 10),
    n_estimators=np.arange(100, 200, 10),
    max_depth=np.arange(1, 10),
)

# Randomized search scored by accuracy with 10-fold cross-validation.
rscv = RandomizedSearchCV(
    estimator=catboost_clf,
    param_distributions=param_dict,
    scoring="accuracy",
    cv=10,
)
rscv.fit(X_train, y_train)

# Show the best parameter combination and its mean cross-validated accuracy.
print(rscv.best_params_)
print(rscv.best_score_)

In [None]:
from sklearn.metrics import classification_report

# Re-create the classifier with the best parameters found by the search and
# fit it on the training partition.
catboost_clf = CatBoostClassifier(task_type="GPU", silent=True, **rscv.best_params_)

catboost_clf.fit(X_train, y_train)

y_pred = catboost_clf.predict(X_test)

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall and reports support from the predictions.
print(classification_report(y_test, y_pred))

## LogisticRegressionCV Classifier

### Base LogisticRegressionCV

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Baseline: LogisticRegressionCV with default hyper-parameters, 10-fold
# internal cross-validation, using all available cores.
lrcv = LogisticRegressionCV(cv=10, n_jobs=-1)

lrcv.fit(X_train, y_train)

y_pred = lrcv.predict(X_test)

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall and reports support from the predictions.
print(classification_report(y_test, y_pred))

### Hyperparameter-Tuned LogisticRegressionCV

In [None]:
lrcv = LogisticRegressionCV(cv=10, n_jobs=-1, max_iter=10000)

# Regularisation strengths to evaluate. LogisticRegressionCV's `Cs` takes
# either an int (size of an automatically generated grid) or a list of
# floats, so the candidate list is wrapped in an outer list: the search then
# passes the whole list as ONE value and the estimator's internal
# cross-validation picks the best C from it. Listing the scalars directly
# would hand each one to `Cs` on its own (100 and 10 read as grid sizes, the
# floats rejected).
candidate_cs = [[100, 10, 1.0, 0.1, 0.01]]

# Only solver/penalty pairs that the solvers support are listed, so no search
# iteration is wasted on a fit that can only fail. "elasticnet" is omitted
# because none of these three solvers supports it (it needs solver="saga"
# together with `l1_ratios`).
param_dict = [
    dict(solver=["newton-cg", "lbfgs"], penalty=["l2"], Cs=candidate_cs),
    dict(solver=["liblinear"], penalty=["l1", "l2"], Cs=candidate_cs),
]

rscv = RandomizedSearchCV(lrcv, param_dict, scoring="accuracy", cv=10)

rscv.fit(X_train, y_train)

# Show the best parameter combination and its mean cross-validated accuracy.
print(rscv.best_params_)
print(rscv.best_score_)

In [None]:
# Rebuild the model with the same base settings the search used, overlaid with
# the winning hyper-parameters, so the evaluated model matches the tuned
# configuration instead of falling back to the default cv and max_iter.
base_params = dict(cv=10, n_jobs=-1, max_iter=10000)
lrcv = LogisticRegressionCV(**{**base_params, **rscv.best_params_})

lrcv.fit(X_train, y_train)

y_pred = lrcv.predict(X_test)

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall and reports support from the predictions.
print(classification_report(y_test, y_pred))