In [10]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "Date_Fruit_Datasets/Date_Fruit_Datasets.xlsx"

# Load the latest version
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "muratkokludataset/date-fruit-datasets",
  file_path,
  # Provide any additional arguments like 
  # sql_query or pandas_kwargs. See the 
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

df.head()

Unnamed: 0,AREA,PERIMETER,MAJOR_AXIS,MINOR_AXIS,ECCENTRICITY,EQDIASQ,SOLIDITY,CONVEX_AREA,EXTENT,ASPECT_RATIO,...,KurtosisRR,KurtosisRG,KurtosisRB,EntropyRR,EntropyRG,EntropyRB,ALLdaub4RR,ALLdaub4RG,ALLdaub4RB,Class
0,422163,2378.908,837.8484,645.6693,0.6373,733.1539,0.9947,424428,0.7831,1.2976,...,3.237,2.9574,4.2287,-59191263232,-50714214400,-39922372608,58.7255,54.9554,47.84,BERHI
1,338136,2085.144,723.8198,595.2073,0.569,656.1464,0.9974,339014,0.7795,1.2161,...,2.6228,2.635,3.1704,-34233065472,-37462601728,-31477794816,50.0259,52.8168,47.8315,BERHI
2,526843,2647.394,940.7379,715.3638,0.6494,819.0222,0.9962,528876,0.7657,1.315,...,3.7516,3.8611,4.7192,-93948354560,-74738221056,-60311207936,65.4772,59.286,51.9378,BERHI
3,416063,2351.21,827.9804,645.2988,0.6266,727.8378,0.9948,418255,0.7759,1.2831,...,5.0401,8.6136,8.2618,-32074307584,-32060925952,-29575010304,43.39,44.1259,41.1882,BERHI
4,347562,2160.354,763.9877,582.8359,0.6465,665.2291,0.9908,350797,0.7569,1.3108,...,2.7016,2.9761,4.4146,-39980974080,-35980042240,-25593278464,52.7743,50.908,42.6666,BERHI


In [None]:
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
target_col = ["Class"]


['AREA',
 'PERIMETER',
 'MAJOR_AXIS',
 'MINOR_AXIS',
 'ECCENTRICITY',
 'EQDIASQ',
 'SOLIDITY',
 'CONVEX_AREA',
 'EXTENT',
 'ASPECT_RATIO',
 'ROUNDNESS',
 'COMPACTNESS',
 'SHAPEFACTOR_1',
 'SHAPEFACTOR_2',
 'SHAPEFACTOR_3',
 'SHAPEFACTOR_4',
 'MeanRR',
 'MeanRG',
 'MeanRB',
 'StdDevRR',
 'StdDevRG',
 'StdDevRB',
 'SkewRR',
 'SkewRG',
 'SkewRB',
 'KurtosisRR',
 'KurtosisRG',
 'KurtosisRB',
 'EntropyRR',
 'EntropyRG',
 'EntropyRB',
 'ALLdaub4RR',
 'ALLdaub4RG',
 'ALLdaub4RB']

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
model = LogisticRegression()
X = df[num_cols]
y= df[target_col]

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X[num_cols])

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train.values.ravel())


Model accuracy: 0.96


In [18]:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(model, X_scaled, y.values.ravel(), cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.91666667 0.91111111 0.92777778 0.92178771 0.90502793]
Mean cross-validation score: 0.9164742396027312


In [21]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train.values.ravel())
cv_scores = cross_val_score(model, X_scaled, y.values.ravel(), cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.90555556 0.85555556 0.89444444 0.91620112 0.87150838]
Mean cross-validation score: 0.8886530105524519


In [22]:
# tune hyperparameters using gridsearchcv
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train.values.ravel())
print("Best hyperparameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_
cv_scores = cross_val_score(best_model, X_scaled, y.values.ravel(), cv=5)
print("Cross-validation scores with best model:", cv_scores)
print("Mean cross-validation score with best model:", cv_scores.mean())

Best hyperparameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Cross-validation scores with best model: [0.9        0.85555556 0.88333333 0.9273743  0.87150838]
Mean cross-validation score with best model: 0.8875543140906268
