In [73]:
# setup
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# suppress the warnings
import warnings
warnings.filterwarnings("ignore")

# load and inspect the dataset
raisins = pd.read_csv('Raisin_Dataset.csv')
raisins.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,0
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,0
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,0
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,0
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,0


In [74]:
# split the dataset into features and labels
X = raisins.drop(columns='Class')
y = raisins['Class']

# examine the dataset
print(f"Total number of features: {len(X.columns)}")
print(f"Total number of samples: {len(y)}")
raisins[raisins.Class==1]

Total number of features: 7
Total number of samples: 900


Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
450,137583,649.541485,273.260282,0.907201,142650,0.731638,1590.354,1
451,117592,533.292856,288.558319,0.840966,123587,0.730068,1432.006,1
452,95546,487.178282,251.960243,0.855875,99166,0.722782,1276.807,1
453,96582,446.705203,278.325498,0.782172,100113,0.706598,1216.979,1
454,61409,403.701295,209.365889,0.855007,67286,0.597393,1083.477,1
...,...,...,...,...,...,...,...,...
895,83248,430.077308,247.838695,0.817263,85839,0.668793,1129.072,1
896,87350,440.735698,259.293149,0.808629,90899,0.636476,1214.252,1
897,99657,431.706981,298.837323,0.721684,106264,0.741099,1292.828,1
898,93523,476.344094,254.176054,0.845739,97653,0.658798,1258.548,1


In [75]:
# split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# create a decision tree model
tree = DecisionTreeClassifier()
parameters = {
    "max_depth": [3, 5, 7],
    "min_samples_split": [2, 3, 4]
}

In [76]:
# create a grid search classifier
grid = GridSearchCV(estimator=tree, param_grid=parameters)
grid.fit(X_train, y_train)

# observe the best estimator and best score
best_model = grid.best_estimator_
best_score = grid.best_score_
print(f"Best estimator: {best_model}")
print(f"Best score: {best_score}")
print(f"Test score: {grid.score(X_test, y_test)}")

Best estimator: DecisionTreeClassifier(max_depth=3)
Best score: 0.8486111111111111
Test score: 0.8833333333333333


In [77]:
# observe the score for each hyperparameter combination
hyperparameters = pd.DataFrame(grid.cv_results_["params"])
grid_scores = pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["score"])

result = pd.concat([hyperparameters, grid_scores], axis=1)
result

Unnamed: 0,max_depth,min_samples_split,score
0,3,2,0.848611
1,3,3,0.848611
2,3,4,0.848611
3,5,2,0.8375
4,5,3,0.836111
5,5,4,0.8375
6,7,2,0.819444
7,7,3,0.8125
8,7,4,0.825


In [78]:
# random search with logistic regression
lr = LogisticRegression(solver='liblinear', max_iter=1000)

# specify parameters and distributions
distributions = {
    "penalty": ["l1", "l2"],
    "C": uniform(loc=0, scale=100)
}

# create a random search classifier
clf = RandomizedSearchCV(estimator=lr, param_distributions=distributions, n_iter=8)
clf.fit(X_train, y_train)

In [79]:
# observe the best estimator and best scores
best_model_rand = clf.best_estimator_
best_score_rand = clf.best_score_
print(f"Best estimator: {best_model_rand}")
print(f"Best score: {best_score_rand}")

Best estimator: LogisticRegression(C=np.float64(16.667731479571323), max_iter=1000,
                   solver='liblinear')
Best score: 0.875


In [80]:
# observe the combinations and correspondent scores
hyperparameters_rand = pd.DataFrame(clf.cv_results_["params"])
grid_scores_rand = pd.DataFrame(clf.cv_results_["mean_test_score"], columns=["score"])

results_rand = pd.concat([hyperparameters_rand, grid_scores_rand], axis=1)
results_rand

Unnamed: 0,C,penalty,score
0,26.618116,l1,0.854167
1,34.631767,l2,0.872222
2,16.667731,l2,0.875
3,75.50668,l1,0.854167
4,35.57939,l1,0.861111
5,60.483552,l2,0.872222
6,73.796333,l2,0.872222
7,31.30365,l1,0.854167
