In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV

In [37]:
raisins = pd.read_excel("Raisin_Dataset.xlsx")
#  print(raisins.head())

X = raisins.drop(columns='Class')
y = raisins['Class']

print(X.columns)
print(X.count())
print(y.unique())

#  Convert categorical variable values for class into binary.  0 = 'Besni' and 1 = 'Kecimen'.
y = pd.get_dummies(y, columns='Class')
y = y.drop(columns='Besni')
y = y.rename(columns={'Kecimen' : 'Class'})
print(y.head())

print(y.value_counts())

Index(['Area', 'MajorAxisLength', 'MinorAxisLength', 'Eccentricity',
       'ConvexArea', 'Extent', 'Perimeter'],
      dtype='object')
Area               900
MajorAxisLength    900
MinorAxisLength    900
Eccentricity       900
ConvexArea         900
Extent             900
Perimeter          900
dtype: int64
['Kecimen' 'Besni']
   Class
0      1
1      1
2      1
3      1
4      1
Class
0        450
1        450
dtype: int64


In [49]:
#  Split data into train and test datasets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=19)

#  Initialize a decision tree classifier.
tree = DecisionTreeClassifier()
parameters = {'max_depth' : [3, 5, 7], 'min_samples_split' : [2, 3, 4]}
grid = GridSearchCV(tree, param_grid=parameters)
grid.fit(X_train, y_train)

#  Model analysis.
best_model = grid.best_estimator_
best_score = grid.best_score_
print(best_model)
print(best_score)
test_score = grid.score(X_test, y_test)
print(test_score)

test_scores = pd.DataFrame(grid.cv_results_['mean_test_score'], columns=['Accuracy'])
params = pd.DataFrame(grid.cv_results_['params'])

scores_params = pd.concat([test_scores, params], axis=1)
print(scores_params)

DecisionTreeClassifier(max_depth=5)
0.8666666666666668
0.8133333333333334
   Accuracy  max_depth  min_samples_split
0  0.859259          3                  2
1  0.860741          3                  3
2  0.860741          3                  4
3  0.866667          5                  2
4  0.866667          5                  3
5  0.863704          5                  4
6  0.841481          7                  2
7  0.850370          7                  3
8  0.842963          7                  4


In [48]:
#  Random search hyperparameter tuning.
lr = LogisticRegression(solver='liblinear', max_iter=1000)
distributions = {'penalty' : ['l1', 'l2'], 'C' : uniform(loc=0, scale=100)}
clf = RandomizedSearchCV(estimator=lr, param_distributions=distributions, n_iter=8)

print(y_train)
clf.fit(X_train, y_train.values.ravel())

print(clf.best_estimator_)
print(clf.best_score_)

RandSearch_table = pd.concat([pd.DataFrame(clf.cv_results_['params']), pd.DataFrame(clf.cv_results_['mean_test_score'], columns=['Accuracy'])], axis=1)
print(RandSearch_table)

     Class
392      1
615      0
100      1
542      0
657      0
..     ...
19       1
354      1
757      0
622      0
605      0

[675 rows x 1 columns]
LogisticRegression(C=22.527423736652718, max_iter=1000, penalty='l1',
                   solver='liblinear')
0.8755555555555556
           C penalty  Accuracy
0  22.527424      l1  0.875556
1  14.815681      l2  0.875556
2  22.747425      l2  0.874074
3  45.178235      l1  0.875556
4  39.084317      l2  0.874074
5  39.982112      l2  0.875556
6   7.007522      l1  0.875556
7  78.167209      l2  0.875556
