In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

## Load and investigate the data

In [3]:
raisins = pd.read_csv('Raisin_Dataset.csv')
raisins.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,0
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,0
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,0
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,0
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,0


In [6]:
X = raisins.drop(['Class'], axis=1)
y = raisins.Class

In [8]:
print(len(X.columns))
print(len(X))
print(len(raisins[raisins.Class==1]))

7
900
450


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=19)

## Grid Search with Decision Tree Classifier

In [13]:
tree = DecisionTreeClassifier()

In [15]:
parameters = {'max_depth': [3, 5, 7], 'min_samples_split': [2, 3, 4]}

In [17]:
grid = GridSearchCV(tree, parameters)
grid.fit(X_train, y_train)

In [23]:
# Print best model
print(grid.best_estimator_)
# Print best score
print(grid.best_score_)
# Print the accuracy of the final model on the test data
print(grid.score(X_test, y_test))

DecisionTreeClassifier(max_depth=5, min_samples_split=4)
0.8666666666666668
0.8133333333333334


In [25]:
# Print a table summarizing the results of GridSearchCV
hyperparameter_grid = pd.DataFrame(grid.cv_results_['params'])
grid_scores = pd.DataFrame(grid.cv_results_['mean_test_score'], columns = ['score'])
df = pd.concat([hyperparameter_grid, grid_scores], axis = 1)
print(df)

   max_depth  min_samples_split     score
0          3                  2  0.860741
1          3                  3  0.859259
2          3                  4  0.857778
3          5                  2  0.859259
4          5                  3  0.865185
5          5                  4  0.866667
6          7                  2  0.840000
7          7                  3  0.848889
8          7                  4  0.845926


## Random Search with Logistic Regression

In [28]:
lr = LogisticRegression(solver='liblinear', max_iter=1000)

In [34]:
distributions = {'penalty': ['l1', 'l2'], 'C': uniform(loc=0, scale=100)}

In [36]:
clf = RandomizedSearchCV(lr, distributions, n_iter=8)
clf.fit(X_train, y_train)

In [40]:
# Print best esimator and best score
print(clf.best_estimator_)
print(clf.best_score_)
# Print a table summarizing the results of RandomSearchCV
hyperparameter_grid = pd.DataFrame(clf.cv_results_['params'])
grid_scores = pd.DataFrame(clf.cv_results_['mean_test_score'], columns = ['score'])
df = pd.concat([hyperparameter_grid, grid_scores], axis = 1)
print(df)

LogisticRegression(C=44.46778551365716, max_iter=1000, penalty='l1',
                   solver='liblinear')
0.8755555555555556
           C penalty     score
0  44.467786      l1  0.875556
1  97.365715      l1  0.874074
2  67.900766      l2  0.874074
3  21.198597      l1  0.875556
4  78.845296      l1  0.874074
5  11.442002      l1  0.875556
6  28.932544      l2  0.875556
7  86.146242      l1  0.874074
