# Implement

## 1. Data preparation

Download the SQLite database from the web.

In [None]:
! wget -nc https://github.com/PaulHancock/COMP5009_pracs/raw/main/data/Assignment2024.sqlite

Import related packages.

In [None]:
import pandas as pd
import sqlite3
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Read the `train` and `test` tables of the downloaded database into DataFrames.
# try/finally guarantees the connection is released even when a query fails
# (for example when the download was skipped or a table is absent).
con = sqlite3.connect('Assignment2024.sqlite')
try:
  train_df = pd.read_sql("SELECT * FROM train", con)
  test_df = pd.read_sql("SELECT * FROM test", con)
finally:
  con.close()

In [None]:
train_df

In [None]:
test_df

In [None]:
out_df = test_df['index'].copy().to_frame()

In [None]:
out_df

In [None]:
print("Column Names")
print(train_df.columns)
print()
print("Data types")
print(train_df.dtypes)
print(train_df.shape)

In [None]:
for i in range(10):
  print(train_df['Music'][i], train_df['Storage'][i], train_df['Guitar'][i])

In [None]:
train_df.describe()

In [None]:
train_df.sum(numeric_only=True)

In [None]:
# We can plot a histogram of all the data together
train_df_bak = train_df.copy()
train_df_bak.hist(figsize=(12,12))
plt.show()

In [None]:
train_df_bak_desc = train_df_bak.describe()
print(train_df_bak_desc.loc['std'])
train_df_bak_desc.loc['std'].plot.line()
plt.title('Standard Deviation of Each Attribute')
plt.show()

In [None]:
# Collect the wide-spread attributes: every non-object column whose standard
# deviation (read from the describe() summary) is above the threshold.
# 'index' is reported like the others but never added to the list.
config_std_threshold = 100
outlier_columns = []

for column in train_df_bak.columns:
  is_object_column = not ('object' != train_df_bak.dtypes[column])
  if is_object_column:
    continue
  column_std = train_df_bak_desc[column]['std']
  if not column_std > config_std_threshold:
    continue
  print(column, column_std)
  if column == 'index':
    continue
  outlier_columns.append(column)
print(outlier_columns)

In [None]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler(copy=False)
# RobustScaler centres each column on its median and scales it by its
# interquartile range, so extreme values do not dominate the statistics.
print(outlier_columns)
scaler.fit(train_df[outlier_columns])

# Now transform the training data using this scaler, replacing the original data
train_df[outlier_columns] = scaler.transform(train_df[outlier_columns])
# Apply the SAME fitted transform to the test data. Without this the models
# are trained on robust-scaled attributes but asked to predict on raw ones.
test_df[outlier_columns] = scaler.transform(test_df[outlier_columns])
# Before/after comparison: train_df_bak still holds the unscaled values.
train_df_bak[outlier_columns].hist()
plt.show()
train_df[outlier_columns].hist()
plt.show()


In [None]:
print(train_df.head())
train_df_bak.sort_values(by='class', ignore_index=True, inplace=True)
train_df.sort_values(by='class', ignore_index=True, inplace=True)
print(train_df.head())

In [None]:
test_df.drop(columns=['class'], inplace=True)
test_df.shape

In [None]:
data_df = train_df.iloc[:, 0:-1].copy()
print(data_df)
label_df = train_df['class'].copy()
print(label_df)
print(data_df.shape)
print(label_df.shape)

In [None]:
# We can plot a histogram of all the data together
data_df.hist(figsize=(12,12))
plt.show()

### Identify and remove irrelevant attributes.

In [None]:
# Remove index column
print(f"Before dropping {data_df.shape}")
drop_columns = ['index']
print(drop_columns)
data_df.drop(columns=drop_columns, inplace=True)
print(f"After dropping {data_df.shape}")

In [None]:
test_df.drop(columns=drop_columns, inplace=True)
test_df.shape

In [None]:
# Correlation-based attribute filter.
# `cor` holds the absolute pairwise correlation of every numeric column of
# train_df and is shown as a heat map.
# NOTE(review): both loops below treat the LAST column of `cor` as the label
# ('class') - confirm that 'class' really is the final numeric column.
print(train_df.shape)
cor = train_df.corr(numeric_only=True).abs()
fig, ax = plt.subplots(1,1,figsize=(24,20))
# use seaborn to do the plot
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
cols = cor.columns
config_cor_hi_threshold = 0.8
config_cor_lw_threshold = 0.1
cor_hi_columns = []
cor_low_columns = []
# Redundant attributes: walk the upper triangle of the matrix (j > i) and, for
# every pair correlated above the high threshold, queue the LATER column of the
# pair (cols[j]) for removal, once.
for i,col in enumerate(cols):
  for j in range(i + 1, len(cols)-1): # Ignore the last column: class
    if cor.iloc[i,j] > config_cor_hi_threshold:
      print(cols[i], ' vs. ', cols[j], cor.iloc[i,j])
      if cols[j] not in cor_hi_columns:
          cor_hi_columns.append(cols[j])
# Uninformative attributes: columns whose correlation with the last column is
# below the low threshold. 'index' is printed but never queued here.
for i,col in enumerate(cols):
  if cor.iloc[i,len(cols)-1] < config_cor_lw_threshold:
    print(cols[i], " vs. ", cols[len(cols)-1], cor.iloc[i,len(cols)-1])
    if cols[i] != 'index':
      cor_low_columns.append(cols[i])

print(cor_hi_columns)
print(cor_low_columns)

In [None]:
print(f"Before dropping {data_df.shape}")
print(cor_hi_columns)
data_df.drop(columns=cor_hi_columns,
           inplace=True, errors='ignore')
print(cor_low_columns)
data_df.drop(columns=cor_low_columns,
           inplace=True, errors='ignore')
print(f"After dropping {data_df.shape}")

In [None]:
test_df.drop(columns=cor_hi_columns, inplace=True, errors='ignore')
test_df.drop(columns=cor_low_columns, inplace=True, errors='ignore')
test_df.shape

In [None]:
# Remove dtype=object columns
non_number_columns = data_df.select_dtypes(exclude=['number']).columns
print(non_number_columns)

In [None]:
# figure out which columns you want to drop from above, and put their names in the list below
print(f"Before dropping {data_df.shape}")
print(non_number_columns)
data_df.drop(columns=non_number_columns,
           inplace=True)
print(f"After dropping {data_df.shape}")

In [None]:
test_df.drop(columns=non_number_columns, inplace=True)
test_df.shape

In [None]:
# Cite: https://scikit-learn.org/stable/modules/feature_selection.html#feature-selection
# NOTE: this cell only CONSTRUCTS the variance-threshold selector. The
# fit_transform call below is disabled, so no column is removed and the two
# shape printouts are identical.

from sklearn.feature_selection import VarianceThreshold
print(f"Before dropping {data_df.shape}")
# TODO: choose the threshold; test_df must be reduced to the same columns when this is enabled
config_p = 0.8
# threshold = p * (1 - p), computed from config_p
sel = VarianceThreshold(threshold=(config_p * (1 - config_p)))
# sel.fit_transform(data_df)
print(f"After dropping {data_df.shape}")

### Detect and handle missing entries.

In [None]:
# find which columns have missing data
def missing(df):
  """
  Compute the percentage of missing entries in every column of `df`.

  Parameters
  ----------
  df : pandas.DataFrame
      Table to inspect.

  Returns
  -------
  dict
      Maps each column name to the percentage (0-100) of its rows that are
      null/NaN. Applying a threshold to these percentages is left to the caller.
  """
  total = df.shape[0] # shape[0] is the number of rows
  # isna().sum() counts the Null/NaN values of one column. No local is named
  # `missing`, so the function name is not shadowed inside its own body.
  return {attribute: df[attribute].isna().sum() / total * 100
          for attribute in df.columns}

In [None]:
m_dict = missing(data_df)
m_dict

In [None]:
m_dict_test = missing(test_df)
m_dict_test

In [None]:
# Look at each attribute/percentage pair in the dict and choose the attributes
# whose missing percentage is above conf_drop_frac (20%); these are dropped below.
conf_drop_frac = 20
cols_to_drop = [ att for att,frac in m_dict.items() if frac > conf_drop_frac]
cols_to_drop

In [None]:
# figure out which columns you want to drop from above, and put their names in the list below
print(f"Before dropping {data_df.shape}")
print(cols_to_drop)
data_df.drop(columns=cols_to_drop,
           inplace=True)
print(f"After dropping {data_df.shape}")

In [None]:
# confirm that our data frame now has fewer columns (was 280)
data_df.columns

In [None]:
test_df.drop(columns=cols_to_drop, inplace=True)
test_df.shape

In [None]:
m_dict = missing(data_df)
m_dict

In [None]:
# Look at each attribute/frac pair in the dict and choose those with a frac that is <5
conf_impute_frac = 5
cols_to_impute = [ att for att,frac in m_dict.items() if 0 < frac < conf_impute_frac]
cols_to_impute

In [None]:
for col in cols_to_impute:
  print(col, "missing data", m_dict[col])

In [None]:
for col in cols_to_impute:
  # mean of the observed (non-NaN) training values of this attribute
  mean = data_df[col].mean()
  # now use the fillna function to replace the NaN values with the mean value
  data_df.fillna({col: mean}, inplace=True)

# The test rows cannot be dropped (every one of them needs a prediction), so
# any NaN left in test_df would make predict() fail. Fill them with the
# TRAINING means: fillna with a Series matches on column name, and mean
# imputation above leaves the training means unchanged.
test_df.fillna(data_df.mean(numeric_only=True), inplace=True)

In [None]:
# double check the missing data
m_dict = missing(data_df)
for col in cols_to_impute:
  print(col, "missing data", m_dict[col])

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
data_df.shape
label_df.shape
X_new = SelectKBest(f_classif, k=7).fit_transform(data_df, label_df) # Need no nan
X_new.shape

In [None]:
data_df.describe()

In [None]:
test_df.describe()

### Detect and handle duplicates (both instances and attributes).

In [None]:
# Drop repeated instances: rows whose attribute values all match an earlier row.
dup_rows = data_df.duplicated()
# summing a boolean mask counts its True entries
print(dup_rows.sum())
dup_row_indexes = dup_rows[dup_rows].index.tolist()
print(dup_row_indexes)
# Remove the same index labels from the attributes and from the labels so the
# two stay aligned, printing each shape before and after the removal.
for frame in (data_df, label_df):
  print(frame.shape)
  frame.drop(index=dup_row_indexes, inplace=True)
  print(frame.shape)

In [None]:
# Remove duplicated columns
dup_cols = data_df.T.duplicated()
dup_cols2 = data_df.columns.duplicated()
# recall that true = 1, false = 0
print(dup_cols.sum())
print(dup_cols2.sum())

### Select suitable data types for attributes.

In [None]:
print("Column Names")
print(data_df.columns)
print()
print("Data types")
print(data_df.dtypes)
print(data_df.shape)
print(train_df.shape)

In [None]:
data_df.describe()

In [None]:
# We can plot a histogram of all the data together
data_df.hist(figsize=(12,12))
plt.show()

In [None]:
int_columns = data_df.select_dtypes(include=['int64']).columns
print(int_columns)

In [None]:
for int_col in int_columns:
  print(data_df[int_col].describe())
  data_df[int_col].hist(figsize=(12,12))
  plt.show()

In [None]:
new_train_df = data_df.join(label_df)
cor = new_train_df.corr().abs()
fig, ax = plt.subplots(1,1,figsize=(24,20))
# use seaborn to do the plot
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)

### Perform data transformation (such as scaling/standardization) if needed.

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# choose all the numeric type attributes (all of them)
numeric_attributes = data_df.select_dtypes(include='number').columns
numeric_attributes

In [None]:
# Create a standard scaler.
# Note: this rebinds `scaler`, which earlier held the RobustScaler.
scaler = StandardScaler()
# Determine the mean/std for each column and set up the scaler
# NOTE(review): the scaler is fitted on ALL of data_df, before the
# train/validation/test split further down, so the held-out rows contribute to
# the scaling statistics - confirm this is acceptable.
scaler.fit(data_df[numeric_attributes])

# Now transform our data using this scaler, replacing the original data
data_df[numeric_attributes] =  scaler.transform(data_df[numeric_attributes])
# Any other data frame that must share this transform is scaled the same way:
# other_df[numeric_attributes] = scaler.transform(other_df[numeric_attributes])


In [None]:
test_df[numeric_attributes] =  scaler.transform(test_df[numeric_attributes])

In [None]:
# verify that this worked
# the mean should be close to zero, and the std should be close to 1.
data_df.describe()

In [None]:
test_df.describe()

In [None]:
test_df.shape, data_df.shape

### Perform other data preparation operations (This is optional, bonus marks will be awarded for novel ideas).

## 2. Data classification

### Class imbalance

The original labelled data is not equally distributed between the three classes. You need to demonstrate that such an issue exists within the data, explain the importance of this issue, and describe how you address this problem.

In [None]:
label_df.hist()
plt.show()

In [None]:
# Hold out part of the labelled data so the models can be tuned and assessed
# on rows they were not trained on: 70% train, 15% validation, 15% test.
from sklearn.model_selection import train_test_split

config_train_ratio = 0.70
config_validation_ratio = 0.15
config_test_ratio = 0.15

# First cut: train is 70% of the labelled data, the other 30% is held out.
# stratify keeps the class proportions the same in both parts, which matters
# here because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(data_df, label_df,
                                                    test_size=1 - config_train_ratio,
                                                    random_state=4,
                                                    stratify=label_df)

# Second cut: divide the held-out 30% evenly, so that
# validation is 15% of the labelled data and
# test is 15% of the labelled data (again stratified by class).
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test,
                                                test_size=config_test_ratio/(config_test_ratio + config_validation_ratio),
                                                random_state=4,
                                                stratify=y_test)

print(X_train, X_val, X_test)
print(y_train, y_val, y_test)

In [None]:
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

In [None]:
y_train.hist()
plt.show()

In [None]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.linear_model import LogisticRegression

oversampler = RandomOverSampler(random_state=4)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
print(X_resampled.shape, y_resampled.shape)

# X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
# print(X_resampled.shape, y_resampled.shape)
# clf_smote = LogisticRegression().fit(X_resampled, y_resampled)

# X_resampled, y_resampled = ADASYN().fit_resample(X_train, y_train)
# print(X_resampled.shape, y_resampled.shape)
# clf_adasyn = LogisticRegression().fit(X_resampled, y_resampled)

df_oversampled = pd.DataFrame(X_resampled, columns=X_train.columns)
df_oversampled['class'] = y_resampled
# For compare
# df_oversampled = pd.DataFrame(X_train, columns=X_train.columns)
# df_oversampled['class'] = y_train
df_oversampled.head()
print(df_oversampled.shape)

# oversampler = RandomOverSampler(random_state=4)
# X_resampled_val, y_resampled_val = oversampler.fit_resample(X_val, y_val)
# print(X_resampled_val.shape, y_resampled_val.shape)
# df_oversampled_val = pd.DataFrame(X_resampled_val, columns=X_val.columns)
# df_oversampled_val['class'] = y_resampled_val
# df_oversampled_val.head()

# from imblearn.under_sampling import RandomUnderSampler
# undersampler = RandomUnderSampler(random_state=4)
# X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)
# print(X_resampled.shape, y_resampled.shape)
# df_undersampled = pd.DataFrame(X_resampled, columns=X_train.columns)
# df_undersampled['class'] = y_resampled
# df_undersampled.head()

In [None]:
df_oversampled['class'].hist()
plt.show()

In [None]:
print(df_oversampled.head())
df_oversampled.sort_values(by='class', ignore_index=True, inplace=True)
print(df_oversampled.head())

In [None]:
X_train = df_oversampled.iloc[:, 0:-1].copy()
y_train = df_oversampled['class'].copy()
# X_val = df_oversampled_val.iloc[:, 0:-1].copy()
# y_val = df_oversampled_val['class'].copy()

In [None]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

In [None]:
df_oversampled.head()

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold, ShuffleSplit

In [None]:
# This is random sampling.
# test_size is given as a fraction: an int is interpreted as an absolute number
# of samples, so 15 would place only 15 rows in each test split.
ss = ShuffleSplit(n_splits=10, test_size=0.15, random_state=4)
# This is non-random sampling, we just break the data in to 10 contiguous sub-sets
kf = KFold(n_splits=10)
# Ensuring the balance between classes in the model/validate sets
# means we should use stratified sampling
skf = StratifiedKFold(n_splits=10)

In [None]:
# This cell sets up a nice visualisation adapted from the scikit-learn documentation.
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm

def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """
    Draw the train/test membership of every cross-validation split.

    One row of markers is drawn per split produced by `cv`, followed by one
    row coloured by `y` and one row coloured by `group`.
    Adapted from https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#define-a-function-to-visualize-cross-validation-behavior

    Parameters
    ----------
    cv : cross validation object providing split(X=, y=, groups=)

    X : training data

    y : data labels

    group : group labels

    ax : matplotlib axes object to draw on

    n_splits : number of splits, used for the tick labels and the y-limits

    lw : line width for plotting

    Returns
    -------
    The same `ax` object, after drawing.
    """
    n_samples = len(X)
    sample_axis = range(n_samples)

    # One row per split: test samples are marked 1, training samples 0, and
    # samples that belong to neither stay NaN.
    for split_no, (train_idx, test_idx) in enumerate(cv.split(X=X, y=y, groups=group)):
        membership = np.full(n_samples, np.nan)
        membership[test_idx] = 1
        membership[train_idx] = 0

        ax.scatter(sample_axis, [split_no + .5] * n_samples,
                   c=membership, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)

    # Two extra rows placed after the last split drawn: classes, then groups.
    for row_offset, row_colours in ((1.5, y), (2.5, group)):
        ax.scatter(sample_axis, [split_no + row_offset] * n_samples,
                   c=row_colours, marker='_', lw=lw, cmap=cmap_data)

    # Axis formatting; the y axis is inverted so split 0 is at the top.
    row_labels = list(range(n_splits)) + ['class', 'group']
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=row_labels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [None]:
# Set up a figure with three subplots, one per splitting scheme.
# X_train/y_train are the oversampled training set, sorted by class.
fig, ax = plt.subplots(1,3, figsize=(18,6))
# visualise the ShuffleSplit algorithm
plot_cv_indices(ss,
                X_train, y_train,
                group=None,
                ax=ax[0],
                n_splits=10)
# visualise the KFold algorithm
plot_cv_indices(kf,
                X_train, y_train,
                group=None,
                ax=ax[1],
                n_splits=10)
# visualise the StratifiedKFold algorithm
plot_cv_indices(skf,
                X_train, y_train,
                group=None,
                ax=ax[2],
                n_splits=10)
plt.show()

### Model training and tuning

Every classifier typically has hyperparameters that need to be tuned in order to achieve good performance. For each classifier, you need to select (at least one) and explain the tuning hyperparameters of your choice. You must select and describe a suitable cross-validation/validation scheme that can measure the performance of your model on labelled data well and can address the class imbalance issue. Then you will need to conduct the actual tuning of your model and report the tuning results in detail. You are expected to look at several classification performance metrics and make comments on the classification performance of each model. Finally, you will need to clearly indicate and justify the selected values of the tuning hyperparameters of each model.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score, PredefinedSplit, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Build one fixed train/validation split for the grid searches.
# PredefinedSplit reads test_fold per sample: -1 keeps the sample out of every
# test fold (always training), 0 puts it in test fold 0. So the oversampled
# X_train rows are only trained on and the X_val rows are only scored on.
split_index = [-1]*len(X_train) + [0]*len(X_val)
print(X_train.shape)
print(X_val.shape)
print(len(split_index))

# Concatenate in the same order as split_index (train rows first, then
# validation rows) so each entry lines up with its row.
X = pd.concat([X_train, X_val])
y = pd.concat([y_train, y_val])
pds = PredefinedSplit(test_fold = split_index)
print(X_train.shape)
print(X_val.shape)
print(X.shape)
print(y.shape)

In [None]:
comp_name = "Name"
comp_param = "Param"
comp_score = "Score"
comp_val = "Validation"
comp_test = "Test"
model_comp_dict = {comp_name:[], comp_param:[], comp_score:[], comp_val:[], comp_test:[]}

In [None]:
from sklearn.metrics import precision_score, make_scorer
precision_scorer = make_scorer(precision_score, average='weighted')

#### Model: KNN

In [None]:
# Create a dictionary of all the parameters we'll be iterating over
parameters = {'weights': ['uniform','distance'], # this should be the different weighting schemes
              'n_neighbors':[1,3,7,11,17,21,25,30,35,40,45,50,55,60]} # this should be a list of the nearest neigbhours
# make a classifier object
knn = KNeighborsClassifier()
# create a GridSearchCV object to do the training with cross validation
gscv = GridSearchCV(estimator=knn,
                    param_grid=parameters,
                    # cv=skf,  # the cross validation folding pattern
                    cv=pds,
                    # scoring='accuracy')
                    scoring=precision_scorer)
# now train our model
best_knn = gscv.fit(X, y)

In [None]:
best_knn.best_params_, best_knn.best_score_ # ({'n_neighbors': 50, 'weights': 'distance'}, 0.9213341032832865)
# ({'n_neighbors': 35, 'weights': 'distance'}, 0.8519515477792732)

# Final: ({'n_neighbors': 25, 'weights': 'distance'}, 0.8703840669201928)
# ({'n_neighbors': 35, 'weights': 'distance'}, 0.9134834913944504)

In [None]:
knn = KNeighborsClassifier(weights = best_knn.best_params_['weights'],
                            n_neighbors = best_knn.best_params_['n_neighbors'])
knn.fit(X_train, y_train)

In [None]:
knn.score(X_val, y_val)
# 0.8519515477792732
# final: 0.847913862718708
# 0.8600269179004038

In [None]:
knn.score(X_test, y_test) # 0.8529886914378029
# 0.845222072678331
# final: 0.847913862718708
# 0.8492597577388964

In [None]:
model_comp_dict[comp_name].append("KNN")
model_comp_dict[comp_param].append(best_knn.best_params_)
model_comp_dict[comp_score].append(best_knn.best_score_)
model_comp_dict[comp_val].append(knn.score(X_val, y_val))
model_comp_dict[comp_test].append(knn.score(X_test, y_test))

In [None]:
fig, ax = plt.subplots(1,1, figsize=(6, 6))

ConfusionMatrixDisplay.from_estimator(knn,
                                      X_test, y_test,
                                      ax=ax)
plt.tight_layout()
plt.show()

#### Model: DT

In [None]:
from sklearn import tree

In [None]:
# Create a dictionary of all the parameters we'll be iterating over
parameters = {'criterion': ('gini','entropy'),  # this should be the different splitting criteria
              'min_samples_split': [2,3,4,5,7,10,15,20], # this should be the different values for min_samples_split
              'max_depth': [5,7,8,9,10,11,12,13,14,15]}
dtc = tree.DecisionTreeClassifier()
gscv = GridSearchCV(estimator=dtc,
                    param_grid=parameters,
                    cv=pds, #5
                    # scoring='accuracy')
                    scoring=precision_scorer)
best_dtc = gscv.fit(X, y)

In [None]:
best_dtc.best_params_, best_dtc.best_score_ # ({'criterion': 'entropy', 'max_depth': 9, 'min_samples_split': 2}, 0.8464952978056427)
#({'criterion': 'entropy', 'max_depth': 12, 'min_samples_split': 4}, 0.7577388963660835)

In [None]:
dtc = tree.DecisionTreeClassifier(criterion=best_dtc.best_params_['criterion'],
                                  min_samples_split=best_dtc.best_params_['min_samples_split'],
                                  max_depth=best_dtc.best_params_['max_depth'])
dtc.fit(X_train, y_train)

In [None]:
dtc.score(X_val, y_val)
# 0.7496635262449529

In [None]:
dtc.score(X_test, y_test) # 0.7568659127625202
# 0.7510094212651414

In [None]:
fig, ax = plt.subplots(1,1, figsize=(12,12))
tree.plot_tree(dtc,
               filled=True, # color the nodes based on class/purity
               ax=ax, fontsize=12)
plt.show()

In [None]:
model_comp_dict[comp_name].append("DT")
model_comp_dict[comp_param].append(best_dtc.best_params_)
model_comp_dict[comp_score].append(best_dtc.best_score_)
model_comp_dict[comp_val].append(dtc.score(X_val, y_val))
model_comp_dict[comp_test].append(dtc.score(X_test, y_test))

In [None]:
fig, ax = plt.subplots(1,1, figsize=(6, 6))

ConfusionMatrixDisplay.from_estimator(dtc,
                                      X_test, y_test,
                                      # display_labels=iris['target_names'],
                                      ax=ax)
plt.tight_layout()
plt.show()

#### Model: NB

In [None]:
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score

In [None]:
# Create a dictionary of all the parameters we'll be iterating over
parameters = {'var_smoothing': [1e-10, 1e-09, 1e-08, 1e-07]}
nb = naive_bayes.GaussianNB()
gscv = GridSearchCV(estimator=nb,
                    param_grid=parameters,
                    cv=pds, #5
                    # scoring='accuracy')
                    scoring=precision_scorer)
best_nb = gscv.fit(X, y)

In [None]:
best_nb.best_params_, best_nb.best_score_ #({'var_smoothing': 1e-10}, 0.6944818304172274)

In [None]:
nb = naive_bayes.GaussianNB(var_smoothing=best_nb.best_params_['var_smoothing'])
nb.fit(X_train, y_train)

In [None]:
nb.score(X_val, y_val)

In [None]:
nb.score(X_test, y_test)

In [None]:
# no parameters to adjust so no need to optimise, just train
# nb = naive_bayes.GaussianNB()
# nb.fit(X_train, y_train)
# y_pred = nb.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"NB accuracy is {accuracy:5.3f}")
# print(nb.score(X_test, y_test))
# NB accuracy is 0.698  0.6978998384491115

In [None]:
model_comp_dict[comp_name].append("NB")
model_comp_dict[comp_param].append(best_nb.best_params_)
model_comp_dict[comp_score].append(best_nb.best_score_)
model_comp_dict[comp_val].append(nb.score(X_val, y_val))
model_comp_dict[comp_test].append(nb.score(X_test, y_test))

In [None]:
fig, ax = plt.subplots(1,1)
ConfusionMatrixDisplay.from_estimator(nb,
                                      X_test, y_test,
                                      # display_labels=iris['target_names'],
                                      ax=ax)
plt.tight_layout()
plt.show()

#### Model: SVM

In [None]:
from sklearn import svm

In [None]:
# Create a dictionary of all the parameters we'll be iterating over
parameters = {'kernel': ('linear', 'poly', 'rbf'),  # the kernel functions to try
              'C': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], # the values of the regularisation parameter C
              'gamma': ['scale', 'auto']}  # the kernel-coefficient settings to try
svc = svm.SVC()
gscv = GridSearchCV(estimator=svc,
                    param_grid=parameters,
                    cv=pds, # the predefined train/validation split
                    # alternative metric: scoring='accuracy'
                    scoring=precision_scorer)
best_svc = gscv.fit(X, y)

In [None]:
best_svc.best_params_, best_svc.best_score_ # ({'C': 3.0, 'gamma': 'scale', 'kernel': 'rbf'}, 0.9500409173403728)
# ({'C': 4.0, 'gamma': 'scale', 'kernel': 'rbf'}, 0.8721399730820996)

In [None]:
svc = svm.SVC(kernel=best_svc.best_params_['kernel'],
                                  C=best_svc.best_params_['C'],
                                  gamma=best_svc.best_params_['gamma'])
svc.fit(X_train, y_train)

In [None]:
svc.score(X_val, y_val) # 0.8721399730820996

In [None]:
svc.score(X_test, y_test) # 0.9063004846526656
# 0.873485868102288

In [None]:
model_comp_dict[comp_name].append("SVM")
model_comp_dict[comp_param].append(best_svc.best_params_)
model_comp_dict[comp_score].append(best_svc.best_score_)
model_comp_dict[comp_val].append(svc.score(X_val, y_val))
model_comp_dict[comp_test].append(svc.score(X_test, y_test))

In [None]:
model_comp_dict

In [None]:
fig, ax = plt.subplots(1,1, figsize=(6, 6))

ConfusionMatrixDisplay.from_estimator(svc,
                                      X_test, y_test,
                                      # display_labels=iris['target_names'],
                                      ax=ax)
plt.tight_layout()
plt.show()

### Model comparison

Once you have finished tuning all models, you will need to compare them and explain how you select the best two models for producing the prediction on the 500 test samples.

### Prediction

In [None]:
model_comp_pd = pd.DataFrame(model_comp_dict)
model_comp_pd

#### Predict using 2 models

In [None]:
data_df.describe()

In [None]:
test_df.describe()

In [None]:
test_df.shape, data_df.shape

In [None]:
y_predict_svc = svc.predict(test_df)
y_predict_knn = knn.predict(test_df)

In [None]:
y_predict_svc

In [None]:
y_predict_knn

In [None]:
(y_predict_svc == y_predict_knn).sum()

In [None]:
type(out_df)

In [None]:
out_df['Predict1'] = y_predict_svc
out_df['Predict2'] = y_predict_knn

In [None]:
out_df

#### Saving data to an sqlite database

In [None]:
import sqlite3

In [None]:
con = sqlite3.connect('Answers.sqlite')

In [None]:
out_df.to_sql(name='predict',
          con=con,
          if_exists='replace',
          index=False) # don't save the index column to the output

In [None]:
con.close()