# Attempt to implement kfold CV and regularization

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import plotly.express as px

# Read the two tab-separated training files. The call table is transposed
# right away, so its original column labels become the row index.
df_instances = pd.read_csv("./data/Train_call.txt", sep='\t').T
df_solution = pd.read_csv("./data/Train_clinical.txt", sep='\t')

# Row labels (after the transpose) that are left out of the feature space for now.
# They might be useful in later stages when trying to optimise the model.
drop_list = ["Chromosome", "Start", "End", "Nclone"]
dffinal = df_instances.drop(index=drop_list)

# Remove the "Sample" column from the clinical table.
df_sol_final = df_solution.drop(columns=["Sample"])


In [2]:
# Convert the pandas DataFrames to NumPy arrays (the input for the k-fold split).
# `to_numpy()` is the accessor pandas recommends over `.values`. The former
# `rename_axis('ID')` call only named the index axis, which has no effect on
# the extracted values, so it is omitted.
df_instance_array = dffinal.to_numpy()
df_solution_array = df_sol_final.to_numpy()



### Cross-validation

The scikit-learn library has a built-in class for this, called `KFold`. In this example I used the whole dataset, but **be careful**: we still need to set aside a test set that is not used in the k-fold cross-validation. This test set may only be used when assessing the finalized model.

#### k-fold cv
The k-fold cross-validation splitter takes the sample array and divides it into k folds, where k is set by `n_splits`.

- `n_splits`: the number of splits. E.g. with 7, seven different splits are made, ensuring that every sample is in the validation set exactly once.
- `shuffle=True` shuffles the samples before they are split into folds.
- `random_state=10` sets the seed for the shuffle so that the same random shuffle is performed every time. This makes separate runs comparable.

In [None]:
#sklearn example kfold cv
import numpy as np
from sklearn.model_selection import KFold

X, y = df_instance_array, df_solution_array

# Shuffled 7-fold splitter; the fixed seed keeps the folds identical between runs.
kfold = KFold(n_splits=7, shuffle=True, random_state=10)
kfold.get_n_splits(X)  # the returned value is not used here

print(kfold)

# Each iteration yields the row positions of one train/validation partition.
for train_index, val_index in kfold.split(X):
    print("TRAIN:", train_index, "VALIDATION:", val_index)
    # NOTE: the *_test names hold the validation rows of the current fold.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[val_index], y[val_index]



KFold(n_splits=7, random_state=10, shuffle=True)
TRAIN: [ 0  4  5  6  7  8  9 10 11 12 13 15 16 17 18 20 21 22 23 24 25 26 27 28
 29 30 31 32 33 34 35 36 39 40 42 44 45 46 47 48 49 50 51 52 54 55 56 57
 58 59 61 62 63 64 65 67 69 70 71 72 73 74 75 76 77 78 80 81 82 83 84 85
 86 87 88 89 90 91 92 93 94 96 97 98 99] VALIDATION: [ 1  2  3 14 19 37 38 41 43 53 60 66 68 79 95]
TRAIN: [ 0  1  2  3  4  5  7  8  9 10 11 12 13 14 15 16 17 18 19 20 22 23 24 25
 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 45 47 48 49 51 52 53
 54 56 57 58 60 62 64 65 66 67 68 69 71 72 73 75 77 78 79 80 81 82 83 84
 85 86 87 88 89 91 93 94 95 96 97 98 99] VALIDATION: [ 6 21 26 44 46 50 55 59 61 63 70 74 76 90 92]
TRAIN: [ 0  1  2  3  6  7  8  9 10 11 12 13 14 15 16 17 18 19 21 22 23 24 25 26
 27 28 29 30 31 33 36 37 38 40 41 43 44 46 47 49 50 51 53 54 55 56 57 59
 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 85
 86 87 88 89 90 91 92 93 94 95 96 97 98 99] VALIDATION: [ 4  5 20 32 34 3

In [None]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier

# Hold out a test set BEFORE cross-validating. It must not take part in the
# k-fold procedure and is only meant for assessing the finalized model.
# X and y are the arrays defined in the k-fold example cell.
X_trainset, X_testset, y_trainset, y_testset = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# Placeholder estimator: swap in any scikit-learn classifier here.
# max_depth caps the depth of the tree and acts as the regularisation knob;
# 3 is an arbitrary starting value that still needs tuning.
model = DecisionTreeClassifier(max_depth=3, random_state=10)

# Same splitter settings as in the k-fold example so that runs stay comparable.
kfold = KFold(n_splits=7,shuffle = True, random_state=10)
results = cross_val_score(model, X_trainset, y_trainset, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

### (L1) regularisation (see google docs)

I don't think we can implement L1 regularisation directly. However, we can use the **maximum depth parameter** (and others) as the regularisation parameter. This will ensure that the model does not exceed a certain depth. Other regularisation methods are discussed in the Google Docs!


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c915e4f9-60c2-40b5-a522-8a90cb3fd50a' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>