# Skin Cancer Detection

**Load packages**

In [1]:
# data analysis stack
import numpy as np
import pandas as pd

# data visualization stack
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

# machine learning stack
from sklearn.linear_model import LogisticRegression

# miscellaneous
import scipy.stats as ss
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including any that scikit-learn emits while fitting models below — consider
# narrowing the filter to specific categories so real problems stay visible.
warnings.filterwarnings("ignore")

**Load data**

In [3]:
# Load the training table and put it in a fixed, shuffled row order.
# The grid search below uses cv=5 (integer cv => folds are taken without
# shuffling), so fold membership -- and therefore the cross-validated scores
# and the selected hyperparameters -- depends on this row order. Seeding the
# shuffle makes "Restart Kernel -> Run All" reproduce the same numbers.
df = (
    pd.read_csv("Augmentation/train_df.csv")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    # leftover index columns, presumably written by an earlier export -- TODO confirm
    .drop(["index", "level_0"], axis=1)
    # normalise column names to lower case (done after the drop, which uses the raw names)
    .rename(columns=lambda x: x.lower())
)

In [4]:
# Feature matrix: every column except the label and the image identifier.
X = df.drop(columns=["diagnostic", "img_id"])

In [5]:
# Target vector: the 'diagnostic' column that was excluded from X.
y = df['diagnostic']

### Model Selection

**Parameter grid**

In [6]:
# Search space for LogisticRegression, as a list of sub-grids so that every
# penalty is paired with a solver that supports it:
#   * 'l2'         -> works with the estimator's default solver, left unchanged
#   * 'l1'         -> needs a solver with L1 support ('saga')
#   * 'elasticnet' -> only supported by 'saga' and additionally requires l1_ratio
# With a single flat grid the 'l1' and 'elasticnet' candidates cannot be fitted by
# the default solver, so their fits fail and they are scored as NaN instead of
# being genuinely evaluated.
param_grid = [
    {
        'penalty': ['l2'],
        'C': [0.1, 0.5, 1.0],
    },
    {
        'penalty': ['l1'],
        'solver': ['saga'],
        'C': [0.1, 0.5, 1.0],
    },
    {
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0.5],  # equal mix of L1 and L2; widen this list to tune the mix
        'C': [0.1, 0.5, 1.0],
    },
]

**Instantiate GridSearchCV**

In [7]:
from sklearn.model_selection import GridSearchCV

In [8]:
# Exhaustive search over `param_grid` using 5-fold cross-validation, ranking
# candidates by accuracy. n_jobs=-1 runs fits in parallel on all processors and
# verbose=1 prints the fold/candidate summary when fitting.
gscv = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)

**Grid-search cross-validation**

In [9]:
import time

In [10]:
# Time the grid search with time.perf_counter(): a monotonic, high-resolution
# clock intended for measuring intervals. time.time() is wall-clock time and can
# jump if the system clock is adjusted mid-run, which would distort the result.
ti = time.perf_counter()

# grid-search cross-validation over param_grid on the full training data
gscv.fit(X, y)

tf = time.perf_counter()

# elapsed seconds, rounded to two decimals
print(f"time taken: {round(tf-ti,2)} sec")

Fitting 5 folds for each of 9 candidates, totalling 45 fits


time taken: 1.62 sec


**Best hyperparameters**

In [11]:
# Parameter combination from param_grid with the best mean cross-validated score.
gscv.best_params_

{'C': 0.5, 'penalty': 'l2'}

**Best score**

In [12]:
# Mean cross-validated accuracy (scoring='accuracy', cv=5) of the best candidate.
round(gscv.best_score_,6)

0.678333

**Model selection**

In [13]:
# best_estimator_ is the estimator GridSearchCV refit on all of X, y with the
# best parameters (refit is left at its default of True above).
best_model = gscv.best_estimator_
best_model

**Build model**

In [14]:
# Fit on the full training set. NOTE(review): GridSearchCV already refit
# best_estimator_ on X, y (refit defaults to True), so this repeats that fit.
# The trailing ';' suppresses the estimator repr in the cell output.
best_model.fit(X,y);

**Model validation**

In [15]:
# Mean accuracy of the fitted model on the same data it was trained on.
# This is a resubstitution score, so it is expected to sit above the
# cross-validated score reported by the grid search.
training_score = best_model.score(X, y)
print(f'Train score: {round(training_score,6)}')

Train score: 0.765


### Model Evaluation on Test Data

**Load test data**

In [16]:
# Read the held-out test table, shuffle its rows, renumber the index from zero
# and lower-case every column name.
test = (
    pd.read_csv("Augmentation/test_df.csv")
    .sample(frac=1)
    .reset_index(drop=True)
    .rename(columns=str.lower)
)

In [18]:
# Test features: drop the label and image id, then select exactly the columns
# the model was trained on, in training order. The training frame has extra
# columns removed during loading that the test frame does not, so selecting by
# X.columns keeps the two feature matrices aligned. A training column missing
# from the test file raises a KeyError here rather than failing later in scoring.
X_test = test.drop(["diagnostic", "img_id"], axis=1)[X.columns]

In [19]:
# Ground-truth labels for the test rows.
y_test = test['diagnostic']

**Performance**

In [20]:
# Mean accuracy of the fitted model on the held-out test rows.
test_score = best_model.score(X_test, y_test)
print(f'Test score : {round(test_score,6)}')

Test score : 0.75
