# Importing Libraries

In [118]:
import requests
import zipfile
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Importing Dataset

In [138]:
url = "https://github.com/mohsley/skin-cancer-detection/raw/refs/heads/main/data.zip"

In [139]:
zip_file_path = "data.zip"
response = requests.get(url)
if response.status_code == 200:
    with open(zip_file_path, "wb") as file:
        file.write(response.content)
    print("ZIP file downloaded successfully.")
else:
    print(f"Failed to download the ZIP file. Status code: {response.status_code}")
    exit()

ZIP file downloaded successfully.


In [140]:
extract_folder = "extracted_data"
os.makedirs(extract_folder, exist_ok=True)
try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)
    print("ZIP file extracted successfully.")
except zipfile.BadZipFile:
    print("Error: The downloaded file is not a valid ZIP file.")

ZIP file extracted successfully.


In [141]:
!rm "/content/data.zip"

In [123]:
df = pd.read_csv("/content/extracted_data/data/train-metadata.csv")

  df = pd.read_csv("/content/extracted_data/data/train-metadata.csv")


In [124]:
df.head()

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,,Benign,Benign,,,,,,,70.44251


In [125]:
missing_percentages = (df.isnull().sum() / len(df)) * 100
columns_to_keep = missing_percentages[missing_percentages < 50].index
skin_cancer_df= df[columns_to_keep]
skin_cancer_df = skin_cancer_df.fillna(skin_cancer_df.mode().iloc[0])
skin_cancer_df.shape

(401059, 48)

In [126]:
skin_cancer_df.head()

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z,attribution,copyright_license,iddx_full,iddx_1,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,0.590476,85,-182.703552,613.493652,-42.427948,Memorial Sloan Kettering Cancer Center,CC-BY,Benign,Benign,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,0.285714,55,-0.078308,1575.687,57.1745,Memorial Sloan Kettering Cancer Center,CC-BY,Benign,Benign,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,0.361905,105,123.6497,1472.01,232.9089,Memorial Sloan Kettering Cancer Center,CC-BY,Benign,Benign,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,0.209581,130,-141.02478,1442.185791,58.359802,ACEMID MIA,CC-0,Benign,Benign,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,0.313433,20,-72.31564,1488.72,21.42896,Memorial Sloan Kettering Cancer Center,CC-BY,Benign,Benign,70.44251


In [127]:
numerical_df = skin_cancer_df.select_dtypes(include=["number"])


In [128]:
numerical_df.head()

Unnamed: 0,target,age_approx,clin_size_long_diam_mm,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,tbp_lv_Cext,tbp_lv_H,...,tbp_lv_perimeterMM,tbp_lv_radial_color_std_max,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z,tbp_lv_dnn_lesion_confidence
0,0,60.0,3.04,20.244422,16.261975,26.922447,23.954773,33.684638,28.953117,53.058545,...,9.307003,0.0,2.036195,2.63778,0.590476,85,-182.703552,613.493652,-42.427948,97.517282
1,0,60.0,1.1,31.71257,25.36474,26.331,24.54929,41.21903,35.29926,39.70291,...,3.354148,0.0,0.853227,3.912844,0.285714,55,-0.078308,1575.687,57.1745,3.141455
2,0,60.0,3.4,22.57583,17.12817,37.97046,33.48541,44.17492,37.6118,59.26585,...,8.886309,0.0,1.743651,1.950777,0.361905,105,123.6497,1472.01,232.9089,99.80404
3,0,65.0,3.22,14.242329,12.164757,21.448144,21.121356,25.7462,24.374023,56.414429,...,9.514499,0.66469,1.258541,1.573733,0.209581,130,-141.02478,1442.185791,58.359802,99.989998
4,0,55.0,2.73,24.72552,20.05747,26.4649,25.71046,36.21798,32.60874,46.94607,...,6.467562,0.0,2.085409,2.480509,0.313433,20,-72.31564,1488.72,21.42896,70.44251


In [129]:
numerical_df.shape[1]

36

In [130]:
X = numerical_df.drop('target', axis=1)
y = numerical_df['target']

In [131]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [132]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# KNN

In [133]:
k = 2
knn = KNeighborsClassifier(n_neighbors=k)

In [134]:
knn.fit(X_train, y_train)

In [135]:
y_pred = knn.predict(X_test)

In [136]:
accuracy = accuracy_score(y_test, y_pred)

In [137]:
print(accuracy)

0.9990525108462575
