In [None]:
#import library yang dibutuhkan
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import KElbowVisualizer
from sklearn.decomposition import PCA
import joblib
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
df = pd.read_csv("oral_cancer_prediction_dataset.csv")
df.head()

Unnamed: 0,ID,Country,Age,Gender,Tobacco Use,Alcohol Consumption,HPV Infection,Betel Quid Use,Chronic Sun Exposure,Poor Oral Hygiene,...,Difficulty Swallowing,White or Red Patches in Mouth,Tumor Size (cm),Cancer Stage,Treatment Type,"Survival Rate (5-Year, %)",Cost of Treatment (USD),Economic Burden (Lost Workdays per Year),Early Diagnosis,Oral Cancer (Diagnosis)
0,1,Italy,36,Female,Yes,Yes,Yes,No,No,Yes,...,No,No,0.0,0,No Treatment,100.0,0.0,0,No,No
1,2,Japan,64,Male,Yes,Yes,Yes,No,Yes,Yes,...,No,No,1.782186,1,No Treatment,83.340103,77772.5,177,No,Yes
2,3,UK,37,Female,No,Yes,No,No,Yes,Yes,...,No,Yes,3.523895,2,Surgery,63.222871,101164.5,130,Yes,Yes
3,4,Sri Lanka,55,Male,Yes,Yes,No,Yes,No,Yes,...,No,No,0.0,0,No Treatment,100.0,0.0,0,Yes,No
4,5,South Africa,68,Male,No,No,No,No,No,Yes,...,No,No,2.834789,3,No Treatment,44.293199,45354.75,52,No,Yes


In [None]:
df.isnull().sum()

Unnamed: 0,0
ID,0
Country,0
Age,0
Gender,0
Tobacco Use,0
Alcohol Consumption,0
HPV Infection,0
Betel Quid Use,0
Chronic Sun Exposure,0
Poor Oral Hygiene,0


In [None]:
df.duplicated().sum()

np.int64(0)

In [None]:
df = df.drop(columns=[col for col in df.columns if 'ID' in col])
df.head()

Unnamed: 0,Country,Age,Gender,Tobacco Use,Alcohol Consumption,HPV Infection,Betel Quid Use,Chronic Sun Exposure,Poor Oral Hygiene,Diet (Fruits & Vegetables Intake),...,Difficulty Swallowing,White or Red Patches in Mouth,Tumor Size (cm),Cancer Stage,Treatment Type,"Survival Rate (5-Year, %)",Cost of Treatment (USD),Economic Burden (Lost Workdays per Year),Early Diagnosis,Oral Cancer (Diagnosis)
0,Italy,36,Female,Yes,Yes,Yes,No,No,Yes,Low,...,No,No,0.0,0,No Treatment,100.0,0.0,0,No,No
1,Japan,64,Male,Yes,Yes,Yes,No,Yes,Yes,High,...,No,No,1.782186,1,No Treatment,83.340103,77772.5,177,No,Yes
2,UK,37,Female,No,Yes,No,No,Yes,Yes,Moderate,...,No,Yes,3.523895,2,Surgery,63.222871,101164.5,130,Yes,Yes
3,Sri Lanka,55,Male,Yes,Yes,No,Yes,No,Yes,Moderate,...,No,No,0.0,0,No Treatment,100.0,0.0,0,Yes,No
4,South Africa,68,Male,No,No,No,No,No,Yes,High,...,No,No,2.834789,3,No Treatment,44.293199,45354.75,52,No,Yes


In [None]:
numeric_columns = df.select_dtypes(include=[np.number]).columns #mengambil kolom dengn tipe data numerik atau angka
scaler = MinMaxScaler() #ini pemanggilan fungsi manmixscler
df[numeric_columns] = scaler.fit_transform(df[numeric_columns]) #ini proses minmaxscaler pada kolom numerik
df[numeric_columns].head()#ini untuk menampilkan data numeric yang sudah di minmaxscaller

Unnamed: 0,Age,Tumor Size (cm),Cancer Stage,"Survival Rate (5-Year, %)",Cost of Treatment (USD),Economic Burden (Lost Workdays per Year)
0,0.244186,0.0,0.0,1.0,0.0,0.0
1,0.569767,0.297036,0.25,0.814888,0.486115,0.988827
2,0.255814,0.587326,0.5,0.591362,0.632326,0.726257
3,0.465116,0.0,0.0,1.0,0.0,0.0
4,0.616279,0.472473,0.75,0.38103,0.283488,0.290503


In [None]:
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
label_encoder = LabelEncoder()
df[categorical_columns] = df[categorical_columns].apply(label_encoder.fit_transform)
df[categorical_columns].head()

Unnamed: 0,Country,Gender,Tobacco Use,Alcohol Consumption,HPV Infection,Betel Quid Use,Chronic Sun Exposure,Poor Oral Hygiene,Diet (Fruits & Vegetables Intake),Family History of Cancer,Compromised Immune System,Oral Lesions,Unexplained Bleeding,Difficulty Swallowing,White or Red Patches in Mouth,Treatment Type,Early Diagnosis,Oral Cancer (Diagnosis)
0,6,0,1,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0
1,7,1,1,1,1,0,1,1,0,0,0,0,1,0,0,1,0,1
2,15,0,0,1,0,0,1,1,2,0,0,0,0,0,1,3,1,1
3,13,1,1,1,0,1,0,1,2,0,0,1,0,0,0,1,1,0
4,12,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1


In [None]:
y = df['Oral Cancer (Diagnosis)']
x = df.drop(columns=['Oral Cancer (Diagnosis)'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
decisiontree = DecisionTreeClassifier()
decisiontree.fit(x_train, y_train)

In [None]:
ypred = decisiontree.predict(x_test)
accuracy = accuracy_score(y_test, ypred)
precision = precision_score(y_test, ypred)
recall = recall_score(y_test, ypred)
f1 = f1_score(y_test, ypred)
print(f"accuracy: {accuracy:.2f}")
print(f"precision: {precision:.2f}")
print(f"recall: {recall:.2f}")
print(f"f1 Score: {f1:.2f}")

accuracy: 1.00
precision: 1.00
recall: 1.00
f1 Score: 1.00
