# Simple Health Care

## Simple Health Care is a fictitious startup that aims to reduce the number of tests required for the diagnosis of a specific type of cancer.

### Project developed as a study to deal with high-dimensional data.

### Import and preprocess the dataset

In [2]:
import pandas as pd

# Load the raw exam results; the path is relative to the notebook's working directory.
uri = "data/exams.csv"
data = (
    pd.read_csv(uri)
    # `diagnostico` becomes `diagnostic`; every `exame` substring becomes `exam`.
    .rename(columns={"diagnostico": "diagnostic"})
    .rename(columns=lambda name: name.replace("exame", "exam"))
)

data.head()

Unnamed: 0,id,diagnostic,exam_1,exam_2,exam_3,exam_4,exam_5,exam_6,exam_7,exam_8,...,exam_24,exam_25,exam_26,exam_27,exam_28,exam_29,exam_30,exam_31,exam_32,exam_33
0,842302,M,17.99,10.38,122.8,103.78,1001.0,0.1184,0.2776,0.3001,...,184.6,2019.0,0.1622,0.6656,0.7119,0.786,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,103.78,1326.0,0.08474,0.07864,0.0869,...,158.8,1956.0,0.1238,0.1866,0.2416,0.786,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,103.78,1203.0,0.1096,0.1599,0.1974,...,152.5,1709.0,0.1444,0.4245,0.4504,0.786,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,103.78,386.1,0.1425,0.2839,0.2414,...,98.87,567.7,0.2098,0.8663,0.6869,0.786,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,103.78,1297.0,0.1003,0.1328,0.198,...,152.2,1575.0,0.1374,0.205,0.4,0.786,0.1625,0.2364,0.07678,0.854454


### Remove unnecessary columns

In [None]:
# exam_33 is blank in most of the rows previewed above, so it is discarded.
data_v1 = data.drop(columns=["exam_33"])

### Split the data into training and test sets


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from numpy import random

# Seed NumPy's global RNG: neither the split nor the forest below receives
# an explicit random_state, so both draw from it.
SEED = 123
random.seed(SEED)

# Target is the diagnostic label; features are every remaining column except the id.
y = data_v1["diagnostic"]
x = data_v1.drop(columns=["id", "diagnostic"])

# Hold out 30% of the rows for evaluation.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3)

# Baseline: a 100-tree random forest trained on all features.
clf = RandomForestClassifier(n_estimators=100).fit(train_x, train_y)
acc = clf.score(test_x, test_y)
acc

In [None]:
from sklearn.dummy import DummyClassifier

SEED = 123
random.seed(SEED)

# Reference point: always predict the most frequent class seen in training.
dummy = DummyClassifier(strategy="most_frequent").fit(train_x, train_y)
acc = dummy.score(test_x, test_y)
acc

In [None]:
def plot_violin_graph(X, start, end):
    """Draw split violin plots of the columns ``X.iloc[:, start:end]`` per diagnostic.

    The selected columns are joined with the notebook-level ``y`` series
    (aligned on index), reshaped to long format and drawn with one violin
    per column, split by the ``diagnostic`` value.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its index must line up with ``y``.
    start, end : int
        Positional column bounds; ``end`` is exclusive.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    selected = X.iloc[:, start:end]
    long_format = pd.concat([y, selected], axis=1).melt(
        id_vars="diagnostic", var_name="exams", value_name="values"
    )

    plt.figure(figsize=(10, 10))
    plt.xticks(rotation=90)
    sns.violinplot(
        data=long_format,
        x="exams",
        y="values",
        hue="diagnostic",
        split=True,
    )

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every exam so the violin plots share a comparable scale.
scaler = StandardScaler()
x_v2 = pd.DataFrame(
    data = scaler.fit_transform(x),
    columns = x.keys(),
    index = x.index,  # keep row labels aligned with `y`, which plot_violin_graph concatenates
)

# iloc slices are end-exclusive, so consecutive windows must share their boundary;
# the previous 0:10 / 11:20 / 21:30 windows never plotted the columns at positions 10 and 20.
plot_violin_graph(x_v2, 0, 10)
plot_violin_graph(x_v2, 10, 20)
plot_violin_graph(x_v2, 20, 30)
plot_violin_graph(x_v2, 30, x_v2.shape[1])

### Remove constant-value columns

In [None]:
# exam_4 and exam_29 show a single repeated value in the preview above (103.78 and 0.786).
x_v3 = x_v2.drop(["exam_4", "exam_29"], axis="columns")

In [None]:
def classify(X):
    """Train a random forest on a 70/30 split of ``X`` and print its test accuracy.

    The target is always ``data_v1.diagnostic`` (notebook-level state), so ``X``
    must have one row per row of ``data_v1``. NumPy's global RNG is re-seeded
    on every call so that different feature sets are compared on the same split.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature table to evaluate.

    Returns
    -------
    None
        The accuracy is printed as a percentage with two decimals.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from numpy import random

    SEED = 123
    random.seed(SEED)

    # Only the target is read from data_v1; the features come from the argument.
    y = data_v1.diagnostic

    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.3)

    clf = RandomForestClassifier(n_estimators = 100)
    clf.fit(train_x, train_y)
    acc = clf.score(test_x, test_y) * 100
    print(f'{acc:.2f}%')

In [None]:
# Accuracy on the scaled features after dropping the two constant columns.
classify(x_v3)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between the remaining exams, drawn as an annotated heatmap.
plt.figure(figsize=(17, 15))
x_correlated = x_v3.corr()
sns.heatmap(data=x_correlated, annot=True, fmt=".1f")

In [None]:
# Keep only coefficients above 0.99; every other cell becomes NaN.
x_correlated_v1 = x_correlated.where(x_correlated > 0.99)

In [None]:
# Column-wise sum of the surviving coefficients (NaN cells are skipped).
x_correlated_v2 = x_correlated_v1.sum(axis=0)

In [None]:
# A sum above 1 means the column matched something besides its own diagonal entry.
correlated_vars = x_correlated_v2.loc[x_correlated_v2.gt(1)]
correlated_vars

In [None]:
# Drop every column flagged as highly correlated.
x_v4 = x_v3.drop(columns=correlated_vars.index)
x_v4.head()

In [None]:
# Accuracy after dropping every column flagged as highly correlated.
classify(x_v4)

In [None]:
# Alternative to x_v4: drop only exam_3 and exam_24 instead of every flagged column.
x_v5 = x_v3.drop(["exam_3", "exam_24"], axis="columns")
x_v5.head()

In [None]:
# Accuracy after dropping only exam_3 and exam_24.
classify(x_v5)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Univariate selector: keep the 5 features with the highest chi-squared score.
kbest_features = SelectKBest(score_func=chi2, k=5)

In [None]:
# Built from the unscaled `x` (not x_v5) - presumably because chi2 rejects the
# negative values that standardization introduces.
x_v6 = x.drop(["exam_3", "exam_4", "exam_24", "exam_29"], axis="columns")

In [None]:
SEED = 123
random.seed(SEED)

train_x, test_x, train_y, test_y = train_test_split(x_v6, y, test_size=0.3)

# Score the features on the training rows only, then apply the same selection to both splits.
train_x_kbest = kbest_features.fit_transform(train_x, train_y)
test_x_kbest = kbest_features.transform(test_x)

In [None]:
# Sanity check: the second dimension should match the selector's k (5).
test_x_kbest.shape

In [None]:
# Random forest trained on the 5 k-best features only.
clf = RandomForestClassifier(n_estimators=100, random_state=SEED).fit(
    train_x_kbest, train_y
)
acc = 100 * clf.score(test_x_kbest, test_y)
print(f'{acc:.2f}%')

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels.
conf_matrix = confusion_matrix(
    y_true=test_y,
    y_pred=clf.predict(test_x_kbest),
)
conf_matrix

In [None]:
# Confusion matrix rendered as an annotated heatmap with integer cell labels.
plt.figure(figsize=(10, 8))
sns.set(font_scale=1.25)
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
).set(xlabel="Predict", ylabel="Real")

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from numpy import random

SEED = 123
random.seed(SEED)

train_x, test_x, train_y, test_y = train_test_split(x_v6, y, test_size=0.3)

clf = RandomForestClassifier(n_estimators=100, random_state=SEED)
clf.fit(train_x, train_y)

# Recursive feature elimination: remove one feature per round until 5 remain.
rfe = RFE(estimator=clf, n_features_to_select=5, step=1)
train_rfe_x = rfe.fit_transform(train_x, train_y)
test_rfe_x = rfe.transform(test_x)

# Refit the forest on the reduced feature set before evaluating it.
clf.fit(train_rfe_x, train_y)

score = clf.score(test_rfe_x, test_y) * 100
conf_matrix = confusion_matrix(test_y, clf.predict(test_rfe_x))

plt.figure(figsize=(10, 8))
sns.set(font_scale=1.25)
sns.heatmap(conf_matrix, annot=True, fmt="d").set(xlabel="Predict", ylabel="Real")

print(f'{score:.2f}%')

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from numpy import random
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Reload the data and rebuild the feature table so this cell runs on its own.
uri = "data/exams.csv"
data = (
    pd.read_csv(uri)
    .rename(columns={"diagnostico": "diagnostic"})
    .rename(columns=lambda name: name.replace("exame", "exam"))
)

data_v1 = data.drop(columns=["exam_33"])

y = data_v1["diagnostic"]
x = data_v1.drop(columns=["id", "diagnostic"])
x_v6 = x.drop(columns=["exam_3", "exam_4", "exam_24", "exam_29"])

SEED = 123
random.seed(SEED)

train_x, test_x, train_y, test_y = train_test_split(x_v6, y, test_size=0.3)

clf = RandomForestClassifier(n_estimators=100, random_state=SEED)
clf.fit(train_x, train_y)

# RFE with 5-fold cross-validation: the number of features to keep is
# chosen by cross-validated accuracy instead of being fixed up front.
rfecv = RFECV(estimator=clf, cv=5, step=1, scoring="accuracy")
train_rfecv_x = rfecv.fit_transform(train_x, train_y)
test_rfecv_x = rfecv.transform(test_x)

# Refit the forest on the selected features before evaluating it.
clf.fit(train_rfecv_x, train_y)

score = clf.score(test_rfecv_x, test_y) * 100
conf_matrix = confusion_matrix(test_y, clf.predict(test_rfecv_x))

plt.figure(figsize=(10, 8))
sns.set(font_scale=1.25)
sns.heatmap(conf_matrix, annot=True, fmt="d").set(xlabel="Predict", ylabel="Real")

print(f'{score:.2f}%')

In [None]:
# Names of the columns RFECV kept (support_ is used as a mask over the training columns).
train_x.columns[rfecv.support_]

In [None]:
# `grid_scores_` was removed in scikit-learn 1.2; `cv_results_["mean_test_score"]`
# is its replacement. Fall back to the old attribute on versions that lack cv_results_.
cv_scores = (
    rfecv.cv_results_["mean_test_score"]
    if hasattr(rfecv, "cv_results_")
    else rfecv.grid_scores_
)
# Number of feature-subset sizes that were cross-validated.
len(cv_scores)

In [None]:
import matplotlib.pyplot as plt

# `grid_scores_` was removed in scikit-learn 1.2; `cv_results_["mean_test_score"]`
# is its replacement. Fall back to the old attribute on versions that lack cv_results_.
mean_cv_scores = (
    rfecv.cv_results_["mean_test_score"]
    if hasattr(rfecv, "cv_results_")
    else rfecv.grid_scores_
)

# Cross-validated accuracy as a function of how many exams are kept
# (RFECV was run with step = 1, so entry i corresponds to i + 1 features).
plt.figure(figsize = (14, 8))
plt.xlabel("Total exams")
plt.ylabel("Accuracy")
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from numpy import random

SEED = 123
random.seed(SEED)

train_x, test_x, train_y, test_y = train_test_split(x_v6, y, test_size=0.3)

clf = RandomForestClassifier(n_estimators=100, random_state=SEED)
clf.fit(train_x, train_y)

# Same elimination as before, but reduced to 2 features so the data can be drawn in 2-D.
rfe = RFE(estimator=clf, n_features_to_select=2, step=1)
train_rfe_x = rfe.fit_transform(train_x, train_y)
test_rfe_x = rfe.transform(test_x)

# Refit the forest on the two surviving features before evaluating it.
clf.fit(train_rfe_x, train_y)

score = clf.score(test_rfe_x, test_y) * 100
conf_matrix = confusion_matrix(test_y, clf.predict(test_rfe_x))

plt.figure(figsize=(10, 8))
sns.set(font_scale=1.25)
sns.heatmap(conf_matrix, annot=True, fmt="d").set(xlabel="Predict", ylabel="Real")

print(f'{score:.2f}%')

In [None]:
# Apply the fitted RFE selection to the full feature table (training and test rows together).
x_v7 = rfe.transform(x_v6)
x_v7.shape

In [None]:
import seaborn as sns

# Plot the first two selected columns against each other, coloured by diagnostic.
plt.figure(figsize = (14, 8))
sns.scatterplot(x = x_v7[:, 0], y = x_v7[:, 1], hue = y)

In [None]:
from sklearn.decomposition import PCA

# Project the standardized features (x_v5 derives from the scaled x_v2) onto two principal components.
pca = PCA(n_components = 2)
x_v8 = pca.fit_transform(x_v5)

In [None]:
# First vs. second principal component, coloured by diagnostic.
sns.scatterplot(x = x_v8[:, 0], y = x_v8[:, 1], hue = y)

In [None]:
from sklearn.manifold import TSNE

# t-SNE is stochastic: without an explicit random_state the embedding depends on
# whatever the global RNG state happens to be, so re-running this cell alone
# would draw a different picture. Pin it to the notebook seed.
tsne = TSNE(n_components = 2, random_state = SEED)
x_v9 = tsne.fit_transform(x_v5)

# Two-dimensional t-SNE embedding of the standardized features, coloured by diagnostic.
sns.scatterplot(x = x_v9[:, 0], y = x_v9[:, 1], hue = y)