In [59]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


def create_target(value):

    if(value in ["A", "B","C","D"]):
        return "hyperthyroid condition"

    elif(value in ["E", "F", "G", "H"]):
        return "hypothyroid condition"

    elif(value in ["I", "J"]):
        return "binding protein"

    elif(value in ["L", "M", "N"]):
        return "replacement therapy"

    elif(value == "R"):
        return "discordant results"

    elif(value == "K"):
        return "general health"

    elif(value == "-"):
        return "hypothyroid condition"

    return "other"

def replaceCol(col):
    arr = []
    mapa = dict()
    n = -1
    for line in col:
        if(line == "?"):
            arr.append(-1)
        elif(line not in mapa.values):
            n+=1
            mapa.update(line, n)
            arr.append(n)
        else:
            e = mapa.get(line)
            arr.append(e)
    return arr

def naive_model_testing(train, test):

    #test 2 models, DTs and LR, and print out the results
    dtr= DecisionTreeRegressor(max_depth=5)
    dtr.fit(train.drop('target', axis=1), train['target'])

    lmr=LinearRegression()
    lmr.fit(train.drop('target', axis=1), train['target'])

   # rf_preds=rfr.predict(X_test)
    dt_preds=dtr.predict(test.drop('target', axis=1))
    lr_preds=lmr.predict(test.drop('target', axis=1))

   # print("RVE RFs: %7.4f" % explained_variance_score(y_test, rf_preds))
    print("R2 Decision Tree Regression: %7.4f" % f1_score(test['target'], dt_preds))
    print("R2 Linear Regression: %7.4f" % f1_score(test['target'], lr_preds))


data = pd.read_csv("proj-data.csv")

data['target'] = data['diagnoses'].apply(create_target)

data = data.drop(['diagnoses', '[record identification]'], axis=1)

data = data.replace('?', np.NaN)
data = data.replace('f', 0)
data = data.replace('t', 1)
data = data.replace('M', 1)
data = data.replace('F', 0)

data['referral source:'] = data['referral source:'].astype('category').cat.codes#replaceCol(data['referral source:']) #
data['target'] = data['target'].astype('category').cat.codes#replaceCol(data['target']) #
#----------------------------------------------------------------------------------------



# para avaliar o modelo, vamos dividir o dataset em treino e teste


  data = data.replace('t', 1)
  data = data.replace('F', 0)


In [36]:

spear = data.corr(method='spearman')

d = spear['target'].abs().nlargest(30)
# print("spearman")
# print(d)

In [52]:
pearson = data.corr(method='pearson')

d = pearson['target'].abs().nlargest(30)
print("pearson")
print(d)

pearson
target                        1.000000
TBG:                          0.334092
on thyroxine:                 0.252414
pregnant:                     0.223739
T4U:                          0.184534
referral source:              0.184009
TT4:                          0.107681
T3:                           0.094007
TT4 measured:                 0.088449
T4U measured:                 0.083256
FTI measured:                 0.082986
TBG measured:                 0.081623
T3 measured:                  0.074284
TSH measured:                 0.059817
TSH:                          0.055648
thyroid surgery:              0.046009
query hypothyroid:            0.045878
I131 treatment:               0.037085
FTI:                          0.032401
query hyperthyroid:           0.031676
query on thyroxine:           0.028076
on antithyroid medication:    0.017606
sex:                          0.012169
sick:                         0.008605
lithium:                      0.005053
psych:           

In [38]:

data.filter(pearson['target'].abs().nlargest(10))

X = data.drop(['target'], axis=1)
y = data['target']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#train, test = train_test_split(data, test_size=0.2, random_state=0)

#f1_score(y_train, y_test)

# Assuming you've imported necessary libraries and defined 'data' and 'pearson' appropriately

# Filter data based on the top correlated features
# top_correlated_features = pearson['target'].abs().nlargest(10).index
# filtered_data = data[top_correlated_features]

# # Define features (X) and target (y)
# X = filtered_data.drop(['target'], axis=1)
# y = filtered_data['target']

# x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)



In [39]:
# N, M = train.shape
# M = M-1
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(x_train,y_train)
# sfs = SequentialFeatureSelector(tree, n_features_to_select=5).set_output(transform="pandas")
# sfs.fit(X, y)

pred = tree.predict(x_test)

#get the relevant columns
# features=sfs.get_support()
# Features_selected =np.arange(M)[features]
# print("The features selected are columns: ", Features_selected)
# print(data.columns[Features_selected])

f1_score(y_test, pred, average="weighted")

0.8690306859680775

In [73]:

for i in range(2, data.columns.size):

    dt = data.filter(pearson['target'].abs().nlargest(i).index)
    dt['target'] = data['target']

    X = dt.drop(['target'], axis=1)
    y = dt['target']
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    tree = DecisionTreeClassifier()
    tree.fit(x_train,y_train)
    test_pred = tree.predict(x_test)
    train_pred = tree.predict(x_train)
    print(i,": ")
    print(f1_score(y_test, pred, average="weighted"))
    print(accuracy_score(y_train, train_pred))
    print(accuracy_score(y_test, test_pred), "\n")



2 : 
0.7213657578946404
0.8156729131175469
0.8038147138964578 

3 : 
0.7213657578946404
0.8158432708688246
0.8038147138964578 

4 : 
0.7213657578946404
0.8204429301533219
0.8051771117166212 

5 : 
0.7213657578946404
0.837649063032368
0.8044959128065395 

6 : 
0.7213657578946404
0.8495741056218058
0.8038147138964578 

7 : 
0.7213657578946404
0.982793867120954
0.8065395095367848 

8 : 
0.7213657578946404
0.9965928449744463
0.8869209809264306 

9 : 
0.7213657578946404
0.9977853492333901
0.8862397820163488 

10 : 
0.7213657578946404
0.9969335604770017
0.8807901907356949 

11 : 
0.7213657578946404
0.9971039182282794
0.8807901907356949 

12 : 
0.7213657578946404
0.996252129471891
0.8794277929155313 

13 : 
0.7213657578946404
0.9974446337308348
0.8841961852861036 

14 : 
0.7213657578946404
0.9988074957410562
0.8835149863760218 

15 : 
0.7213657578946404
0.999488926746167
0.9162125340599455 

16 : 
0.7213657578946404
0.9988074957410562
0.9243869209809265 

17 : 
0.7213657578946404
0.9988074957