In [41]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import r2_score
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


def create_target(value):
    """Map a raw diagnosis code to its coarse diagnostic group label.

    The code is compared against each group of known codes in turn and the
    label of the first matching group is returned.

    Args:
        value: The raw diagnosis code (a single string such as "A" or "-").

    Returns:
        The group label as a string, or "other" when the code belongs to
        none of the known groups.
    """
    # Ordered (codes, label) pairs; the first group containing the code wins.
    code_groups = (
        (("A", "B", "C", "D"), "hyperthyroid condition"),
        (("E", "F", "G", "H"), "hypothyroid condition"),
        (("I", "J"), "binding protein"),
        (("L", "M", "N"), "replacement therapy"),
        (("R",), "discordant results"),
        (("K",), "general health"),
        # NOTE(review): "-" is labelled as a hypothyroid condition here;
        # confirm this is intended and not meant to be its own class.
        (("-",), "hypothyroid condition"),
    )

    for codes, label in code_groups:
        if value in codes:
            return label

    return "other"

def replaceCol(col):
    """Encode the values of a column as integer codes.

    Every distinct value receives the next unused code (0, 1, 2, ...) in
    order of first appearance. The missing-value marker "?" is always
    encoded as -1 and never consumes a code.

    Args:
        col: Iterable of hashable values (for example a list or a pandas
            Series of strings).

    Returns:
        A list of integer codes, one per input value, in input order.
    """
    codes = {}
    encoded = []
    for value in col:
        if value == "?":
            encoded.append(-1)
        else:
            # setdefault returns the existing code for a value seen before,
            # otherwise stores and returns the next free code. len(codes) is
            # evaluated before the insertion, so codes start at 0.
            encoded.append(codes.setdefault(value, len(codes)))
    return encoded

def naive_model_testing(train, test):
    """Fit two baseline regressors on ``train`` and print their R2 on ``test``.

    A decision tree regressor (maximum depth 5) and a linear regression are
    each fitted on every column of ``train`` except 'target', using the
    'target' column as the response. Both models then predict on ``test``
    and the R2 score of each is printed.

    Args:
        train: DataFrame with a 'target' column, used for fitting.
        test: DataFrame with a 'target' column, used for scoring.

    Returns:
        None. The scores are only printed.
    """
    # NOTE(review): r2_score comes from the file-level import
    # `from sklearn.base import r2_score`; the documented public location is
    # sklearn.metrics -- confirm the import works in the target environment.
    train_features = train.drop('target', axis=1)
    train_target = train['target']

    tree_reg = DecisionTreeRegressor(max_depth=5)
    tree_reg.fit(train_features, train_target)

    linear_reg = LinearRegression()
    linear_reg.fit(train_features, train_target)

    test_features = test.drop('target', axis=1)
    test_target = test['target']

    tree_predictions = tree_reg.predict(test_features)
    linear_predictions = linear_reg.predict(test_features)

    tree_r2 = r2_score(test_target, tree_predictions)
    print("R2 Decision Tree Regression: %7.4f" % tree_r2)
    linear_r2 = r2_score(test_target, linear_predictions)
    print("R2 Linear Regression: %7.4f" % linear_r2)

# Load the raw records; the path is relative to the current working directory.
data = pd.read_csv("proj-data.csv")

# Collapse the raw diagnosis codes into the coarse label from create_target.
data['target'] = data['diagnoses'].apply(create_target)

# The raw diagnosis and the record identifier are not used as features.
data = data.drop(['diagnoses', '[record identification]'], axis=1)

# Encode the missing-value marker and the flag-like string values as numbers.
# np.nan is used instead of np.NaN because the NaN alias was removed in
# NumPy 2.0.
# NOTE(review): these replacements match whole-cell values in every column,
# not only the columns they are meant for -- confirm no other column uses
# 'f', 't', 'M' or 'F' as a value.
data = data.replace('?', np.nan)
data = data.replace('f', 0)
data = data.replace('t', 1)
data = data.replace('M', 1)
data = data.replace('F', 0)

# Turn the remaining categorical columns into integer category codes
# (pandas assigns code -1 to missing values).
data['referral source:'] = data['referral source:'].astype('category').cat.codes
data['target'] = data['target'].astype('category').cat.codes

def _report_target_correlation(frame, method, top=30):
    """Print the columns most correlated (in absolute value) with 'target'.

    Args:
        frame: DataFrame containing a 'target' column.
        method: Correlation method name passed to DataFrame.corr; it is also
            printed as the heading of the report.
        top: Number of largest absolute correlations to keep.

    Returns:
        A tuple (correlation matrix, Series of the largest absolute
        correlations with 'target').
    """
    matrix = frame.corr(method=method)
    strongest = matrix['target'].abs().nlargest(top)
    print(method)
    print(strongest)
    return matrix, strongest


# Rank-based correlation first, then linear correlation. `d` ends up holding
# the Pearson ranking, as the second call rebinds it.
spear, d = _report_target_correlation(data, 'spearman')
pearson, d = _report_target_correlation(data, 'pearson')

# Separate the encoded label from the predictor columns.
y = data['target']
X = data.drop(columns=['target'])

# Two 80/20 splits with the same seed: one as separate X/y parts and one as
# whole frames that still contain the 'target' column.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
train, test = train_test_split(data, test_size=0.2, random_state=0)


# N is the number of training rows; M is the number of predictor columns
# (every column except 'target').
N = train.shape[0]
M = train.shape[1] - 1

# Sequential feature selection driven by a shallow decision tree classifier.
tree = DecisionTreeClassifier(max_depth=3)
sfs = SequentialFeatureSelector(tree, n_features_to_select=5)
sfs = sfs.set_output(transform="pandas")
# NOTE(review): the selector is fitted on the full X/y rather than on the
# training split -- confirm this is intended before evaluating on the test set.
sfs.fit(X, y)

# Translate the boolean support mask into column positions and names.
features = sfs.get_support()
column_positions = np.arange(M)
Features_selected = column_positions[features]
print("The features selected are columns: ", Features_selected)
selected_names = data.columns[Features_selected]
print(selected_names)

# To evaluate the model, we split the dataset into training and test sets.


  data = data.replace('t', 1)
  data = data.replace('F', 0)


spearman
target                        1.000000
TBG:                          0.661215
on thyroxine:                 0.287526
referral source:              0.177709
pregnant:                     0.146479
TBG measured:                 0.123879
TT4 measured:                 0.123322
FTI measured:                 0.104956
T4U measured:                 0.104952
T3 measured:                  0.101581
TSH:                          0.091952
TSH measured:                 0.073706
thyroid surgery:              0.061383
query hyperthyroid:           0.058473
TT4:                          0.058177
query hypothyroid:            0.053465
sex:                          0.046490
I131 treatment:               0.041666
query on thyroxine:           0.025741
on antithyroid medication:    0.021126
sick:                         0.017615
FTI:                          0.015591
psych:                        0.011145
age:                          0.008339
tumor:                        0.008296
T3:             