In [52]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import plotly.express as px
from sklearn.svm import LinearSVC


In [18]:
data = pd.read_csv('Thyroid_Diff.csv', sep = ',')


In [19]:
data

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,72,M,Yes,Yes,Yes,Euthyroid,Single nodular goiter-right,Right,Papillary,Uni-Focal,High,T4b,N1b,M1,IVB,Biochemical Incomplete,Yes
379,81,M,Yes,No,Yes,Euthyroid,Multinodular goiter,Extensive,Papillary,Multi-Focal,High,T4b,N1b,M1,IVB,Structural Incomplete,Yes
380,72,M,Yes,Yes,No,Euthyroid,Multinodular goiter,Bilateral,Papillary,Multi-Focal,High,T4b,N1b,M1,IVB,Structural Incomplete,Yes
381,61,M,Yes,Yes,Yes,Clinical Hyperthyroidism,Multinodular goiter,Extensive,Hurthel cell,Multi-Focal,High,T4b,N1b,M0,IVA,Structural Incomplete,Yes


In [20]:
data.columns

Index(['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy',
       'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology',
       'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred'],
      dtype='object')

In [21]:

fig = px.bar(data, x = 'Risk', y = 'Age', color= 'Recurred', 
             hover_data=['Pathology'], height= 500, 
             title = 'Recuperação por Risco',
             labels={'Age': 'Idade','Pathology': 'Patologia'}, 
            #  attern_shape='Risk', 
             text_auto= True)
fig.show()



In [22]:
stri = px.strip(data, x = 'Age', y = 'Recurred', color='Gender', facet_col='Pathology')
stri.show()

In [23]:
stri2 = px.strip(data, x = 'Age', y = 'Risk', color='Recurred', 
                 facet_col='Focality')
stri2.show()

In [24]:
px.bar(data, x = 'Focality', y = 'Age', color ='Recurred', facet_col='Risk')

In [25]:
data.isnull().sum()

Age                     0
Gender                  0
Smoking                 0
Hx Smoking              0
Hx Radiothreapy         0
Thyroid Function        0
Physical Examination    0
Adenopathy              0
Pathology               0
Focality                0
Risk                    0
T                       0
N                       0
M                       0
Stage                   0
Response                0
Recurred                0
dtype: int64

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Age                   383 non-null    int64 
 1   Gender                383 non-null    object
 2   Smoking               383 non-null    object
 3   Hx Smoking            383 non-null    object
 4   Hx Radiothreapy       383 non-null    object
 5   Thyroid Function      383 non-null    object
 6   Physical Examination  383 non-null    object
 7   Adenopathy            383 non-null    object
 8   Pathology             383 non-null    object
 9   Focality              383 non-null    object
 10  Risk                  383 non-null    object
 11  T                     383 non-null    object
 12  N                     383 non-null    object
 13  M                     383 non-null    object
 14  Stage                 383 non-null    object
 15  Response              383 non-null    ob

In [27]:
# Para fazer o grafico de correlação entre os dados primeiro
# preciso transformar os dados em inteiros

label = LabelEncoder()
for coluna in data:
    data[coluna + ' ' + 'Incoder'] = label.fit_transform(data[coluna])

In [28]:
data = data.select_dtypes('int')

In [29]:
data = data.drop('Age Incoder', axis = 1)

In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   Age                           383 non-null    int64
 1   Gender Incoder                383 non-null    int32
 2   Smoking Incoder               383 non-null    int32
 3   Hx Smoking Incoder            383 non-null    int32
 4   Hx Radiothreapy Incoder       383 non-null    int32
 5   Thyroid Function Incoder      383 non-null    int32
 6   Physical Examination Incoder  383 non-null    int32
 7   Adenopathy Incoder            383 non-null    int32
 8   Pathology Incoder             383 non-null    int32
 9   Focality Incoder              383 non-null    int32
 10  Risk Incoder                  383 non-null    int32
 11  T Incoder                     383 non-null    int32
 12  N Incoder                     383 non-null    int32
 13  M Incoder                     383 n

In [31]:
y = data['Recurred Incoder']
X = data.drop('Recurred Incoder', axis = 1)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [54]:
from sklearn.model_selection import GridSearchCV, KFold
import warnings 
warnings.filterwarnings('ignore')
logic = LogisticRegression()

params_logisc = {'penalty': ['None', 'l2', 'l1', 'elasticnet'],
                 'C': [0.5, 0.7, 0.9, 1],
                 'max_iter': [1000,30000,2000,20000],
                 'verbose': [0,1,2,3]
                 }

grid_logic = GridSearchCV(logic, param_grid= params_logisc, cv=5, verbose = 1)
grid_logic.fit(x_train, y_train)



Fitting 5 folds for each of 256 candidates, totalling 1280 fits


In [51]:
standar = StandardScaler()
scalar = standar.fit_transform(x_train)


In [55]:
svm = LinearSVC(random_state=42)

svm_params = {'max_iter': [1000,2000,3000,4000,5000,10000],
              'verbose': [0,1,2,3],
              'intercept_scaling': [1,3,4,6,8],
              'penalty': ['l1', 'l2']
              }

svm_grid = GridSearchCV(svm, param_grid = svm_params, verbose = 1, cv = 6)
svm_grid.fit(scalar, y_train)

Fitting 6 folds for each of 240 candidates, totalling 1440 fits
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][

In [None]:
random_forest = RandomForestClassifier()
random_params = {'n_estimators': [500],
                 'random_state': [42,84,128,200],
                 'criterion': ['gine', 'entropy', 'log_loss'],
                 'n_jobs': [1,3,4,5],
                 'verbose': [1,2,3]
                 }


random_cv = GridSearchCV(random_forest, param_grid= random_params, verbose=2, cv = 5)
random_cv.fit(x_train, y_train)

In [60]:
random_cv.best_score_
random_cv.best_params_

{'criterion': 'entropy',
 'n_estimators': 500,
 'n_jobs': 1,
 'random_state': 42,
 'verbose': 1}

In [70]:
gnb = GaussianNB()
gaussian = gnb.fit(x_train, y_train)
gaussian.score(x_train, y_train)

0.9040697674418605

In [74]:
y_pred_svm = svm_grid.best_estimator_.predict(x_test)
y_pred_gnb = gaussian.predict(x_test)
y_pred_forest = random_cv.best_estimator_.predict(x_test)
y_pred_logistc = grid_logic.best_estimator_.predict(x_test)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.0s


In [78]:
from sklearn.metrics import  classification_report


print(classification_report(y_test, y_pred_svm))
print()

print(classification_report(y_test, y_pred_gnb))
print()

print(classification_report(y_test, y_pred_forest))
print()

print(classification_report(y_test, y_pred_logistc))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.28      1.00      0.44        11

    accuracy                           0.28        39
   macro avg       0.14      0.50      0.22        39
weighted avg       0.08      0.28      0.12        39


              precision    recall  f1-score   support

           0       0.87      0.96      0.92        28
           1       0.88      0.64      0.74        11

    accuracy                           0.87        39
   macro avg       0.87      0.80      0.83        39
weighted avg       0.87      0.87      0.86        39


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        11

    accuracy                           1.00        39
   macro avg       1.00      1.00      1.00        39
weighted avg       1.00      1.00      1.00        39


              pr