# Modelagem Hepatite - Regressão Logistica
#### Alex Cecconi de Souza 

## Preprocessamento dos dados

Neste notebook irei efetuar a modelagem para o problema de classificação hepatite, o proposito é identificar quem tem hepatite.\
1º Começo com a preparação dos dados \
2º Efetuo a classifiação por regressão linear \
3º Termino com a classificação por regressão logistica \

In [3]:
#Pacote de dados
import pandas as pd
import numpy as np

In [4]:
#importa dados, dados coletados em UC Irvine Machine Learning Repository.
list_names = ["Class","AGE","SEX","STEROID","ANTIVIRALS","FATIGUE","MALAISE","ANOREXIA","LIVER BIG","LIVER FIRM","SPLEEN PALPABLE","SPIDERS","ASCITES","VARICES","BILIRUBIN",
              "ALK PHOSPHATE","SGOT","ALBUMIN","PROTIME","HISTOLOGY"]
df = pd.read_csv(r"C:\Users\Alex\Desktop\Aprendizado_de_Maquina\Atividade_01\Hepatite\hepatitis.data", sep=",", names = list_names)

In [5]:
#Visualiza top 10 linhas
df.head(n=10)

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2,30,2,1,2,2,2,2,1,2,2,2,2,2,1.00,85,18,4.0,?,1
1,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.90,135,42,3.5,?,1
2,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.70,96,32,4.0,?,1
3,2,31,1,?,1,2,2,2,2,2,2,2,2,2,0.70,46,52,4.0,80,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.00,?,200,4.0,?,1
5,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.90,95,28,4.0,75,1
6,1,51,1,1,2,1,2,1,2,2,1,1,2,2,?,?,?,?,?,1
7,2,23,1,2,2,2,2,2,2,2,2,2,2,2,1.00,?,?,?,?,1
8,2,39,1,2,2,1,2,2,2,1,2,2,2,2,0.70,?,48,4.4,?,1
9,2,30,1,2,2,2,2,2,2,2,2,2,2,2,1.00,?,120,3.9,?,1


In [6]:
#Filtra X e Y
y = df.iloc[:,0].replace({2:1,1:-1})                                
X = df.iloc[:,1:21]

In [7]:
#Separa ba base em X e Y, sendo que Y é codificado em 1 e -1.
pd.DataFrame(X.dtypes).T

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,int64,int64,object,int64,object,object,object,object,object,object,object,object,object,object,object,object,object,object,int64


In [8]:
#Substitui '?' por valor nulo, dado que os nulos eu farei empute pela média da coluna
X.replace('?',np.nan, inplace = True)

In [9]:
pd.DataFrame(X).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,145,146,147,148,149,150,151,152,153,154
AGE,30.0,50.0,78.0,31.0,34.0,34.0,51.0,23.0,39.0,30.0,...,31.0,41.0,70.0,20.0,36.0,46.0,44.0,61.0,53.0,43.0
SEX,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0
STEROID,1.0,1.0,2.0,,2.0,2.0,1.0,2.0,2.0,2.0,...,1.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0
ANTIVIRALS,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
FATIGUE,2.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,...,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0
MALAISE,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,1.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0
ANOREXIA,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,...,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0
LIVER BIG,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,,2.0,2.0,2.0,2.0,1.0,2.0,2.0
LIVER FIRM,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,...,2.0,1.0,,,2.0,2.0,1.0,1.0,2.0,2.0
SPLEEN PALPABLE,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,...,2.0,1.0,,2.0,2.0,2.0,2.0,2.0,1.0,1.0


In [10]:
#Preenche os nulos de toda a tabela com a media desconsiderando os nulos, aplica direto
X.fillna( X.median(skipna = True ), inplace = True)

In [11]:
from sklearn import preprocessing 

In [12]:
#Instancio o scaler para padronizar as features, as features escolhidas estão no código, ali  apresento os indices das colunas.
Scaler =preprocessing.StandardScaler()
X.iloc[:,[0,13,14,15,16,17]] = Scaler.fit_transform(X.iloc[:,[0,13,14,15,16,17]] )

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
#Aqui eu quebro a base em treino, validação e teste.
X_treino, X_test, y_treino, y_test = train_test_split(X.values.astype(float),y.values.astype(float), test_size = 0.33 , random_state=42, 
                                                                                                              stratify = y)
X_train, X_valid, y_train, y_valid = train_test_split(X_treino,y_treino, test_size = 0.2 , random_state=42, stratify = y_treino)

-----
# Regressao Logistica - Matricial

In [15]:
w_new = np.random.rand(X.shape[1]) # Inicializando um w - peso aleatorio 
i, n_iteracoes  = 1, 10000 # Parametros de interação
eta = 0.001 # taxa de aprendizado
precisao = 0.0001 
Delta_Xs = 1

Gradiente descendente da função de custo Entropia Cruzada
## $ \nabla E \left (\overrightarrow{w}  \right ) =  -\frac{1}{2} \sum_{n = 1}^{N}  \frac{\overrightarrow{y_{n}} \cdot \overrightarrow{x_{n}\ } }{ 1 + e^{\overrightarrow{y_{n}} \overrightarrow{w} \cdot \overrightarrow{x_{n}} } } $

In [16]:
#Processo Gradiente Descendente - A base utilziada foi a de treinamento 
for i in range(n_iteracoes)  :
    w_old = w_new
    #Na linha abaixo eu crio o gradiente descendente representado pela função lambda, e itero linha a linha pela função map.
    Grad_Func_Cust = map(lambda m: np.divide( y_treino[m] * X_treino[m] , (1 + np.exp( y_treino[m] * w_new @ X_treino[m]))),list(range(len(X_treino))))
    erro = pd.DataFrame(Grad_Func_Cust).sum(axis = 0).values
    #Caminhar descendente do gradiente
    w_new = w_new -  eta*np.divide(-1,len(X_treino))*erro
    #Avalio a evolução dos Ws (pesos)
    Delta_Ws = abs( np.linalg.norm(w_new) - np.linalg.norm(w_old) )
    print(Delta_Ws, i)
    if Delta_Ws < precisao:
        break

0.00089898078812789 0
0.0008987839310523604 1
0.0008985868901545402 2
0.0008983896652456913 3
0.000898192256132635 4
0.0008979946626230806 5
0.0008977968845256257 6
0.0008975989216484237 7
0.0008974007737974077 8
0.0008972024407820633 9
0.0008970039224069914 10
0.0008968052184816777 11
0.0008966063288116111 12
0.0008964072532031686 13
0.0008962079914627274 14
0.0008960085433975529 15
0.0008958089088118015 16
0.0008956090875131828 17
0.0008954090793054093 18
0.0008952088839957462 19
0.0008950085013879061 20
0.0008948079312878221 21
0.0008946071734996508 22
0.000894406227828437 23
0.0008942050940783375 24
0.000894003772054397 25
0.0008938022615589958 26
0.0008936005623971788 27
0.0008933986743731026 28
0.0008931965972882594 29
0.0008929943309485822 30
0.0008927918751537867 31
0.0008925892297098059 32
0.000892386394417688 33
0.0008921833690807013 34
0.0008919801534998939 35
0.0008917767474785343 36
0.0008915731508181146 37
0.000891369363320571 38
0.0008911653847865075 39
0.000890961215018

0.0008059034898435513 391
0.0008056180797013646 392
0.0008053323942318613 393
0.0008050464331503804 394
0.0008047601961731488 395
0.0008044736830146171 396
0.0008041868933910123 397
0.0008038998270154529 398
0.0008036124836032776 399
0.0008033248628680489 400
0.0008030369645233293 401
0.0008027487882835693 402
0.000802460333859667 403
0.000802171600966961 404
0.0008018825893154613 405
0.0008015932986191743 406
0.0008013037285889979 407
0.0008010138789358301 408
0.000800723749371901 409
0.0008004333396076646 410
0.0008001426493531305 411
0.0007998516783191967 412
0.0007995604262145406 413
0.0007992688927505043 414
0.000798977077634877 415
0.0007986849805763363 416
0.0007983926012848919 417
0.0007980999394674448 418
0.0007978069948322286 419
0.0007975137670865884 420
0.0007972202559387576 421
0.0007969264610938609 422
0.0007966323822605759 423
0.000796338019142695 424
0.0007960433714493398 425
0.0007957484388825264 426
0.0007954532211500442 427
0.0007951577179552416 428
0.000794861929004

0.0006759622812695376 768
0.0006755494478598578 769
0.000675136201816251 770
0.0006747225426693149 771
0.0006743084699518676 772
0.0006738939831945068 773
0.000673479081926498 774
0.000673063765679327 775
0.0006726480339800389 776
0.0006722318863583432 777
0.0006718153223412848 778
0.000671398341456797 779
0.0006709809432310365 780
0.0006705631271901602 781
0.0006701448928594367 782
0.0006697262397641346 783
0.0006693071674290785 784
0.0006688876753759843 785
0.0006684677631305647 786
0.0006680474302127593 787
0.0006676266761469485 788
0.0006672055004526278 789
0.0006667839026515132 790
0.0006663618822639883 791
0.0006659394388091044 792
0.0006655165718068012 793
0.0006650932807747978 794
0.0006646695652312573 795
0.000664245424694343 796
0.0006638208586795535 797
0.0006633958667050521 798
0.0006629704482850052 799
0.0006625446029349114 800
0.0006621183301698252 801
0.0006616916295043573 802
0.0006612645004508977 803
0.0006608369425236127 804
0.0006604089552344483 805
0.000659980538094

0.0005104122746368844 1105
0.0005098341167810716 1106
0.0005092554202108523 1107
0.0005086761851580412 1108
0.0005080964118580056 1109
0.0005075161005594353 1110
0.0005069352515181258 1111
0.0005063538650005306 1112
0.0005057719412837614 1113
0.0005051894806520352 1114
0.0005046064834015596 1115
0.0005040229498387561 1116
0.0005034388802771517 1117
0.0005028542750440401 1118
0.0005022691344742647 1119
0.000501683458914659 1120
0.0005010972487204945 1121
0.0005005105042585889 1122
0.0004999232259073061 1123
0.0004993354140534478 1124
0.0004987470690958062 1125
0.0004981581914442756 1126
0.000497568781516744 1127
0.0004969788397479746 1128
0.0004963883665762836 1129
0.0004957973624573064 1130
0.0004952058278542282 1131
0.0004946137632431125 1132
0.0004940211691106811 1133
0.0004934280459556462 1134
0.0004928343942873781 1135
0.0004922402146281257 1136
0.0004916455075112403 1137
0.0004910502734816191 1138
0.0004904545130961502 1139
0.0004898582269237117 1140
0.0004892614155456165 1141
0.0

0.0003112471074255829 1416
0.0003105881100615715 1417
0.0003099294295068056 1418
0.00030927107110612084 1419
0.0003086130402087939 1420
0.0003079553421707626 1421
0.0003072979823515176 1422
0.0003066409661165448 1423
0.0003059842988351047 1424
0.0003053279858813429 1425
0.00030467203263317977 1426
0.0003040164444720883 1427
0.00030336122678376043 1428
0.0003027063849567746 1429
0.00030205192438259587 1430
0.00030139785045601997 1431
0.00030074416857450714 1432
0.00030009088413707197 1433
0.0002994380025456156 1434
0.00029878552920359347 1435
0.00029813346951579334 1436
0.0002974818288887793 1437
0.0002968306127297815 1438
0.0002961798264473625 1439
0.00029552947544986274 1440
0.00029487956514673286 1441
0.0002942301009467574 1442
0.0002935810882589429 1443
0.000292932532492296 1444
0.00029228443905382484 1445
0.0002916368133500935 1446
0.000290989660787222 1447
0.00029034298676866577 1448
0.000289696796696548 1449
0.00028905109597099354 1450
0.00028840588998968464 1451
0.00028776118414

0.0001272257958460088 1778
0.00012691695958144678 1779
0.00012660918721185688 1780
0.00012630247638845127 1781
0.00012599682475000762 1782
0.00012569222992420137 1783
0.0001253886895273837 1784
0.00012508620116591374 1785
0.0001247847624343823 1786
0.00012448437091783227 1787
0.00012418502418953814 1788
0.0001238867198145588 1789
0.00012358945534551857 1790
0.00012329322832749234 1791
0.00012299803629445272 1792
0.00012270387677104644 1793
0.00012241074727303847 1794
0.00012211864530664585 1795
0.00012182756836964792 1796
0.00012153751395005408 1797
0.00012124847952810214 1798
0.00012096046257514814 1799
0.00012067346055388839 1800
0.00012038747091969171 1801
0.00012010249111948923 1802
0.00011981851859199644 1803
0.00011953555076926747 1804
0.0001192535850744747 1805
0.00011897261892501731 1806
0.00011869264973007887 1807
0.00011841367489195953 1808
0.00011813569180652017 1809
0.00011785869786229419 1810
0.00011758269044248593 1811
0.00011730766692186201 1812
0.00011703362467141432 18

Função Logistica: Output uma probabilidade
Preditor
## $ \theta \left (\overrightarrow{s}  \right ) = \frac{e^{\overrightarrow{w} \cdot \overrightarrow{x_{n}} } }{ 1 + e^{ \overrightarrow{w} \cdot \overrightarrow{x_{n}} } } $

In [25]:
#Função logistica
#Para a predição e a acuracia, foi utilizado base teste
theta = np.divide( np.exp(np.sum(w_new*X_test, axis = 1)) , (1 + np.exp(np.sum(w_new*X_test, axis = 1)))  )

In [30]:
#Acuracia: Avaliando quantos eu acertei na predição
Acuracia = np.sum( np.where( theta >= 0.5,1,-1) == y_test ) / y_test.shape[0]
round(Acuracia)

0.8076923076923077

# FIM