In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [3]:
train = pd.read_csv("./Practice_LungCancer/Train.csv")
valid = pd.read_csv("./Practice_LungCancer/Valid.csv")

In [80]:
train.head(2) # 컬럼확인

Unnamed: 0,Relapse,CEA_Post,Lymphatic,Vascular,pStage,pT,pN,Perineural,Age
0,0,0.0,0,0,4,2,2,0,73
1,0,1.2,1,0,4,2,2,1,73


In [7]:
train.info() # 결측치 화인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Relapse     392 non-null    int64  
 1   CEA_Post    392 non-null    float64
 2   Lymphatic   392 non-null    int64  
 3   Vascular    392 non-null    int64  
 4   pStage      392 non-null    int64  
 5   pT          392 non-null    int64  
 6   pN          392 non-null    int64  
 7   Perineural  392 non-null    int64  
 8   Age         392 non-null    int64  
dtypes: float64(1), int64(8)
memory usage: 27.7 KB


In [9]:
train.corr() # 일단 범주형 변수도 포함이지만 상관계수 확인

Unnamed: 0,Relapse,CEA_Post,Lymphatic,Vascular,pStage,pT,pN,Perineural,Age
Relapse,1.0,0.163411,0.206385,0.208355,0.180064,0.056499,0.204281,0.130624,-0.001398
CEA_Post,0.163411,1.0,0.069175,0.056759,0.121155,-0.050621,0.165132,0.201012,-0.000181
Lymphatic,0.206385,0.069175,1.0,0.228239,0.199909,-0.093983,0.319781,0.160543,-0.076858
Vascular,0.208355,0.056759,0.228239,1.0,0.129301,0.042075,0.182302,0.250198,0.10331
pStage,0.180064,0.121155,0.199909,0.129301,1.0,0.370512,0.770096,0.023054,0.099237
pT,0.056499,-0.050621,-0.093983,0.042075,0.370512,1.0,-0.172943,0.01433,0.166362
pN,0.204281,0.165132,0.319781,0.182302,0.770096,-0.172943,1.0,0.055978,-0.029191
Perineural,0.130624,0.201012,0.160543,0.250198,0.023054,0.01433,0.055978,1.0,0.040122
Age,-0.001398,-0.000181,-0.076858,0.10331,0.099237,0.166362,-0.029191,0.040122,1.0


In [6]:
valid.head(2) # 확인

Unnamed: 0,Relapse,CEA_Post,Lymphatic,Vascular,pStage,pT,pN,Perineural,Age
0,0,1.2,1,0,5,1,3,0,42
1,1,1.0,1,0,4,2,2,0,45


In [82]:
model = ols(formula = "Relapse ~ CEA_Post + C(Lymphatic) + C(Vascular) + pN", data = train).fit()

In [83]:
model.summary()

0,1,2,3
Dep. Variable:,Relapse,R-squared:,0.102
Model:,OLS,Adj. R-squared:,0.093
Method:,Least Squares,F-statistic:,10.98
Date:,"Fri, 15 Apr 2022",Prob (F-statistic):,1.93e-08
Time:,17:02:53,Log-Likelihood:,-258.48
No. Observations:,392,AIC:,527.0
Df Residuals:,387,BIC:,546.8
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1035,0.068,1.532,0.126,-0.029,0.236
C(Lymphatic)[T.1],0.1337,0.055,2.440,0.015,0.026,0.241
C(Vascular)[T.1],0.1500,0.049,3.036,0.003,0.053,0.247
CEA_Post,0.0012,0.000,2.600,0.010,0.000,0.002
pN,0.0719,0.032,2.229,0.026,0.008,0.135

0,1,2,3
Omnibus:,3024.0,Durbin-Watson:,1.974
Prob(Omnibus):,0.0,Jarque-Bera (JB):,46.045
Skew:,0.306,Prob(JB):,1e-10
Kurtosis:,1.436,Cond. No.,166.0


## 정리
- Lyphatic, Vascular, CEA_Postm, pN만 유의한 독립변수

## odds ratio 구하기

In [62]:
np.exp(model.params).round(4) # odds ratio

Intercept            1.2090
C(Lymphatic)[T.1]    1.1419
C(Vascular)[T.1]     1.1595
C(pN)[T.2]           1.0382
C(pN)[T.3]           1.1557
CEA_Post             1.0012
dtype: float64

In [None]:
# Lymphatic이 1.1419배만큼 영향을 준다

In [78]:
# 평가용 변수 분리
x_valid = valid.loc[:, ["CEA_Post", "Lymphatic", "Vascular", "pN"]]
y_valid = valid["Relapse"]

In [77]:
pred = model.predict(x_valid)
pred

0     0.454343
1     0.382198
2     0.463856
3     0.175444
4     0.605696
        ...   
93    0.328223
94    0.538425
95    0.604744
96    0.179605
97    0.312074
Length: 98, dtype: float64

## AUROC 구하기

In [75]:
from sklearn.metrics import roc_auc_score

In [79]:
roc_auc_score(y_true= y_valid,
              y_score = pred) # 최대값이 1, 최소값이 0.5에 가까움

0.6421052631578947