In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the brain-cancer data set, using the unnamed first CSV column as the
# row index and treating the literal text 'nan' as a missing value.
brain = pd.read_csv(
    'BrainCancer.csv',
    index_col='Unnamed: 0',
    na_values='nan',
)
# Preview the first five rows.
brain.head()

Unnamed: 0,sex,diagnosis,loc,ki,gtv,stereo,status,time
1,Female,Meningioma,Infratentorial,90,6.11,SRS,0,57.64
2,Male,HG glioma,Supratentorial,90,19.35,SRT,1,8.98
3,Female,Meningioma,Infratentorial,70,7.95,SRS,0,26.46
4,Female,LG glioma,Supratentorial,80,7.61,SRT,1,47.8
5,Male,HG glioma,Supratentorial,90,5.06,SRT,1,6.3


In [3]:
# Report per-column missing-value counts before imputation. print() is needed
# because a notebook cell only auto-displays its final expression.
print(brain.isna().sum())

# Fill missing diagnoses with the most frequent category.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `brain.diagnosis`: that form is chained in-place
# assignment, which raises a FutureWarning on pandas 2.x and no longer updates
# `brain` once Copy-on-Write is active (the default from pandas 3.0).
diagnosis_mode = brain['diagnosis'].mode(dropna=True)[0]
brain['diagnosis'] = brain['diagnosis'].fillna(diagnosis_mode)

In [4]:
# Re-check per-column missing-value counts after imputation; the recorded
# output shows 0 for every column.
brain.isna().sum()

sex          0
diagnosis    0
loc          0
ki           0
gtv          0
stereo       0
status       0
time         0
dtype: int64

In [5]:
# Encode sex as a binary indicator: Female -> 1, Male -> 0.
# The identity entries (1 -> 1, 0 -> 0) make the cell safe to re-run: with only
# the string keys, a second execution would map the already-encoded integers
# to NaN. Values outside the mapping still become NaN, as before.
sex_codes = {'Female': 1, 'Male': 0, 1: 1, 0: 0}
brain['sex'] = brain['sex'].map(sex_codes)
brain.head()


Unnamed: 0,sex,diagnosis,loc,ki,gtv,stereo,status,time
1,1,Meningioma,Infratentorial,90,6.11,SRS,0,57.64
2,0,HG glioma,Supratentorial,90,19.35,SRT,1,8.98
3,1,Meningioma,Infratentorial,70,7.95,SRS,0,26.46
4,1,LG glioma,Supratentorial,80,7.61,SRT,1,47.8
5,0,HG glioma,Supratentorial,90,5.06,SRT,1,6.3


In [6]:
from sklearn.preprocessing import scale
# Standardise the two quantitative columns (centre on the mean, divide by the
# standard deviation) and store each result in a new `<name>_s` column.
for raw_col, scaled_col in (('gtv', 'gtv_s'), ('ki', 'ki_s')):
    brain[scaled_col] = scale(brain[raw_col], with_mean=True, with_std=True)

In [7]:
# Gather the standardised quantitative predictors into their own frame,
# labelled with the original (unsuffixed) column names.
brainQuant = pd.DataFrame(
    {
        "ki": brain["ki_s"],
        "gtv": brain["gtv_s"],
    }
)

In [8]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Build a two-column table: each predictor name in `brainQuant` alongside its
# variance inflation factor, computed column by column.
vif = pd.DataFrame(
    {
        'Features': brainQuant.columns,
        'VIF': [
            variance_inflation_factor(brainQuant.values, col_idx)
            for col_idx in range(len(brainQuant.columns))
        ],
    }
)

In [9]:
# Display the VIF table (one row per predictor).
vif

Unnamed: 0,Features,VIF
0,ki,1.086608
1,gtv,1.086608


In [10]:
# Preview the frame; the recorded output shows the added gtv_s and ki_s columns.
brain.head()

Unnamed: 0,sex,diagnosis,loc,ki,gtv,stereo,status,time,gtv_s,ki_s
1,1,Meningioma,Infratentorial,90,6.11,SRS,0,57.64,-0.29632,0.859153
2,0,HG glioma,Supratentorial,90,19.35,SRT,1,8.98,1.24174,0.859153
3,1,Meningioma,Infratentorial,70,7.95,SRS,0,26.46,-0.082571,-1.054909
4,1,LG glioma,Supratentorial,80,7.61,SRT,1,47.8,-0.122068,-0.097878
5,0,HG glioma,Supratentorial,90,5.06,SRT,1,6.3,-0.418296,0.859153


In [11]:
import statsmodels.formula.api as sm
# Ordinary least squares regression of `time` on `status`; the fitted model's
# summary table is the cell's displayed result.
(
    sm.ols(formula='time~status', data=brain)
    .fit()
    .summary()
)

0,1,2,3
Dep. Variable:,time,R-squared:,0.162
Model:,OLS,Adj. R-squared:,0.152
Method:,Least Squares,F-statistic:,16.6
Date:,"Thu, 20 Apr 2023",Prob (F-statistic):,0.000102
Time:,14:20:19,Log-Likelihood:,-380.77
No. Observations:,88,AIC:,765.5
Df Residuals:,86,BIC:,770.5
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,33.9979,2.546,13.356,0.000,28.938,39.058
status,-16.4445,4.036,-4.074,0.000,-24.468,-8.421

0,1,2,3
Omnibus:,3.394,Durbin-Watson:,2.005
Prob(Omnibus):,0.183,Jarque-Bera (JB):,3.384
Skew:,0.455,Prob(JB):,0.184
Kurtosis:,2.69,Cond. No.,2.45
