# 数据预处理

## 读取数据

In [1]:
# Read the raw data set from the working directory.
lung.cancer <- read.csv("lung-cancer.csv")

# Features that are dropped before modeling.
# Note: "Histology.recode...broad.groupings" is NOT part of this list (its
# entry is commented out below), so that column stays in the data.
columns_to_remove <- c("Patient.ID",
                       "Year.of.death.recode",
                       "Year.of.follow.up.recode",
                       "Regional.nodes.positive..1988..",
                       "Regional.nodes.examined..1988..",
                       "Derived.EOD.2018.T..2018..",
                       "Derived.EOD.2018.N..2018..",
                       "Derived.EOD.2018.M..2018..",
                       "Derived.EOD.2018.Stage.Group..2018..",
                       "RX.Summ..Surg.Rad.Seq",
                       "RX.Summ..Systemic.Sur.Seq..2007..",
                       "EOD.Primary.Tumor..2018..")
                       #"Histology.recode...broad.groupings")

# Keep every column whose name is not listed above. `drop = FALSE` guarantees
# the result is still a data frame even if only a single column survives.
lung.cancer <- lung.cancer[, !names(lung.cancer) %in% columns_to_remove,
                           drop = FALSE]

## 处理特征

In [2]:
# Numeric variables; every other column is treated as categorical.
num_columns <- c("Tumor.Size.Summary..2016..",
                 "Survival.months",
                 "Months.from.diagnosis.to.treatment",
                 "Age.recode.with.single.ages.and.90.")

# Convert all non-numeric variables to factors. List-style indexing
# (`df[cols]`) always yields a list of columns, so this also works when only
# one column is selected (`df[, mask]` would collapse to a plain vector).
factor_columns <- names(lung.cancer)[!names(lung.cancer) %in% num_columns]
lung.cancer[factor_columns] <- lapply(lung.cancer[factor_columns], as.factor)

# Remove rows with a missing tumor size: only values below 990 are kept
# (values >= 990 are presumably special / "unknown" codes -- TODO confirm
# against the data dictionary). `which()` discards NA comparisons, so rows
# whose tumor size is NA are removed as well instead of turning into
# all-NA rows, which is what an NA in a logical row index would produce.
lung.cancer <- lung.cancer[which(lung.cancer$Tumor.Size.Summary..2016.. < 990), ]

# 描述性统计

# 数据建模

In [3]:
# Write the printed summary of a fitted model to a text file.
#
# Args:
#   model:    a fitted model object that `summary()` accepts.
#   filename: path of the text file to write.
#
# The file receives the lines produced by printing `summary(model)`.
summarize.model <- function(model, filename){
    fitted_summary <- summary(model)
    # Capture what printing the summary would show, one element per line.
    printed_lines <- capture.output(fitted_summary)
    # Disabled: also dump which coefficients are significant at the 0.05 level.
    # output.vars <- capture.output(summary(model)$coefficients[, "Pr(>|t|)"] < 0.05)
    # writeLines(output.vars, paste("significant_cars_", filename))
    writeLines(text = printed_lines, con = filename)
}

## 线性模型

In [6]:
# Fit an ordinary linear model of survival months on all remaining columns,
# save its summary to a text file, then compute and print the model's AIC.
linear.model <- lm(formula = Survival.months ~ ., data = lung.cancer)
summarize.model(model = linear.model,
                filename = "linear_model_summary.txt")
aic_value <- AIC(linear.model)
print(aic_value)

[1] 185557.2


## 广义线性模型

In [5]:
library(MASS)
# Fit a negative binomial GLM (MASS::glm.nb) of survival months on all
# remaining columns and save its summary to a text file.
glinear.model <- glm.nb(formula = Survival.months ~ ., data = lung.cancer)
summarize.model(model = glinear.model,
                filename = "generalized_linear_model_summary.txt")