-
Notifications
You must be signed in to change notification settings - Fork 0
/
tidy_environment(genome)_2.Rmd
338 lines (233 loc) · 7.01 KB
/
tidy_environment(genome)_2.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
---
output:
bookdown::html_document2:
code_folding: show
number_sections: no
toc: yes
toc_depth: 6
toc_float: yes
---
# Load the packages and data
```{r}
library('xlsx')
library(caret)
library(tidyverse)
library(glmnet)
library(survminer)
library(timeROC)
library(e1071)
library(xgboost)
library(randomForest)
```
```{r}
env_list <- read.xlsx("environment.xlsx", sheetIndex=1)
genome <- read.xlsx("总基因组.xlsx", sheetIndex = 1)
protein <- read.csv('./protein2/my_data.csv')
```
# Tidy the environment data
```{r}
df3 <- merge(env_list,genome, by.x = 'name',by.y='生境1')
df4 <- data.frame(df3$Assembly.基因组编号.,
df3$group1)
colnames(df4)[1] <- "GCA_id"
colnames(protein)[1] <- "GCA"
data <- merge(df4,protein,by.x='GCA_id',by.y = 'GCA')
colnames(data)[2] <- "group1"
data <- data[, -3]
water_data <- data %>% filter(group1 == "water")
non_water_data <- data %>% filter(group1 == "non_water")
```
# t-test
```{r}
# 读入数据
# 提取数值型变量的列名
numeric_cols <- names(data)[sapply(data, is.numeric)]
# 创建一个空数据框,用于保存检验结果
ttest_results <- data.frame(variable = character(),
p.value = numeric(),
stringsAsFactors = FALSE)
# 循环遍历数值型变量,对每个变量进行t检验
for (col in numeric_cols) {
ttest_result <- t.test(data[[col]] ~ data$group1)
ttest_results <- rbind(ttest_results,
data.frame(variable = col, p.value = ttest_result$p.value))
}
```
The result of the t-test and the p-value < 0.01
```{r,results='hide'}
significant_vars <- ttest_results %>%
filter(p.value < 0.01) %>%
select(variable)
significant_vars
```
# The random forest model
```{r}
data$group1 <- as.factor(data$group1)
# 划分训练集和测试集
set.seed(111)
train_idx <- sample(1:nrow(data), 0.8 * nrow(data))
train_data <- data[train_idx, ]
test_data <- data[-train_idx, ]
# 训练随机森林模型
rf_model <- randomForest(group1 ~ ., data = train_data, importance = TRUE)
print(rf_model)
```
```{r}
rf_model <- randomForest(group1 ~ ., data = train_data)
importance_df <- data.frame(Feature = row.names(importance(rf_model)),
Importance = importance(rf_model)[, "MeanDecreaseGini"])
```
```{r}
importance_df_sort <- importance_df %>% arrange(Importance)
```
```{r}
importance_df_sort
```
```{r}
# 绘制特征重要性图
ggplot(importance_df_sort, aes(x = Feature, y = Importance)) +
geom_point(size = 3, color = "dodgerblue4") +
NULL
```
```{r}
library(pROC)
# 将测试集的标签转化为因子型
test_labels <- factor(test_data$group1, levels = c("non_water", "water"))
train_labels <- factor(train_data$group1, levels = c("non_water", "water"))
# 使用随机森林模型在测试集上进行预测
rf_predictions <- predict(rf_model, newdata = test_data, type = "prob")
# 提取随机森林在测试集上预测为阳性的概率
rf_probabilities <- rf_predictions[, "water"]
# 计算 ROC 曲线和 AUC 值
roc_rf_test <- roc(test_labels, rf_probabilities)
auc_rf_test <- auc(roc_rf_test)
# 绘制 ROC 曲线图
ggroc(roc_rf_test) + ggtitle(paste0("Random Forest ROC Curve (AUC = ", round(auc_rf_test, 2), ")"))
```
```{r}
library(ROCR)
```
# SVM
```{r,warning=FALSE}
# 使用径向基核函数训练SVM模型
svm_model <- svm(group1 ~ ., data = train_data[, -1], kernel = "radial", cost = 2)
# 对测试集进行预测
predictions <- predict(svm_model, test_data[, -1])
# 计算混淆矩阵
confusion_matrix <- table(Predicted = predictions, Actual = test_data$group1)
print(confusion_matrix)
# 计算准确率
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
print(svm_model)
svm_model$index
```
```{r}
# 加载库
library(pROC)
# 使用径向基核函数训练SVM模型
svm_model <- svm(group1 ~ ., data = train_data[, -1], kernel = "radial", cost = 2, probability = TRUE)
# 对测试集进行预测
predictions <- predict(svm_model, test_data[, -1], probability = TRUE)
# 获取预测概率
probabilities <- attr(predictions, "probabilities")[, 2]
# 计算混淆矩阵
confusion_matrix <- table(Predicted = predictions, Actual = test_data$group1)
print(confusion_matrix)
# 计算准确率
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
# 计算ROC曲线
roc_curve <- roc(test_data$group1, probabilities)
# 绘制ROC曲线
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, lwd = 2, col = "gray", lty = 2)
# 计算AUC值
auc_value <- auc(roc_curve)
print(paste("AUC:", auc_value))
# 打印SVM模型
print(svm_model)
# 获取支持向量的索引
svm_model$index
```
```{r}
auc_value
```
# GBT model
```{r}
# 提取特征和标签
train_features <- data.matrix(train_data[, -1])
train_labels <- as.numeric(train_data$group1) - 1 # 转换为从0开始的数字
test_features <- data.matrix(test_data[, -1])
test_labels <- as.numeric(test_data$group1) - 1
# 设置参数
params <- list(
objective = "binary:logistic",
eval_metric = "error",
max_depth = 6,
eta = 0.3
)
# 训练模型
gbt_model <- xgboost(
data = train_features,
label = train_labels,
params = params,
nrounds = 100,
early_stopping_rounds = 3,
# watchlist = list(train = list(data = train_features, label = train_labels)),
verbose = 1 # 设置为1以显示训练进度
)
# 对测试集进行预测
predictions <- predict(gbt_model, test_features)
predicted_labels <- ifelse(predictions > 0.5, 1, 0)
# 计算混淆矩阵
confusion_matrix <- table(Predicted = predicted_labels, Actual = test_labels)
print(confusion_matrix)
# 计算准确率
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
```
```{r}
gbt_model
```
```{r}
# 加载库
library(pROC)
# 计算ROC曲线
roc_curve <- roc(test_labels, predictions)
# 绘制ROC曲线
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, lwd = 2, col = "gray", lty = 2)
# 计算AUC值
auc_value <- auc(roc_curve)
print(paste("AUC:", auc_value))
# 如果需要,添加其他可视化分析
```
```{r, warning=FALSE}
# 将group1列转换为因子类型
data$group1 <- as.factor(data$group1)
# 定义交叉验证参数
k <- 10
folds <- createFolds(data$group1, k = k, list = TRUE, returnTrain = FALSE)
# 初始化准确率向量
accuracies <- numeric(k)
# 进行交叉验证
for (i in 1:k) {
# 分割训练集和测试集
train_data <- data[-folds[[i]], ]
test_data <- data[folds[[i]], ]
# 训练SVM模型
svm_model <- svm(group1 ~ ., data = train_data[,-1], kernel = "radial", cost = 15)
# 预测测试集
predictions <- predict(svm_model, test_data[, -1])
# 计算混淆矩阵和准确率
confusion_matrix <- table(Predicted = predictions, Actual = test_data$group1)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
# 保存准确率
accuracies[i] <- accuracy
}
print(accuracies)
print(paste("Average accuracy:", mean(accuracies)))
```
```{r}
```