-
Notifications
You must be signed in to change notification settings - Fork 2
/
Model Solution.R
249 lines (247 loc) · 7.79 KB
/
Model Solution.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
setwd(" ")
getwd()
## [1] " "
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.4.4
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.4
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(pscl)
## Warning: package 'pscl' was built under R version 3.4.4
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.4
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.4
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.4
# install.packages("Deducer")
library(Deducer)
## Warning: package 'Deducer' was built under R version 3.4.4
## Loading required package: JGR
## Warning: package 'JGR' was built under R version 3.4.4
## Loading required package: rJava
## Warning: package 'rJava' was built under R version 3.4.4
## Loading required package: JavaGD
## Warning: package 'JavaGD' was built under R version 3.4.4
##
## Please type JGR() to launch console. Platform specific launchers (.exe and
.app) can also be obtained at http://www.rforge.net/JGR/files/.
## Loading required package: car
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.4.4
## Loading required package: MASS
##
##
## Note Non-JGR console detected:
## Deducer is best used from within JGR (http://jgr.markushelbig.org/).
## To Bring up GUI dialogs, type deducer().
library(ggplot2)
#install.packages("rJava")
library(rJava)
#Sys.setenv(JAVA_HOME='C:/Program Files/Java/jre1.8.0_181/bin')
# Read the training and test data trainData
<- read.csv("Training_Data.csv")
View(trainData)
testData <- read.csv("Test_Data.csv")
View(testData)
# Build logistic regression Model
logitModel <- glm(Prescription ~ Lagged_Returns, data = trainData, family =
binomial)
# anova() to analyze the devaiance
anova(logitModel, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Prescription
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 125 174.67
## Lagged_Returns 1 4.3556 124 170.32 0.03689 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Overall significance of the Logistic Regression to test
Applicability lrtest(logitModel)
## Likelihood ratio test
##
## Model 1: Prescription ~ Lagged_Returns
## Model 2: Prescription ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 2 -85.159
## 2 1 -87.337 -1 4.3556 0.03689 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# McFadden R Square and Interpretation
pR2(logitModel)
## llh llhNull G2 McFadden r2ML
## -85.15875583 -87.33654475 4.35557784 0.02493560 0.03397743
## r2CU
## 0.04530324
# Summary of the model
summary(logitModel)
##
## Call:
## glm(formula = Prescription ~ Lagged_Returns, family = binomial,
## data = trainData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7985 -1.1366 0.1346 1.1483 1.4857
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0008332 0.1812373 -0.005 0.9963
## Lagged_Returns 37.0917281 18.6910542 1.984 0.0472 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 174.67 on 125 degrees of freedom
## Residual deviance: 170.32 on 124 degrees of freedom
## AIC: 174.32
##
## Number of Fisher Scoring iterations: 4
# Odds Ratio
odds <- exp(coef(logitModel))
probability <- odds / (1+odds)
odds
## (Intercept) Lagged_Returns
## 9.991672e-01 1.284496e+16
probability
## (Intercept) Lagged_Returns
## 0.4997917 1.0000000
# Let us predict on the training data
trainData$predictions <- predict(logitModel, trainData, type = "response")
trainData$predictions <- ifelse(trainData$predictions > 0.5,1,0)
rocplot(logitModel)
ggplot(trainData, aes(trainData$predictions, color =
as.factor(trainData$Prescription)))+ geom_density()
# Confusion matrix
confusionMatrix(data = as.factor(trainData$predictions), reference
= as.factor(trainData$Prescription))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 42 27
## 1 21 36
##
## Accuracy : 0.619
## 95% CI : (0.5283, 0.7041)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.004752
##
## Kappa : 0.2381
## Mcnemar's Test P-Value : 0.470486
##
## Sensitivity : 0.6667
## Specificity : 0.5714
## Pos Pred Value : 0.6087
## Neg Pred Value : 0.6316
## Prevalence : 0.5000
## Detection Rate : 0.3333
## Detection Prevalence : 0.5476
## Balanced Accuracy : 0.6190
##
## 'Positive' Class : 0
##
tabValues <- table(trainData$predictions, trainData$Prescription)
sum(diag(tabValues)/sum(tabValues))
## [1] 0.6190476
1 - sum(diag(tabValues)/sum(tabValues))
## [1] 0.3809524
ROCRpred <- prediction(trainData$predictions, trainData$Prescription)
ROCRperf <- performance(ROCRpred, measure = "tpr", x.measure = "fpr")
# ROC & AUC Plots
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2,1.7), print.cutoffs.at =
seq(0,1,0.1))
auc <- performance(ROCRpred, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.6190476
# Write the predicted values in an output file
write.csv(trainData, "Logit_Train_Predictions.csv")
# Let us predict on the test data
testData$predictions <- predict(logitModel,newdata = testData,type
= 'response')
testData$predictions <- ifelse(testData$predictions > 0.5,1,0)
testData$predictions <- ifelse(testData$predictions > 0.5,1,0)
ggplot(testData, aes(testData$predictions, color =
as.factor(testData$Prescription)))+ geom_density()
# Confusion matrix
confusionMatrix(data=as.factor(testData$predictions),
reference=as.factor(testData$Prescription))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 29 34
## 1 29 32
##
## Accuracy : 0.4919
## 95% CI : (0.4011, 0.5832)
## No Information Rate : 0.5323
## P-Value [Acc > NIR] : 0.8389
##
## Kappa : -0.0151
## Mcnemar's Test P-Value : 0.6143
##
## Sensitivity : 0.5000
## Specificity : 0.4848
## Pos Pred Value : 0.4603
## Neg Pred Value : 0.5246
## Prevalence : 0.4677
## Detection Rate : 0.2339
## Detection Prevalence : 0.5081
## Balanced Accuracy : 0.4924
##
## 'Positive' Class : 0
##
tabValues <- table(testData$predictions, testData$Prescription)
sum(diag(tabValues)/sum(tabValues))
## [1] 0.4919355
1 - sum(diag(tabValues)/sum(tabValues))
## [1] 0.5080645
ROCRpred <- prediction(testData$predictions, testData$Prescription)
ROCRperf <- performance(ROCRpred, measure = "tpr", x.measure = "fpr")
# ROC & AUC Plots
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2,1.7), print.cutoffs.at =
seq(0,1,0.1))
auc
auc <- performance(ROCRpred, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.4924242
# Write the predicted values in an output file
write.csv(testData, "Logit_Test_Predictions.csv")