<a href="https://colab.research.google.com/github/Dly27/oscar-predictor/blob/main/oscar_winner_predictor_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package installation

In [None]:
if (!require(scales)) install.packages("scales")
if (!require(caret)) install.packages("caret")
if (!require(car)) install.packages("car")
if (!require(pROC)) install.packages("pROC")

# Import libraries

In [None]:
library(pROC)
library(car)
library(scales)
library(caret)

# Data preparation

In [None]:
# Load data
data <- read.csv("/content/oscars.csv", header=TRUE)

# Remove values to predict
train_test_data <- data[-(1:10), ]

# Rescale predictors
train_test_data$Ch <- ifelse(train_test_data$Ch == 1, 1, 0) # Use train_test_data$Ch instead of train_data$Ch
train_test_data$WR<- train_test_data$WR / 10
train_test_data$Ebert <- train_test_data$Ebert / 4
for (col in c("Nom", "Length", "Days")) {
  train_test_data[[col]] <- rescale(train_test_data[[col]]) # Use train_test_data[[col]] instead of train_test_ata[[col]]
}

# Split data into training and testing sets (80% training, 20% test)
set.seed(123)
train_test_index <- createDataPartition(train_test_data$Ch, p = 0.8, list = FALSE)

# Create the training and test sets
train_data <- train_test_data[train_test_index, ]
test_data <- train_test_data[-train_test_index, ]

print(train_data)

    Year                                            Name Ch        Nom Pic Dir
11  2023                                American Fiction  0 0.30769231   1   1
14  2023                      Killers of the Flower Moon  0 0.69230769   1   1
15  2023                                         Maestro  0 0.46153846   1   0
16  2023                                     Oppenheimer  1 0.46153846   1   1
17  2023                                      Past Lives  0 0.07692308   1   0
18  2023                                     Poor Things  0 0.76923077   1   1
20  2023                            The Zone of Interest  0 0.30769231   1   1
21  2022                  All Quiet on the Western Front  0 0.61538462   1   0
22  2022                        Avatar: The Way of Water  0 0.23076923   1   0
23  2022                                           Elvis  0 0.53846154   1   0
24  2022               Everything Everywhere All at Once  1 0.76923077   1   1
26  2022                       The Banshees of Inish

# Fit full model

In [None]:
full_model <- glm(Ch ~ . -Year -Name -Pic -Anf, data = train_data, family = binomial )
summary(full_model)


Call:
glm(formula = Ch ~ . - Year - Name - Pic - Anf, family = binomial, 
    data = train_data)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -10.93047    3.99967  -2.733  0.00628 ** 
Nom            1.81402    6.12440   0.296  0.76708    
Dir            1.72168    0.81968   2.100  0.03569 *  
Aml           -0.49533    0.62685  -0.790  0.42942    
Afl            0.03678    0.72102   0.051  0.95932    
Ams            0.40548    0.57710   0.703  0.48230    
Afs           -0.28542    0.65734  -0.434  0.66414    
Scr            0.88444    0.81614   1.084  0.27851    
Cin            0.22110    0.68120   0.325  0.74550    
Art           -0.51191    0.77826  -0.658  0.51069    
Cos           -0.85891    0.84341  -1.018  0.30850    
Sco            0.08391    0.63531   0.132  0.89493    
Son           -0.25362    1.05735  -0.240  0.81043    
Edi            1.44742    0.69821   2.073  0.03817 *  
Sou           -0.91281    0.84534  -1.080  0.28022    
For    

# Calculate confidence intervals

In [None]:
suppressWarnings(confint(full_model))

Waiting for profiling to be done...



Unnamed: 0,2.5 %,97.5 %
(Intercept),-19.1521676,-3.3783552
Nom,-11.2259071,13.5588029
Dir,0.1951741,3.4556821
Aml,-1.7304598,0.7685618
Afl,-1.3743303,1.4815254
Ams,-0.7091538,1.5941985
Afs,-1.5837208,1.0344024
Scr,-0.6814095,2.5619538
Cin,-1.1034298,1.5999448
Art,-2.0432315,1.0460378


# Calculate VIF

In [None]:
print(round(vif(full_model),4))

      Nom       Dir       Aml       Afl       Ams       Afs       Scr       Cin 
  42.8236    2.1825    3.3707    3.1583    3.1774    4.5967    2.0898    3.0272 
      Art       Cos       Sco       Son       Edi       Sou       For       Eff 
   3.8267    3.7775    2.8091    2.3226    2.4170    4.4926    1.8186    2.6017 
      Mak       Dan        AD       Gdr       Gmc        Gd       Gm1       Gm2 
   2.7870    2.0359    1.0000    1.8194    1.9880    2.8053    2.5088    1.0775 
      Gf1       Gf2       PGA       DGA    Action Adventure Animation Biography 
   4.1151    1.0000    3.3387    3.3298    2.1791    1.8872    1.0000    1.5757 
   Comedy     Crime      Docu     Drama    Family   Fantasy Film.noir   History 
   2.1307    2.1207    1.0000    2.5013    2.0385    2.7829    1.2323    1.8961 
   Horror     Music   Musical   Mystery   Romance     SciFi     Sport  Thriller 
   1.1126    1.7410    2.6886    1.7987    1.7610    2.0357    1.5318    1.8610 
      War   Western    Lengt

Nomination has extremely large VIF so we should remove it

# K-fold Cross Validation

In [66]:
train_control <- trainControl(method = "cv", number = 10, summaryFunction = defaultSummary)
cv_model <- train(Ch ~ . -Year -Name -Pic -Anf -Nom, data = train_data, method = "glm", family = "binomial", trControl = train_control)
print(cv_model)
mse <- cv_model$results$RMSE^2
print(paste("Average PMSE across folds:", mean(mse)))

“You are trying to do regression and your outcome only has two possible values Are you trying to do classification? If so, use a 2 level factor as your outcome column.”
“prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”


Generalized Linear Model 

484 samples
 67 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 435, 435, 436, 436, 436, 435, ... 
Resampling results:

  RMSE       Rsquared   MAE      
  0.3599705  0.1986693  0.1737482

[1] "Average PMSE across folds: 0.129578785401439"


# Stepwise selection

In [None]:
full_model <- glm(Ch ~ . -Year -Name -Pic -Anf -Nom, data = train_data, family = binomial )
backward_model <- step(full_model, direction = "backward", trace = 0)

null_model <- glm(Ch ~ 1, data = train_data, family = binomial )
forward_model <- step(null_model, scope = list(lower = null_model, upper = full_model), direction = "forward", trace = 0)

stepwise_model <- stepwise_model <- step(full_model, direction = "both", trace = 0)

summary(backward_model)
summary(forward_model)
summary(stepwise_model)


Call:
glm(formula = Ch ~ Dir + Scr + Edi + Dan + Gdr + Gd + PGA + DGA + 
    Musical + Romance + SciFi + Days + R + NSFC + WR, family = binomial, 
    data = train_data)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -14.1482     3.1630  -4.473 7.71e-06 ***
Dir           1.3869     0.5614   2.470  0.01350 *  
Scr           0.8285     0.5507   1.505  0.13243    
Edi           1.1055     0.4445   2.487  0.01287 *  
Dan           2.4480     1.4805   1.653  0.09824 .  
Gdr           1.5094     0.4769   3.165  0.00155 ** 
Gd           -4.4912     1.8544  -2.422  0.01544 *  
PGA           3.2979     0.4959   6.650 2.92e-11 ***
DGA           2.6952     1.2724   2.118  0.03417 *  
Musical       1.5835     0.7441   2.128  0.03332 *  
Romance       0.8458     0.4323   1.956  0.05042 .  
SciFi        -2.6457     1.6126  -1.641  0.10086    
Days          3.1308     1.1613   2.696  0.00702 ** 
R            -0.7827     0.4715  -1.660  0.09695 .  
NSFC          1.559


Call:
glm(formula = Ch ~ PGA + Dir + Edi + Dan + Gdr + Days + NSFC + 
    SciFi + Gd + DGA + Romance + WR + Musical + R + Scr, family = binomial, 
    data = train_data)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -14.1482     3.1630  -4.473 7.71e-06 ***
PGA           3.2979     0.4959   6.650 2.92e-11 ***
Dir           1.3869     0.5614   2.470  0.01350 *  
Edi           1.1055     0.4445   2.487  0.01287 *  
Dan           2.4480     1.4805   1.653  0.09824 .  
Gdr           1.5094     0.4769   3.165  0.00155 ** 
Days          3.1308     1.1613   2.696  0.00702 ** 
NSFC          1.5592     0.5871   2.656  0.00792 ** 
SciFi        -2.6457     1.6126  -1.641  0.10086    
Gd           -4.4912     1.8544  -2.422  0.01544 *  
DGA           2.6952     1.2724   2.118  0.03417 *  
Romance       0.8458     0.4323   1.956  0.05042 .  
WR            9.9812     3.9107   2.552  0.01070 *  
Musical       1.5835     0.7441   2.128  0.03332 *  
R            -0.782


Call:
glm(formula = Ch ~ Dir + Scr + Edi + Dan + Gdr + Gd + PGA + DGA + 
    Musical + Romance + SciFi + Days + R + NSFC + WR, family = binomial, 
    data = train_data)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -14.1482     3.1630  -4.473 7.71e-06 ***
Dir           1.3869     0.5614   2.470  0.01350 *  
Scr           0.8285     0.5507   1.505  0.13243    
Edi           1.1055     0.4445   2.487  0.01287 *  
Dan           2.4480     1.4805   1.653  0.09824 .  
Gdr           1.5094     0.4769   3.165  0.00155 ** 
Gd           -4.4912     1.8544  -2.422  0.01544 *  
PGA           3.2979     0.4959   6.650 2.92e-11 ***
DGA           2.6952     1.2724   2.118  0.03417 *  
Musical       1.5835     0.7441   2.128  0.03332 *  
Romance       0.8458     0.4323   1.956  0.05042 .  
SciFi        -2.6457     1.6126  -1.641  0.10086    
Days          3.1308     1.1613   2.696  0.00702 ** 
R            -0.7827     0.4715  -1.660  0.09695 .  
NSFC          1.559

All have equal AIC so pick any of the models

# Calculate AUC and sensitivity of final model

In [None]:
# Predict probabilities
predicted_probabilities <- predict(stepwise_model, type="response", newdata=test_data)
print(predicted_probabilities)
roc_curve = roc(test_data$Ch, predicted_probabilities)

# Output AUC
print(roc_curve)
print(paste("AUC: ", auc(roc_curve)))