<a href="https://colab.research.google.com/github/Dinke265/NSTIP/blob/main/NSTIP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rpy2



In [None]:
# Load the rpy2 IPython extension; this registers the %%R cell magic that the
# R cells further down rely on.
%load_ext rpy2.ipython


In [None]:
# Mount Google Drive inside the Colab VM so the project CSV files under
# /content/drive/ can be read and written.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Switch the working directory to the project folder on Drive; the CSV files
# written later with relative paths (e.g. "stemgradsdata.csv") land here.
%cd /content/drive/My Drive/NSTIP/

/content/drive/My Drive/NSTIP


In [None]:
%%R

# Packages this notebook relies on. tidyverse already attaches dplyr and
# readr; they stay in the list so the explicit library() calls below are
# guaranteed to succeed.
required_packages <- c("tidyverse", "dplyr", "readr", "visdat")

# Install only the packages that are not present yet, so re-running this cell
# does not download and rebuild everything each time.
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(tidyverse)
library(dplyr)
library(readr)
library(visdat)

In [None]:
%%R
# Read the two source tables from the mounted Drive project folder.
allgradsdata <- read_csv(
  file = "/content/drive/My Drive/NSTIP/AllGraduatesData2022.csv"
)
stemgrads <- read_csv(
  file = "/content/drive/My Drive/NSTIP/STEMgraduates.csv"
)

# Preview the first rows of the all-graduates table.
head(allgradsdata)

In [79]:
%%R
# Preview the STEM graduates table while it still has its original headers.
# NOTE(review): the recorded output of this cell shows only the names vector,
# so this preview does not appear to be displayed - wrap it in print() if it
# is wanted; confirm.
head(stemgrads)

# Normalise every column name to lower case so later cells can refer to
# columns without worrying about the capitalisation used in the CSV.
colnames(stemgrads) <- tolower(colnames(stemgrads))

print(colnames(stemgrads))

 [1] "public"                 "private"                "prog_indicator...3"    
 [4] "stem"                   "sti"                    "ict"                   
 [7] "miscellaneous programs" "females"                "males"                 
[10] "level"                  "prog_indicator...11"   


In [None]:
%%R
# List the distinct raw values of `level` before any cleaning is applied.
stemgrads$level %>%
  unique() %>%
  print()


In [86]:
%%R
# Harmonise the free-text `level` column into a small set of canonical
# qualification labels. Each raw spelling is listed under the label it maps
# to; any value not listed (including NA) is passed through unchanged.
#
# NOTE(review): "Level 6" maps to Bachelors while a bare "6" maps to Diploma,
# and "Level 4" maps to Diploma while a bare "4" maps to Certificate. This
# mirrors the existing mapping exactly - confirm it is intended.
stemgrads <- stemgrads %>%
  mutate(level = case_when(
    level %in% c(
      "Degree", "Degree Level", "Undergraduate", "BACHELOR'S", "DEGREE",
      "Bachelor's degree", "Bacherors", "Bacheror", "Bacheros",
      "UNDERGRADUATE", "Level 6", "7"
    ) ~ "Bachelors",
    level %in% c(
      "Masters Level", "MASTERS DEGREE", "MASTER'S", "9",
      "Post Graduate", "Postgraduate"
    ) ~ "Masters",
    level %in% c(
      "Diploma Level", "Foundetion Diploma", "Diploma Level 5",
      "NCC LEVEL 5 DIPLOMA", "NCC LEVEL 4 DIPLOMA", "diploma", "DIPLOMA",
      "Level 4", "Dipoma", "5", "6", "Diplomas", "CA Level 1"
    ) ~ "Diploma",
    level %in% c(
      "Chichewa Program", "English Program", "CERTIFICATE", "4",
      "certificate"
    ) ~ "Certificate",
    level %in% c("PhD", "Doctors", "10", "PHD") ~ "Doctorate",
    level %in% c("Postgraduate Diploma Level") ~ "Postgraduate Other",
    TRUE ~ as.character(level)
  ))


In [87]:
%%R
# Sanity check: only the canonical level categories should remain.
print(unique(stemgrads[["level"]]))

[1] "Bachelors"          "Diploma"            "Doctorate"         
[4] "Masters"            "Certificate"        "Postgraduate Other"
[7] NA                  


In [None]:
%%R
# Give the first programme-indicator column a descriptive name.
stemgrads <- rename(stemgrads, program_type = `prog_indicator...3`)

# Show the resulting column names.
print(colnames(stemgrads))

Create an indicator variable for institution type:

- `1` = public institution
- `0` = private institution

In [93]:
%%R

# Collapse the separate `public` and `private` columns into one name/value
# pair per row and derive the numeric institution indicator from it.
#
#   type_institution  : name of the source column ("public" or "private")
#   institution_value : the non-missing entry taken from that column
#   institution_type  : 1 = public institution, 0 = private institution
#
# Rows where both source columns are NA are dropped by the filter. Presumably
# only one of the two columns is filled per row; a row with both filled would
# produce two output rows - TODO confirm against the data.
#
# The indicator is computed from `type_institution`. Comparing the numeric
# `institution_type` column itself with "public" is never true, which would
# label every institution as private (0).
stemgrads <- stemgrads %>%
  pivot_longer(
    cols = c(private, public),
    names_to = "type_institution",
    values_to = "institution_value"
  ) %>%
  filter(!is.na(institution_value)) %>%
  mutate(
    institution_type = ifelse(type_institution == "public", 1, 0),
    # Keep the indicator immediately before the reshaped columns.
    .before = type_institution
  )

In [95]:
%%R
# Sort the table by institution value, then save it to the working directory
# (row numbers are written as the first CSV column).
stemgrads <- stemgrads[order(stemgrads[["institution_value"]]), ]
write.csv(x = stemgrads, file = "stemgradsdata.csv", row.names = TRUE)


# **Check missingness**

In [96]:
%%R
# Keep only the columns needed for the missingness and outlier checks.
stemgradsdata <- stemgrads %>%
  select(
    program_type,
    males,
    females,
    level,
    type_institution,
    institution_value
  )

In [99]:
%%R
# Translate the numeric programme codes into readable labels. A value that
# matches none of the four codes (including NA) has no branch and becomes NA.
# NOTE(review): this cell is not idempotent - once the codes have been
# replaced by labels, running it again turns every value into NA.
stemgradsdata <- mutate(
  stemgradsdata,
  program_type = case_when(
    program_type == 1 ~ "STEM",
    program_type == 2 ~ "ICT",
    program_type == 3 ~ "STI",
    program_type == 4 ~ "Humanities, Arts and Social Sciences"
  )
)

In [103]:
%%R
# Count the missing values in each column (named integer vector, one entry
# per column of stemgradsdata).
missingness <- vapply(
  stemgradsdata,
  function(column) sum(is.na(column)),
  integer(1)
)
print(missingness)


     program_type             males           females             level 
                1                11                14                 1 
 type_institution institution_value 
                0                 0 


In [None]:
%%R
# Flag every row that has at least one missing value and display those rows.
rows_with_na <- apply(is.na(stemgradsdata), 1, any)

observations_with_missing <- stemgradsdata[rows_with_na, ]

print(observations_with_missing)

# Tabulate the per-column NA counts held in `missingness` (the named count
# vector computed in the preceding cell): one row per variable, with the
# variable names as row names and a single `Missing_Values` column.
# Transposing the vector first would give a one-row frame with one column per
# variable, and assigning a single name to it would leave every other column
# name as NA.
missing_values <- data.frame(Missing_Values = missingness)

In [None]:
%%R
# Outlier screen: any male or female count above 300 is treated as a value
# that needs a manual check. Missing counts are never treated as outliers.
outliers_males <- with(stemgradsdata, sum(males > 300, na.rm = TRUE))
outliers_females <- with(stemgradsdata, sum(females > 300, na.rm = TRUE))

# Summarise the number of flagged values per variable.
outliers_summary <- data.frame(
  Variable = c("males", "females"),
  Outliers = c(outliers_males, outliers_females)
)

print(outliers_summary)

# Collect the rows that contain at least one missing value ...
rows_with_na <- apply(is.na(stemgradsdata), MARGIN = 1, FUN = any)
observations_with_missing <- stemgradsdata[rows_with_na, ]

# ... and save them for follow-up.
write.csv(observations_with_missing, file = "missing.csv", row.names = FALSE)

# Collect the rows where either count exceeds the threshold. filter() drops
# rows whose condition evaluates to NA, so a missing count alone never
# selects a row.
outliers <- filter(stemgradsdata, males > 300 | females > 300)

write.csv(outliers, file = "outliers.csv", row.names = FALSE)

Numbers of males and females by degree level

In [106]:
%%R
# Total male and female graduates within each qualification level; missing
# counts are ignored in the sums.
stemgradsdata %>%
  group_by(level) %>%
  summarise(
    across(
      c(males, females),
      ~ sum(.x, na.rm = TRUE),
      .names = "total_{.col}"
    )
  ) %>%
  print()


# A tibble: 7 × 3
  level              total_males total_females
  <chr>                    <dbl>         <dbl>
1 Bachelors                 6505          4999
2 Certificate                171           151
3 Diploma                   2522          3433
4 Doctorate                   13             6
5 Masters                    372           226
6 Postgraduate Other          16             8
7 <NA>                         0             0


# **Read STEM, STI and ICT data**

In [None]:
%%R
# Load the combined STEM / STI / ICT graduates file.
# NOTE(review): this rebinds `stemgrads`, replacing the cleaned table built in
# the cells above. The recorded import message also reports unnamed columns
# (...10 to ...27), 17 of them all-logical, which look like empty trailing
# columns in the CSV - confirm.
stemgrads <- read_csv(
  file = "/content/drive/My Drive/NSTIP/STISTEMgraduates.csv"
)

New names:
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
• `` -> `...15`
• `` -> `...16`
• `` -> `...17`
• `` -> `...18`
• `` -> `...19`
• `` -> `...20`
• `` -> `...21`
• `` -> `...22`
• `` -> `...23`
• `` -> `...24`
• `` -> `...25`
• `` -> `...26`
• `` -> `...27`
Rows: 1196 Columns: 27
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (8): PUBLIC INSTITUTION, PRIVATE INSTITUTUIN, STEM PROGRAM, STI PROGRAM...
dbl  (2): NO.OF FEMALES, NO.OF MALES
lgl (17): ...11, ...12, ...13, ...14, ...15, ...16, ...17, ...18, ...19, ......

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
