# Data Merging - Non-Streaming OULAD

In [None]:
### Load libraries, necessary data
import pandas as pd
import zipfile
def _read_member(archive, member):
    """Read one CSV member of an open ZipFile into a DataFrame.

    The member handle is closed as soon as pandas has finished parsing it.
    """
    with archive.open(member) as handle:
        return pd.read_csv(handle)


def _shared_columns(left, right):
    """Return the column names present in both frames, in `left`'s column order.

    A plain list (rather than a set intersection) keeps the merge-key order
    identical from run to run; set iteration order over strings is not stable
    across interpreter sessions, and outer merges sort rows by key order.
    """
    return [col for col in left.columns if col in right.columns]


# Open the OULAD archive in a context manager so the file is released once
# every table has been loaded.
with zipfile.ZipFile("C:/Users/jacma/OneDrive/school work/UMD/Data Science Boot Camp/Group Project/Data/OULAD.zip") as zipped:
    # may not need all of these
    assessments = _read_member(zipped, 'assessments.csv') # just info about the assessments, may not need
    courses = _read_member(zipped, 'courses.csv')
    studentAssessments = _read_member(zipped, 'studentAssessment.csv')
    studentInfo = _read_member(zipped, 'studentInfo.csv')
    studentRegistration = _read_member(zipped, 'studentRegistration.csv')
    studentVLE = _read_member(zipped, 'studentVle.csv') # virtual learning environment
    VLEdata = _read_member(zipped, 'vle.csv')

# Rename so the assessment deadline does not collide with the `date` column of the VLE tables
assessments = assessments.rename(columns={'date': 'due_date'})

# Merge studentInfo & studentRegistration dataframes
common_columns = _shared_columns(studentInfo, studentRegistration) # ['code_module','code_presentation','id_student']
df_studentInfo=pd.merge(studentInfo, studentRegistration, how='outer', on=common_columns)

# checks on merge: no columns lost or duplicated, and no rows added by the outer join
assert studentInfo.shape[1] + studentRegistration.shape[1] - len(common_columns) == df_studentInfo.shape[1]
assert studentInfo.shape[0] == studentRegistration.shape[0] == df_studentInfo.shape[0]

# Merge courses with above (adds column module_presentation_length)
common_columns = _shared_columns(courses, df_studentInfo) # ['code_module', 'code_presentation']
df_studentInfo=pd.merge(courses, df_studentInfo, how='outer', on=common_columns)

# Merge assessments, studentAssessment dataframes
common_columns = _shared_columns(studentAssessments, assessments) # ['id_assessment']
df_assessment=pd.merge(studentAssessments, assessments, how='left', on=common_columns)

# Merge studentVLE, VLEdata dataframes
common_columns = _shared_columns(studentVLE, VLEdata) # 'code_module', 'code_presentation', 'id_site'
df_vle = pd.merge(studentVLE,VLEdata, how='left', on=common_columns)

# Merge df_studentInfo, df_assessment dataframes
common_columns = _shared_columns(df_assessment, df_studentInfo) # 'id_student', 'code_module', 'code_presentation'
df_student_assessment=pd.merge(df_assessment, df_studentInfo, how='left', on=common_columns)

# Downcast to smaller data types based on your column stats
df_vle = df_vle.astype({
    "id_student": "int32",
    "id_site": "int32",
    "date": "int16",
    "sum_click": "int16",
    "week_from": "Int8", # nullable integer type (Int8, Int16, Int32) instead of plain numpy ints allows for NaNs
    "week_to": "Int8" # nullable integer type (Int8, Int16, Int32) instead of plain numpy ints allows for NaNs
})

df_vle["code_module"] = df_vle["code_module"].astype("category")
df_vle["code_presentation"] = df_vle["code_presentation"].astype("category")
df_vle["activity_type"] = df_vle["activity_type"].astype("category")

df_student_assessment = df_student_assessment.astype({
    "id_assessment": "int32",
    "id_student": "int32",
    "date_submitted": "int16",
    "is_banked": "int8",
    "module_presentation_length": "int16",
    "num_of_prev_attempts": "int8",
    "studied_credits": "int16",
    "score": "float32",
    "due_date": "Int16",              # could use 'Int16' if you want integer + NaNs
    "weight": "float32",
    "date_registration": "float32",     # safer with NaNs
    "date_unregistration": "float32"
})

categorical_cols = ["code_module", "code_presentation", "assessment_type","gender", "region", 
                    "highest_education", "imd_band","age_band", "disability", "final_result"]
df_student_assessment[categorical_cols] = df_student_assessment[categorical_cols].astype("category")

# Merge df_student_assessment, df_vle dataframes
# NOTE(review): both frames can hold several rows per (module, presentation, student),
# so this join is presumably many-to-many -- the likely cause of the memory issue below.
common_columns = _shared_columns(df_vle, df_student_assessment) # 'code_module', 'code_presentation', 'id_student'
df=pd.merge(df_vle, df_student_assessment, how='left', on=common_columns)  # memory issue??

# df.to_csv("C:/Users/jacma/OneDrive/school work/UMD/Data Science Boot Camp/Group Project/Data/merged_data.csv", index=False)

# Drop the intermediate frames so only the merged `df` stays in memory
del assessments,courses,studentAssessments,studentInfo,studentRegistration,studentVLE,VLEdata,common_columns,df_vle,df_student_assessment,categorical_cols

# Data Loading (with merged csv saved)

In [None]:
### Load libraries, necessary data
import pandas as pd
# Pre-built streaming variant of OULAD (four outcome classes, per the file name)
streaming = pd.read_csv("C:/Users/jacma/OneDrive/school work/UMD/Data Science Boot Camp/Group Project/Data/Streaming_OULAD_dataset4classes.csv")

# Column notes for the merged data set loaded below:
# due_date: final submission date of the assessment calculated as the number of days since the start of the module
#           (the `date` column of assessments.csv, renamed to `due_date` before merging)
# date: NOTE(review): after that rename, the merged `date` column comes from the VLE tables, so it is
#       presumably the day of the VLE interaction (days since the start of the module) -- confirm against the OULAD docs
# sum_click: number of times a student interacts with the material in that day
# week_from: week from which the material is planned to be used
# week_to: week until which the material is planned to be used
# date_submitted: date of student submission, measured as the number of days since the start of the module presentation
# is_banked: status flag indicating that the assessment result has been transferred from a previous presentation
# score: 0 to 100. < 40 is Fail
# weight: % weight of the assessment. Typically, Exams are treated separately and have weight 100%; the sum of all other assessments is 100%
# module_presentation_length: comes from courses.csv; presumably the length of the module presentation in days -- TODO confirm
# num_of_prev_attempts: number times the student has attempted this module
# studied_credits: total number of credits for the modules the student is currently studying
# date_registration: number of days measured relative to the start of the module-presentation
# date_unregistration: number of days measured relative to the start of the module-presentation. Students who completed course have this field empty. withdraw as final result value in studentInfo.csv

# Merged OULAD tables, saved earlier as merged_data.csv
df = pd.read_csv("C:/Users/jacma/OneDrive/school work/UMD/Data Science Boot Camp/Group Project/Data/merged_data.csv")


# OULAD Streaming Data

### Data Exploration / Summary

In [None]:
import numpy as np

### Data Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
### histograms
### plots of potentially related variables
### creative plots - multiple encodings (try to demonstrate preliminary intuition of inclusion of certain variables)

# color by outcome class, see if there is grouping/clusters (suggests predictability)

### Modeling and Analysis

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.base import clone

# OULAD Merged Data

### Data Exploration / Summary

In [None]:
import numpy as np

In [None]:
# df.info()
### assess missingness, sparseness of variables (i.e. are any mostly 0?)
# --> missingno
# streaming.isna().sum()
# **many zeros**, may need to stratify / under or oversample

In [None]:
### Describe Continuous Variables
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of df
# that describe() includes by default.
# Earlier variant that skipped the first four columns:
#df.iloc[:,4:].describe()
df.describe()

In [None]:
### Checking Categorical variables - which vary within student
def nominal_vary_over_time_checker(data, variables, group_col='id_student'):
    """Measure how much each nominal variable varies within a group.

    For every name in `variables`, count the distinct non-missing values each
    group has, drop groups with no observed value at all, and average the
    remaining counts. A mean of exactly 1 means no group has more than one
    value for that variable.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing `group_col` and every column named in `variables`.
    variables : iterable of str
        Names of the nominal columns to check.
    group_col : str, default 'id_student'
        Column identifying the groups (students by default).

    Returns
    -------
    list of float
        One mean distinct-value count per entry of `variables`, in the same
        order. An entry is NaN when no group has any observed value.
    """
    means = []
    for var_name in variables:
        # nunique() ignores missing values, so a group with only NaNs counts 0
        distinct_per_group = data.groupby(group_col)[var_name].nunique()
        # exclude groups with 0 distinct values, i.e. information unavailable
        observed = distinct_per_group[distinct_per_group != 0]
        means.append(observed.mean())
    return means


# Mean number of distinct values per student for each categorical column of df
# (results come back in the same order as the column list below).
mean_counts = nominal_vary_over_time_checker(df,["code_module", "code_presentation", "assessment_type",'gender','region',
                                                 'highest_education','imd_band','age_band','disability','final_result'])
mean_counts
# code module: some students were in 2 or 3, (students took 1.0830 modules on average)
# code presentation: some students were in 2 or 3, (students took 1.0882 presentations on average)
# assessment type: some students took 2 or 3, or none, (students took 1.7860 different types of assessments)
# age band: some students may have aged into another band (very few students did, 1.0022)
# final result: if students had multiple assessments/modules, they could have multiple final results (no clear interpretation, this is just average "result consistency", 1.0594)
# gender, region, highest_education, imd_band, disability: exactly 1.0, i.e. constant within every student

del mean_counts

[np.float64(1.0829562015801182),
 np.float64(1.0882488302523587),
 np.float64(1.7859835503769705),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.002184715558602),
 np.float64(1.0),
 np.float64(1.0593728581220014)]

In [None]:
### Describe Categorical Variables
def variable_summary(data,cat_variables):
    """Tabulate absolute and relative frequencies of categorical columns.

    Returns a dict keyed by column name; each value is a two-element list
    `[counts, shares]`, where `counts` is the column's `value_counts()` and
    `shares` is `counts` divided by its own total.
    """
    def _counts_and_shares(column):
        # Frequencies first, then the same frequencies as a fraction of their sum
        frequencies = column.value_counts()
        return [frequencies, frequencies / frequencies.sum()]

    return {name: _counts_and_shares(data[name]) for name in cat_variables}

# One row per student: keep the first row found for each id_student
df_unique_students = df.drop_duplicates(
    subset='id_student',
    keep='first',
)
# df_unique_students.to_csv("C:/Users/jacma/OneDrive/school work/UMD/Data Science Boot Camp/Group Project/Data/unique_students.csv", index=False)

# Frequency tables for the variables that do not vary within student
discrete_vars = variable_summary(
    df_unique_students,
    ['gender','region','highest_education','imd_band','age_band','disability','final_result'],
)
print(discrete_vars)
# Findings:
# we have 26074 unique student IDs in the data set, but 3115 of them are missing at least one of the above variables
# so, out of 22959 students, 
#   -52.5% are male, 
#   -around 10% from each of Scotland, East Anglian Region, South Region, London Region, 
#   -44.4% had A level or equivalent education, 38% less than A level, about 1% for each of Post-Grad qualification and no formal qualification
#   -imd_band is somewhat uniformly distributed, all around 10% but 30-40 was 11.1% (most), 90-100 was 8.5% (least)
#   -almost 70% were aged 0-35, 30% aged 35-55, leaving only .7% over 55
#   -about 9.1% of the students were marked as having a disability 
#   -47% passed, 22% failed, 18% withdrawn, 13% distinction (ignores if students took multiple courses)

In [None]:
# df[df['id_student'] == 1852968].describe() # count tells us missingness
# calculate mean std of cts variables across students, if 0, it doesn't vary over time

### Data Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
### discrete variables
# Stacked bar chart of unique students per region, split by gender.

# Long-format counts: one row per (region, gender) pair, ordered by region
test = df_unique_students.groupby(['region', 'gender']).size().reset_index(name='count').sort_values(by='region')

# Wide-format counts: one row per region, one column per gender.
# Combinations that never occur become 0 so every stack has the same length.
gender_counts = test.pivot(index='region', columns='gender', values='count').fillna(0).sort_index()

# Take the tick labels from the same table as the bar heights so they stay aligned
regions = gender_counts.index.to_numpy()
x_pos = np.arange(len(regions))

bar_colors = ['skyblue', 'lightcoral']
stack_base = np.zeros(len(regions))  # running top of the stack for each region
for i, gender in enumerate(gender_counts.columns):
    heights = gender_counts[gender].to_numpy()
    plt.bar(x_pos, heights, bottom=stack_base, label=str(gender), color=bar_colors[i % len(bar_colors)])
    stack_base = stack_base + heights

# Adding labels, title, and legend
plt.xlabel('Region')
plt.ylabel('Number of students')
plt.title('Unique students by region and gender')
plt.xticks(x_pos, regions, rotation=45, ha='right')  # region names are long; rotate to avoid overlap
plt.legend(title='gender')
plt.show()

### histograms
### plots of potentially related variables
### creative plots - multiple encodings (try to demonstrate preliminary intuition of inclusion of certain variables)

# color by outcome class, see if there is grouping/clusters (suggests predictability)


# separate by modules, code presentation


# for cts variables (i.e. those that change over time), select only a certain number of students (at random) for plots
# --> plot number of interactions against date, color by time

### Modeling and Analysis

In [39]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.base import clone