# Read data from CSV

In [2]:
import sys
from pathlib import Path
# Add the parent directory to the module search path so the project's
# `scripts` package can be imported below. The entry is the relative path
# "..", so this presumably relies on the kernel's working directory being the
# notebook's own folder. This line must run before the `scripts` import.
sys.path.append(str(Path("..")))
from scripts.read_data import read_data

# `read_data` is a project helper whose source is not visible here; going by
# the notebook heading it loads the dataset from a CSV file.
data = read_data()
# Last expression of the cell: display a preview of the first rows.
data.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,single,2nd phase - general contingent,6th choice,Animation and Multimedia Design,Daytime,Secondary education,122.0,Portuguese,Basic Ed 3rd Cycle,Other - 11th Year,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,single,International student (bachelor),2nd choice,Tourism,Daytime,Secondary education,160.0,Portuguese,Secondary Education - 12th Year or Eq.,Higher Ed - Degree,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,single,1st phase - general contingent,6th choice,Communication Design,Daytime,Secondary education,122.0,Portuguese,Basic Ed 1st Cycle (4th/5th),Basic Ed 1st Cycle (4th/5th),...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,single,2nd phase - general contingent,3rd choice,Journalism and Communication,Daytime,Secondary education,122.0,Portuguese,Basic Ed 2nd Cycle (6th–8th),Basic Ed 1st Cycle (4th/5th),...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,married,Over 23 years old,2nd choice,Social Service (evening attendance),Evening,Secondary education,100.0,Portuguese,Basic Ed 1st Cycle (4th/5th),Basic Ed 2nd Cycle (6th–8th),...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [3]:
import pandas as pd
from scripts.consts import CATEGORICAL_COLUMN_NAMES

# Work only on the columns that are not listed as categorical.
numeric_data = data.drop(columns=CATEGORICAL_COLUMN_NAMES)

# One summary statistic per dict entry; after the transpose each statistic is
# a row and each source column is a column. Values are rounded to 2 decimals.
statistics = pd.DataFrame({
    "Mean": numeric_data.mean(numeric_only=True),
    "Median": numeric_data.median(numeric_only=True),
    "Min": numeric_data.min(numeric_only=True),
    "Max": numeric_data.max(numeric_only=True),
    "Standard deviation": numeric_data.std(numeric_only=True),
    "5. percentile": numeric_data.quantile(0.05, numeric_only=True),
    "95. percentile": numeric_data.quantile(0.95, numeric_only=True),
    "Missing values": numeric_data.isnull().sum()
}).T.round(2)

# `to_csv` raises OSError when the target directory does not exist, so create
# it first; this keeps the cell runnable on a fresh checkout.
numeric_stats_path = Path("../statistics/numeric_stats.csv")
numeric_stats_path.parent.mkdir(parents=True, exist_ok=True)

# reset_index() turns the statistic names into a regular column so they are
# written to the CSV even though index=False.
statistics.reset_index().to_csv(numeric_stats_path, index=False)

statistics

Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
Mean,132.61,126.98,23.27,0.71,6.27,8.3,4.71,10.64,0.14,0.54,6.23,8.06,4.44,10.23,0.15,11.57,1.23,0.0
Median,133.1,126.1,20.0,0.0,6.0,8.0,5.0,12.29,0.0,0.0,6.0,8.0,5.0,12.2,0.0,11.1,1.4,0.32
Min,95.0,95.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.6,-0.8,-4.06
Max,190.0,190.0,70.0,20.0,26.0,45.0,26.0,18.88,12.0,19.0,23.0,33.0,20.0,18.57,12.0,16.2,3.7,3.51
Standard deviation,13.19,14.48,7.59,2.36,2.48,4.18,3.09,4.84,0.69,1.92,2.2,3.95,3.01,5.21,0.75,2.66,1.38,2.27
5. percentile,110.0,103.42,18.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,7.6,-0.8,-4.06
95. percentile,157.0,153.5,41.0,6.0,11.0,15.0,9.0,14.86,1.0,4.0,10.0,15.0,8.0,14.98,1.0,16.2,3.7,3.51
Missing values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
categorical_data = data[CATEGORICAL_COLUMN_NAMES]

rows_count = len(categorical_data)

# `to_csv` raises OSError when the target directory does not exist, so create
# it once before the loop.
Path("../statistics/categorical_stats").mkdir(parents=True, exist_ok=True)

# For every categorical column: compute each category's share of all rows,
# save the full table to its own CSV file and print a short preview.
for column_name in CATEGORICAL_COLUMN_NAMES:

    # dropna=False keeps missing values as their own category in the counts.
    counts = categorical_data[column_name].value_counts(dropna=False)
    rows = []

    for category_name, count in counts.items():

        share = round(count / rows_count, 4)
        # Fixed two-decimal formatting ("68.40%" rather than "68.4%") keeps
        # the Percent column consistent across rows.
        percent = f"{share:.2%}"

        rows.append({
            "Category": category_name,
            "Share": share,
            "Percent": percent
        })

    column_categories = pd.DataFrame(rows)

    # "/" is a path separator, so replace it before using the column name as
    # a file name. This is done outside the f-string because reusing the
    # enclosing quote character inside an f-string expression is a
    # SyntaxError on Python versions before 3.12.
    safe_column_name = column_name.replace("/", " ")
    path = f"../statistics/categorical_stats/{safe_column_name}.csv"
    column_categories.to_csv(path, index=False)

    print("===== ", column_name, " =====")
    print("Missing: ", categorical_data[column_name].isna().sum())
    # Only the first rows are printed; the CSV holds every category.
    print(column_categories.head())
    print()
    

=====  Marital status  =====
Missing:  0
            Category   Share Percent
0             single  0.8858  88.58%
1            married  0.0857   8.57%
2           divorced  0.0206   2.06%
3        facto union  0.0057   0.57%
4  legally separated  0.0014   0.14%

=====  Application mode  =====
Missing:  0
                                       Category   Share Percent
0                1st phase - general contingent  0.3861  38.61%
1                2nd phase - general contingent  0.1971  19.71%
2                             Over 23 years old  0.1774  17.74%
3                              Change of course  0.0705   7.05%
4  Technological specialization diploma holders  0.0481   4.81%

=====  Application order  =====
Missing:  0
     Category   Share Percent
0  2nd choice  0.6840   68.4%
1  3rd choice  0.1236  12.36%
2  4th choice  0.0698   6.98%
3  5th choice  0.0563   5.63%
4  6th choice  0.0348   3.48%

=====  Course  =====
Missing:  0
                       Category   Share Percent
0 