# Imports

In [1]:
import pandas as pd
import pyreadstat as spss
import os
import numpy as np
import random 

from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from scipy import stats
import plotly.graph_objs as go

# Functions and methods

In [2]:
# ETL STAGE
def train_val_test_split(db,propTrain=.7,propTest=.5):
    """Split ``db`` into train / validation / test frames, grade by grade.

    For every integer grade between the smallest and the largest value of
    ``db['LastGrade']``:

    * ``ceil(propTrain * n)`` of the grade's rows are drawn for training;
    * of the rows left over, ``ceil((1 - propTest) * leftover)`` are drawn
      for validation;
    * whatever is still unassigned goes to test.

    The global ``random`` generator is re-seeded with 42 on every call and
    the minimum / maximum grade are printed.

    Parameters
    ----------
    db : pandas.DataFrame
        Must contain a ``LastGrade`` column whose min and max can drive ``range``.
    propTrain : float
        Share of each grade assigned to training.
    propTest : float
        Share of the non-training remainder assigned to test.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(dbTrain, dbValidation, dbTest)``, each an independent copy.
    """
    random.seed(42)

    # Grade range present in the data
    grade_values = db['LastGrade'].values
    min_LG = np.min(grade_values)
    max_LG = np.max(grade_values)
    print('Minimum grade in DB '+str(min_LG))
    print('Maximum grade in DB '+str(max_LG))

    # Index labels collected for each of the three partitions
    train_labels = []
    val_labels = []
    test_labels = []

    for grade in range(min_LG, max_LG+1):
        grade_labels = db[db['LastGrade']==grade].index.tolist()

        n_train = np.ceil(propTrain*len(grade_labels)).astype(int)
        train_labels = train_labels + random.sample(grade_labels, k=n_train)

        # rows of this grade not taken by training
        leftover = list(set(grade_labels).difference(set(train_labels)))
        n_val = np.ceil((1-propTest)*len(leftover)).astype(int)
        val_labels = val_labels + random.sample(leftover, k=n_val)

        # everything that is neither train nor validation
        test_labels = test_labels + list(set(leftover).difference(set(val_labels)))

    return (
        db.loc[train_labels,:].copy(),
        db.loc[val_labels,:].copy(),
        db.loc[test_labels,:].copy(),
    )

In [3]:
class CustomStandarScaler():
    """Scale ``measure_L`` and ``measure_M`` by their per-school standard deviation.

    ``fit`` learns, for every ``Cod_Estab``, the sample standard deviation of
    both measures. ``transform`` divides each measure by the deviation of the
    row's school and stores the result in ``measure_L_std`` / ``measure_M_std``.
    No mean is subtracted.

    Rows are dropped when either scaled value is missing or not greater than
    -20. In practice this removes rows of schools that were not seen in
    ``fit``, schools whose deviation is undefined, and schools whose deviation
    is zero (which would otherwise produce infinite scores).
    """

    _SCALED_COLUMNS = ['measure_L_std', 'measure_M_std']

    def __init__(self):
        self.mean=None  # never populated; kept so the attribute still exists
        self.std=None   # DataFrame: Cod_Estab, measure_L_std, measure_M_std

    def fit(self,db):
        """Learn the per-school standard deviations from ``db`` and return ``self``."""
        self.std = (
            db.groupby(['Cod_Estab'])
              .agg(
                  measure_L_std=('measure_L','std'),
                  measure_M_std=('measure_M','std'),
              )
              .reset_index()
        )
        return self

    def transform(self,db):
        """Return a scaled, filtered copy of ``db``; the input is not modified.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.std is None:
            raise RuntimeError('CustomStandarScaler.transform called before fit')

        # Discard scaled columns left by a previous transform so the merge
        # below does not create *_x / *_y duplicates.
        db = db.drop(columns=self._SCALED_COLUMNS, errors='ignore')
        db = db.merge(self.std,how='left',on='Cod_Estab')

        for measure in ('measure_M','measure_L'):
            scaled = measure+'_std'
            # A zero deviation would give +/-inf; mask it so the row is
            # dropped by the filter below instead of reaching the models.
            deviation = db[scaled].where(db[scaled] > 0)
            db[scaled] = db[measure]/deviation

        keep = (db.measure_M_std > -20) & (db.measure_L_std > -20)
        # .copy() so callers can add columns without SettingWithCopyWarning
        return db[keep].copy()
        

In [4]:
class CustomEnsembleModel():
    """Per-grade voting ensemble of SVC, k-NN and decision-tree classifiers.

    For every grade ``g`` passed to ``fit`` a binary target ``LastGrade >= g``
    is built and three model families are trained on it (see ``_MODEL_GRID``).
    ``predict`` lets the models of each family vote, then lets the three
    family results vote again, and stores the outcome and its precision /
    recall / F1 as new columns of the evaluated frame.

    Attributes
    ----------
    listModelsByGrade : dict or None
        Maps each grade to
        ``[types_svc, models_svc, types_knn, models_knn, types_tree, models_tree]``.
        Indexing by grade (``listModelsByGrade[grade]``) works for any set of
        grades, not only consecutive ones starting at 4.
    listFeatures : list or None
        Feature columns used for training and prediction.
    grades : list or None
        Grades the ensemble was fitted for.
    """

    # (family name, configurations) in the order they are stored per grade.
    # Each configuration is the ``input`` tuple handed to ``trainFunc``.
    _MODEL_GRID = (
        ('SVC', [(kernel, []) for kernel in ('rbf', 'linear')]),
        ('KNC', [(1, weights, algorithm)
                 for weights in ('distance', 'uniform')
                 for algorithm in ('ball_tree', 'kd_tree', 'brute')]),
        ('DTC', [(splitter, []) for splitter in ('best', 'random')]),
    )

    def __init__(self):
        self.listModelsByGrade=None
        self.listFeatures=None
        self.grades=None

    def transform(self):
        """Placeholder; does nothing."""
        pass

    def fit(self,db,grades,listFeatures):
        """Train every configured model for every grade and return ``self``.

        Parameters
        ----------
        db : pandas.DataFrame
            Training data with the ``listFeatures`` columns and ``LastGrade``.
        grades : iterable
            Thresholds ``g`` for the binary targets ``LastGrade >= g``.
        listFeatures : list of str
            Feature column names.
        """
        self.listFeatures=listFeatures
        self.grades=grades

        listModelsByGrade={}
        for grade in grades:
            entry=[]
            for model_type, configs in self._MODEL_GRID:
                models=[]
                for config in configs:
                    models=models+self.trainFunc(db,listFeatures,config,model_type,grade)
                entry.append([model_type]*len(models))
                entry.append(models)
            # one entry per grade, looked up by the grade itself
            listModelsByGrade[grade]=entry
        self.listModelsByGrade=listModelsByGrade
        return self

    def trainFunc(self, db, listFeatures, input, type, Grade):
        """Fit one classifier on the target ``LastGrade >= Grade``.

        Parameters
        ----------
        db : pandas.DataFrame
            Training data; it is read but not modified.
        listFeatures : list of str
            Feature column names.
        input : tuple
            Model configuration: ``(k, weights, algorithm)`` for ``'KNC'``,
            ``(kernel, ...)`` for ``'SVC'``, ``(splitter, ...)`` for ``'DTC'``.
        type : str
            ``'KNC'``, ``'SVC'`` or ``'DTC'``.
        Grade : int
            Threshold of the binary target.

        Returns
        -------
        list
            A one-element list holding the fitted model.

        Raises
        ------
        ValueError
            If ``type`` is not one of the supported names.
        """
        X=db[listFeatures].values
        y=(db['LastGrade']>=Grade).astype(int).values
        if type=='KNC':
            # Trained on the 0/1 labels directly so that predict() returns
            # labels and needs no decoding against the evaluated data.
            model = KNeighborsClassifier(n_neighbors=input[0], weights=input[1], algorithm=input[2])
        elif type=='SVC':
            model = SVC(gamma="auto", kernel=input[0], random_state=42)
        elif type=='DTC':
            model = DecisionTreeClassifier(random_state=42, splitter=input[0])
        else:
            raise ValueError('Unknown model type: '+str(type))
        model.fit(X, y)
        return [model]

    def prediction(self, dbP, models, types, Grade):
        """Return the 0/1 predictions of ``models`` as an ``(n_rows, n_models)`` array.

        Only the feature columns of ``dbP`` are read; ``LastGrade`` is not
        needed here. ``types`` and ``Grade`` are accepted for compatibility
        with existing callers but are not used, since every model type now
        returns plain labels.

        Raises
        ------
        ValueError
            If ``models`` is empty.
        """
        if len(models)==0:
            raise ValueError('prediction() needs at least one model')
        X=dbP[self.listFeatures].values
        return np.column_stack([np.asarray(model.predict(X)).ravel() for model in models])

    def voter(self, dbP, preds, Grade):
        """Combine the columns of ``preds`` into one vote per row and score it.

        The per-row mode of ``preds`` is kept only on rows where its vote
        count equals the largest count found on any row; every other row is
        set to 0. The result is compared with ``dbP['LastGrade'] >= Grade``.

        Returns
        -------
        list
            ``[Precision, Recall, F1, y_pred]`` with the metrics rounded to two
            decimals. A metric is NaN when its denominator is zero. ``dbP`` is
            not modified.
        """
        y=(dbP['LastGrade']>=Grade).astype(int).values
        m = stats.mode(preds,axis=1)
        votes=np.asarray(m[0]).ravel()
        counts=np.asarray(m[1]).ravel()
        y_pred=votes*(counts>=np.max(counts))

        true_positive=np.sum((y_pred==1)&(y==1))
        n_predicted=np.sum(y_pred==1)
        n_actual=np.sum(y==1)
        # undefined metrics stay NaN, without the divide-by-zero warning
        Precision=true_positive/n_predicted if n_predicted else np.nan
        Recall=true_positive/n_actual if n_actual else np.nan
        denominator=Precision+Recall
        F1=2*(Precision*Recall)/denominator if denominator>0 else np.nan

        Precision=np.round(Precision,decimals=2)
        Recall=np.round(Recall,decimals=2)
        F1=np.round(F1,decimals=2)
        return [Precision,Recall,F1,y_pred]

    def predict(self, db):
        """Score ``db`` for every fitted grade and return it with result columns.

        For each grade the columns ``PrecisionLastGrade_<g>``,
        ``RecallLastGrade_<g>``, ``F1LastGrade_<g>`` (one value repeated on
        every row) and ``predLastGrade_<g>`` (row-level 0/1 prediction) are
        added to ``db`` in place, and a summary line is printed. ``db`` must
        contain ``LastGrade`` because the metrics are computed against it.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.listModelsByGrade is None:
            raise RuntimeError('CustomEnsembleModel.predict called before fit')

        for grade in self.grades:
            entry=self.listModelsByGrade[grade]

            # first round: one vote per model family (SVC, KNC, DTC)
            family_votes=[]
            for position in range(0,len(entry),2):
                types=entry[position]
                models=entry[position+1]
                preds=self.prediction(db,models,types,grade)
                family_votes.append(self.voter(db,preds,grade)[3])

            # second round: vote across the family results
            listPreds=np.column_stack(family_votes)
            metric=self.voter(db,listPreds,grade)

            db['PrecisionLastGrade_'+str(grade)]=metric[0] # saved for plotting
            db['RecallLastGrade_'+str(grade)]=metric[1] # saved for plotting
            db['F1LastGrade_'+str(grade)]=metric[2] # saved for plotting
            db['predLastGrade_'+str(grade)]=metric[3] # to identify students

            print('Grade='+str(grade)+' Precision='+str(np.round(metric[0],decimals=2))+' Recall='+str(np.round(metric[1],decimals=2))+' F1='+str(np.round(metric[2],decimals=2)))
        return db
    
    

# Cleaning data

## Student progression labels from the SIRE enrollment files

In [2]:
# DATA FOR 2014
# Read the SPSS (.sav) assessment file named below. Text is decoded as LATIN1
# and value labels are applied (apply_value_formats=True). `df` holds the data,
# `meta` the file metadata returned by pyreadstat.
# NOTE(review): absolute, machine-specific path - this cell only runs where
# that file exists.
DataFile='/Users/fipm/OneDrive/Research/Mineduc/BaseDatosPruebas/2014/2014 - 3o primaria/2014 - 3ro Primaria - Versión Final.sav'
df, meta = spss.read_sav(DataFile, encoding='LATIN1', apply_value_formats=True)


In [14]:
# Keep only the school code, the two score measures and the student code
db = df.filter(items=['Cod_Estab','measure_L','measure_M','Cod_Personal'])

In [16]:
# Student progression: for every yearly enrollment snapshot, flag the assessed
# students that are found enrolled in the grade expected for that year.

listEstu=db['Cod_Personal'].unique()
Path='/Users/fipm/OneDrive/Research/Mineduc/Sire/'

# One snapshot file per year, paired with the grade expected in that year
listDB=[
    'FOTO_ESTUDIANTE_2015_UTF8.csv',
    'FOTO_ESTUDIANTE_2016_UTF8.csv',
    'FOTO_ESTUDIANTE_2017_UFT8.csv',
    'FOTO_ESTUDIANTE_2018_UTF8.csv',
    'FOTO_ESTUDIANTE_2019_UTF8.csv',
    'FOTO_ESTUDIANTE_2020_UTF8.csv',
    ]
listGrados=[4,5,6,7,8,9]

# School level codes (compared below with COD_ESTABLECIMIENTO modulo 100):
#   41 bilingual pre-primary    42 pre-primary (parvulos)
#   43 primary (children)       44 primary (adults)
#   45 basico                   46 diversificado
# Both lookup tables are indexed by grade number.
Niveles=[43.,43.,43.,43.,43.,43.,43.,45.,45.,45.]
grados=['','','','TERCERO','CUARTO','QUINTO' ,'SEXTO' ,'PRIMERO BASICO','SEGUNDO BASICO','TERCERO BASICO']

# Result table: one row per assessed student. LastGrade starts at 3 and is
# increased by one for every snapshot in which the student is matched.
PSDF=pd.DataFrame({'COD_PERSONAL':listEstu})
PSDF['LastGrade']=3

for fileName, opGrade in zip(listDB, listGrados):
    print(fileName)
    studentDB = pd.read_csv(
        Path+fileName,
        sep='\t',
        index_col=False,
        encoding='utf-8',
        usecols=['COD_ESTABLECIMIENTO','COD_PERSONAL','COD_ESTUDIANTE','NOM_GRADO'],
    )

    grado=grados[opGrade]
    nivel=Niveles[opGrade]
    print('Grado : '+grado)
    print('Nivel : '+str(nivel))

    # assessed students present in this snapshot, in the expected grade and level
    eP=studentDB[studentDB['COD_PERSONAL'].isin(listEstu)].copy()
    eP['Nivel']=(eP['COD_ESTABLECIMIENTO']/1)%100
    eP=eP[(eP['NOM_GRADO']==grado) & (eP['Nivel']==nivel)]
    listSireEstu=eP['COD_PERSONAL'].unique()

    # 0/1 flag for this grade, accumulated into LastGrade
    name='G'+str(opGrade)
    PSDF[name]=PSDF['COD_PERSONAL'].isin(listSireEstu)*1
    PSDF['LastGrade']=PSDF['LastGrade']+PSDF[name]
    print('done '+str(opGrade)+' grade')

FOTO_ESTUDIANTE_2015_UTF8.csv
Grado : CUARTO
Nivel : 43.0
done 4 grade
FOTO_ESTUDIANTE_2016_UTF8.csv
Grado : QUINTO
Nivel : 43.0
done 5 grade
FOTO_ESTUDIANTE_2017_UFT8.csv
Grado : SEXTO
Nivel : 43.0
done 6 grade
FOTO_ESTUDIANTE_2018_UTF8.csv
Grado : PRIMERO BASICO
Nivel : 45.0
done 7 grade
FOTO_ESTUDIANTE_2019_UTF8.csv
Grado : SEGUNDO BASICO
Nivel : 45.0
done 8 grade
FOTO_ESTUDIANTE_2020_UTF8.csv
Grado : TERCERO BASICO
Nivel : 45.0
done 9 grade


In [21]:
# Attach the progression table to the assessment scores and export the result
db = (
    df.filter(items=['Cod_Estab','measure_L','measure_M','Cod_Personal'])
      .merge(PSDF, how='left', left_on='Cod_Personal', right_on='COD_PERSONAL')
      .drop(columns=['COD_PERSONAL'])
)
db.to_csv(
    '/Users/fipm/OneDrive/GitHubProjects/grade_progression/raw_data/data.csv',
    index=False,
)

# Main program

In [5]:
# Loading data
# The path is relative to the working directory: ../raw_data/data.csv.
# NOTE(review): the export cell writes data.csv to an absolute path under
# GitHubProjects/grade_progression/raw_data - confirm both point at the same file.
file_path = os.path.join('..','raw_data','data.csv')
db = pd.read_csv(file_path)

In [6]:
# Split by grade, then scale every part with deviations learned on train only
dbTrain, dbValidation, dbTest = train_val_test_split(db,propTrain=.7,propTest=.5)
css = CustomStandarScaler().fit(dbTrain)
dbTrain, dbValidation, dbTest = [
    css.transform(part) for part in (dbTrain, dbValidation, dbTest)
]

Minimum grade in DB 3
Maximum grade in DB 9


In [7]:
# Scaled measures that summarise student performance at school
listFeaturesQ=['measure_M_std','measure_L_std']

# First principal component of the two measures, fitted on the training part
# and then applied unchanged to all three parts
pca = PCA(n_components=1)
pca.fit(dbTrain[listFeaturesQ].values)
for split in (dbTrain, dbValidation, dbTest):
    split['studentPerformace'] = pca.transform(split[listFeaturesQ].values)[:,[0]]


In [8]:
# Ensemble learning, voting classifiers

listFeatures=['studentPerformace'] # for training

print('Variables used: ')
print(listFeatures)

# one binary model set is trained per threshold "LastGrade >= grade"
grades=[4,5,6,7,8,9]

# TRAINING STAGE
print('Training')

cem = CustomEnsembleModel()
cem.fit(dbTrain, grades, listFeatures)
# fit() returns the estimator itself, so the trained models are read from
# the attribute rather than from the return value
listModelsByGrade=cem.listModelsByGrade

Variables used: 
['studentPerformace']
Training


In [59]:
# Evaluate the already fitted ensemble on the training split (in-sample
# metrics); predict() also adds the per-grade result columns to dbTrain
print('Train')
dbTrain=cem.predict(dbTrain)  

Train
Grade=4 Precision=1.0 Recall=1.0 F1=1.0
Grade=5 Precision=1.0 Recall=1.0 F1=1.0
Grade=6 Precision=1.0 Recall=0.98 F1=0.99
Grade=7 Precision=1.0 Recall=0.65 F1=0.78
Grade=8 Precision=1.0 Recall=0.49 F1=0.66
Grade=9 Precision=1.0 Recall=0.28 F1=0.44


In [9]:
# VALIDATING STAGE
# predict() prints precision / recall / F1 per grade and adds the per-grade
# result columns to the frame it is given
print('Validation')
dbValidation=cem.predict(dbValidation)  

# TESTING STAGE
print('Test')
dbTest=cem.predict(dbTest)       
print('Done')  

Validation
Grade=4 Precision=0.9 Recall=0.88 F1=0.89
Grade=5 Precision=0.79 Recall=0.76 F1=0.77
Grade=6 Precision=0.74 Recall=0.68 F1=0.71
Grade=7 Precision=0.66 Recall=0.38 F1=0.48
Grade=8 Precision=0.62 Recall=0.28 F1=0.38
Grade=9 Precision=0.55 Recall=0.14 F1=0.22
Test
Grade=4 Precision=0.89 Recall=0.87 F1=0.88
Grade=5 Precision=0.8 Recall=0.75 F1=0.77
Grade=6 Precision=0.74 Recall=0.68 F1=0.71
Grade=7 Precision=0.66 Recall=0.37 F1=0.48
Grade=8 Precision=0.62 Recall=0.27 F1=0.38
Grade=9 Precision=0.59 Recall=0.14 F1=0.23
Done


In [10]:
# Preparing data for plots

# predict() stores each metric as one value repeated on every row, so the
# first row is enough. The table is built straight from `grades` instead of
# parsing the grade out of the column name, which only worked for
# single-digit grades.
dbTest2plot=pd.DataFrame(
    [
        {
            'LastGrade': str(grade),
            'Index Name': metric,
            'Index': dbTest[metric+'LastGrade_'+str(grade)].iloc[0],
        }
        for grade in grades
        for metric in ('Precision','Recall','F1')
    ],
    columns=['LastGrade','Index Name','Index'],
)

# Plot of metric performance: one bar group per grade threshold
data=[
    go.Bar(
        x=dbTest2plot.loc[dbTest2plot['LastGrade']==g,"Index Name"],
        y=dbTest2plot.loc[dbTest2plot['LastGrade']==g,"Index"],
        name='Attending Grade >='+str(g),
    )
    for g in dbTest2plot['LastGrade'].unique()
]
layout = go.Layout(title_text='Model Performace for Third Grade Students Progression in Guatemala',
                xaxis_title='Metrics',
                yaxis_title='Percentage'
                )

fig = go.Figure(data=data, layout=layout)
fig.show()