# ML task for school level dataset (regression and classification)

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('school_dataset_sample.csv')

In [3]:
# get only one year for test (possible years: 2018,2019,2020)
df = df[df['year']==2020]

In [12]:
df = df.reset_index()

In [13]:
col = [
       'eotypename_min', 'eoregname_min', 'eoareaname_min', 'eotername_min', 'eoparent_min',
       'region', 'sexM', 'sexF', 'phys_mat', 'foreign_phyl', 'universal', 'ukr_phyl', 'technology', 'bio_phys', 
       'economy', 'history', 'sport', 'math', 'law', 'other', 'martial_sport', 'philosophy', 'ecology', 'arts', 'geography', 
       'physics', 'chem_tech', 'biology', 'phys_chem', 'young_spec', 'qual_work', 'it_tech', 'bio_chem', 'bio_tech', 'hung_lang_max', 
       'ukr_lang_max', 'rus_lang_max', 'roman_lang_max', 'pol_lang_max', 'mold_lang_max', 'other_lang_max',

       "mathball100_mean",
       'Degree of institution', 'Number of full-time teaching staff in total', 'Number of full-time non-teaching staff in total', 
       'Number of full-time teaching staff of retirement age', 'Number of students in total', 'Number of classes in total', 
       'Total number of students and children receiving preschool education', 
       'Expenditures for the operation of the institution (general fund), UAH thousand (cash expenditures)', 
       'Expenditures for the operation of the institution (general fund), UAH thousand (plan)'
       
]

In [14]:
pd.set_option('display.max_columns', None)
df[col].head()

Unnamed: 0,eotypename_min,eoregname_min,eoareaname_min,eotername_min,eoparent_min,region,sexM,sexF,phys_mat,foreign_phyl,universal,ukr_phyl,technology,bio_phys,economy,history,sport,math,law,other,martial_sport,philosophy,ecology,arts,geography,physics,chem_tech,biology,phys_chem,young_spec,qual_work,it_tech,bio_chem,bio_tech,hung_lang_max,ukr_lang_max,rus_lang_max,roman_lang_max,pol_lang_max,mold_lang_max,other_lang_max,mathball100_mean,Degree of institution,Number of full-time teaching staff in total,Number of full-time non-teaching staff in total,Number of full-time teaching staff of retirement age,Number of students in total,Number of classes in total,Total number of students and children receiving preschool education,"Expenditures for the operation of the institution (general fund), UAH thousand (cash expenditures)","Expenditures for the operation of the institution (general fund), UAH thousand (plan)"
0,ліцей,Луганська область,Троїцький район,с.Демино-Олександрівка,"Відділ освіти, молоді та спорту Троїцької сели...",Луганська,6,3,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,98.0,I-III,15.0,6.0,2.0,62,8,,3620.1,4862.1
1,ліцей,Луганська область,Троїцький район,с.Лантратівка,"Відділ освіти, молоді та спорту Троїцької сели...",Луганська,2,2,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,98.0,I-III,17.0,9.0,4.0,96,11,,4298.7,5290.8
2,навчально-виховний комплекс,м.Київ,м.Київ. Шевченківський район міста,Шевченківський район міста,Управління освіти Шевченківської районної в мі...,місто Київ,23,25,0,19,0,0,0,0,0,0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,169.96875,I-III,53.0,34.0,14.0,782,27,,24347.516714,17562.874
3,ліцей,Луганська область,Троїцький район,смт Троїцьке,"Відділ освіти, молоді та спорту Троїцької сели...",Луганська,11,10,0,0,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,96.285714,I-III,41.0,25.0,5.0,362,21,,11349.3,14139.4
4,середня загальноосвітня школа,Сумська область,Глухівський район,смт Шалигине,"Відділ освіти, молоді та спорту Шалигинської с...",Сумська,5,4,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,56.0,I-III,29.0,22.0,4.0,149,15,,6697.0,7721.6


In [10]:
class Preprocessing:
    DATA_TYPES = ['train','test']

    def __init__(self,
                 drop_strange_st_data: bool = True,
                 column_for_onehot: list = [],
                 scale_columns: list = []):

        self.drop_strange_st_data = drop_strange_st_data
        self.column_for_onehot = column_for_onehot
        self.scale_columns = scale_columns

        self.encoder = OneHotEncoder(handle_unknown="ignore")
        self.scaler = MinMaxScaler()
        self.is_encoder_fit = False
        self.is_scaler_fit = False

    def process(self, 
                df: pd.DataFrame,
                data_type: str = 'train') -> pd.DataFrame:

        if data_type not in self.DATA_TYPES:
            raise ValueError(f'Unexpected data type. Data type must be one of {self.DATA_TYPES}.')
        
        if ((not self.is_encoder_fit) and (self.column_for_onehot)) and data_type == 'test':
            raise ValueError(f'Please, use train dataset first.')
            
        if ((not self.is_scaler_fit) and (self.scale_columns)) and data_type == 'test':
            raise ValueError(f'Please, use train dataset first.')

        if self.drop_strange_st_data:
            df = self._drop_strange_data_students_data(df)
        
        if self.column_for_onehot:
            df = self._one_hot_encoding(
                df = df,
                column_for_onehot = self.column_for_onehot,
                data_type = data_type
            )
            self.is_encoder_fit = True
        
        if self.scale_columns:
            df = self._scaling_columns(
                df = df,
                scale_columns = self.scale_columns,
                data_type = data_type
            )
            self.is_scaler_fit = True

        object_columns = self._get_object_columns(df)
        df,obj2num = self._encode_str_values(
            df = df,
            obj_columns = object_columns
        )

        self.obj2num = obj2num

        return df

    def _one_hot_encoding(self,
                          df: pd.DataFrame,
                          column_for_onehot: list,
                          data_type: str) -> pd.DataFrame:

        if data_type == 'train':
            data = self.encoder.fit_transform(df[column_for_onehot]).toarray()

        else: 
            data = self.encoder.transform(df[column_for_onehot]).toarray()

        df[self.encoder.get_feature_names_out(column_for_onehot)] = data
        df = df.drop(columns = column_for_onehot)

        return  df
    
    def _scaling_columns(self,
                         df: pd.DataFrame,
                         scale_columns: list,
                         data_type: str) -> pd.DataFrame:
        if data_type == 'train':
            df[scale_columns] = self.scaler.fit_transform(df[scale_columns])
        else: 
            df[scale_columns] = self.scaler.transform(df[scale_columns])

        return df
    
    @staticmethod
    def _encode_str_values(df: pd.DataFrame,
                           obj_columns: list) -> pd.DataFrame:
        obj2num = []
        for col in obj_columns:
            obj2num.append({col:{obj: i for i,obj in enumerate(set(df[col]))}})
            cur_num = obj2num[-1]
            df[col] = df[col].map(lambda x:cur_num[col][x])

        
        return (df,obj2num)

    @staticmethod
    def _drop_strange_data_students_data(df: pd.DataFrame) -> pd.DataFrame:
        return df.reset_index(drop=True)
    
    @staticmethod
    def _get_object_columns(df:pd.DataFrame) -> list:
        obj_col = []
        for column in df.columns:
            if df[column].dtype == 'object':
                obj_col.append(column)

        return obj_col

# Regression

In [16]:
X = df[col]

In [17]:
X = X.fillna(-1)

In [18]:
X.head()

Unnamed: 0,eotypename_min,eoregname_min,eoareaname_min,eotername_min,eoparent_min,region,sexM,sexF,phys_mat,foreign_phyl,universal,ukr_phyl,technology,bio_phys,economy,history,sport,math,law,other,martial_sport,philosophy,ecology,arts,geography,physics,chem_tech,biology,phys_chem,young_spec,qual_work,it_tech,bio_chem,bio_tech,hung_lang_max,ukr_lang_max,rus_lang_max,roman_lang_max,pol_lang_max,mold_lang_max,other_lang_max,mathball100_mean,Degree of institution,Number of full-time teaching staff in total,Number of full-time non-teaching staff in total,Number of full-time teaching staff of retirement age,Number of students in total,Number of classes in total,Total number of students and children receiving preschool education,"Expenditures for the operation of the institution (general fund), UAH thousand (cash expenditures)","Expenditures for the operation of the institution (general fund), UAH thousand (plan)"
0,ліцей,Луганська область,Троїцький район,с.Демино-Олександрівка,"Відділ освіти, молоді та спорту Троїцької сели...",Луганська,6,3,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,98.0,I-III,15.0,6.0,2.0,62,8,-1.0,3620.1,4862.1
1,ліцей,Луганська область,Троїцький район,с.Лантратівка,"Відділ освіти, молоді та спорту Троїцької сели...",Луганська,2,2,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,98.0,I-III,17.0,9.0,4.0,96,11,-1.0,4298.7,5290.8
2,навчально-виховний комплекс,м.Київ,м.Київ. Шевченківський район міста,Шевченківський район міста,Управління освіти Шевченківської районної в мі...,місто Київ,23,25,0,19,0,0,0,0,0,0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,169.96875,I-III,53.0,34.0,14.0,782,27,-1.0,24347.516714,17562.874
3,ліцей,Луганська область,Троїцький район,смт Троїцьке,"Відділ освіти, молоді та спорту Троїцької сели...",Луганська,11,10,0,0,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,96.285714,I-III,41.0,25.0,5.0,362,21,-1.0,11349.3,14139.4
4,середня загальноосвітня школа,Сумська область,Глухівський район,смт Шалигине,"Відділ освіти, молоді та спорту Шалигинської с...",Сумська,5,4,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,56.0,I-III,29.0,22.0,4.0,149,15,-1.0,6697.0,7721.6


In [19]:
train_df,test_df = train_test_split(X,test_size=0.2,random_state=42)

In [20]:
onehotencode_col = ["eotypename_min", "Degree of institution"]

In [21]:
preprocessor = Preprocessing(
    drop_strange_st_data = False,
    column_for_onehot = onehotencode_col
)

In [22]:
preprocess_train_df = preprocessor.process(
    df = train_df,
    data_type = 'train'
)
preprocess_train_df

Unnamed: 0,eoregname_min,eoareaname_min,eotername_min,eoparent_min,region,sexM,sexF,phys_mat,foreign_phyl,universal,ukr_phyl,technology,bio_phys,economy,history,sport,math,law,other,martial_sport,philosophy,ecology,arts,geography,physics,chem_tech,biology,phys_chem,young_spec,qual_work,it_tech,bio_chem,bio_tech,hung_lang_max,ukr_lang_max,rus_lang_max,roman_lang_max,pol_lang_max,mold_lang_max,other_lang_max,mathball100_mean,Number of full-time teaching staff in total,Number of full-time non-teaching staff in total,Number of full-time teaching staff of retirement age,Number of students in total,Number of classes in total,Total number of students and children receiving preschool education,"Expenditures for the operation of the institution (general fund), UAH thousand (cash expenditures)","Expenditures for the operation of the institution (general fund), UAH thousand (plan)",eotypename_min_гімназія,eotypename_min_колегіум,eotypename_min_ліцей,eotypename_min_навчально-виховне об'єднання,eotypename_min_навчально-виховний комплекс,eotypename_min_середня загальноосвітня школа,eotypename_min_спеціальна загальноосвітня школа,eotypename_min_спеціалізована школа,Degree of institution_0,Degree of institution_I,Degree of institution_I-II,Degree of institution_I-III,Degree of institution_II-III,Degree of institution_III
4983,9,357,682,725,17,16,16,0,0,0,0,0,0,0,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,143.000000,33.0,25.0,3.0,410,22,-1.0,9709.600000,10662.00000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1867,1,250,1693,748,3,3,6,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,126.400000,17.0,15.0,0.0,68,7,-1.0,3198.780000,4686.40000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1652,17,182,3111,1134,15,8,9,0,0,0,0,0,0,0,0,0,0,0,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,110.000000,28.0,16.0,5.0,306,17,-1.0,6877.700000,8892.40000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
387,6,351,585,538,19,11,15,0,0,0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,109.071429,25.0,16.0,3.0,318,15,-1.0,7696.500000,8943.30000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5909,11,16,2174,867,2,11,5,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,130.000000,21.0,17.0,7.0,269,13,-1.0,6276.812000,8081.80800,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,10,593,2915,956,5,1,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,1,0,0,0,0,0,138.000000,17.0,12.0,3.0,128,11,-1.0,4202.700000,5501.80000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5191,14,215,1072,349,11,22,24,0,0,0,23,0,0,0,0,0,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,154.347826,53.0,31.0,2.0,820,30,-1.0,14666.100000,3831.40000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5226,23,317,683,610,21,17,26,0,17,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,131.809524,41.0,27.0,9.0,700,25,-1.0,12223.400000,19191.60000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5390,8,450,36,1182,13,2,2,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,112.333333,15.0,17.0,3.0,115,11,-1.0,7926.312295,7317.95015,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [23]:
preprocess_test_df = preprocessor.process(
    df = test_df,
    data_type = 'test'
)

In [24]:
rf_reg = RandomForestRegressor(
    random_state=42,
    max_depth=10,
    n_estimators=500
)

In [25]:
y_train = preprocess_train_df['mathball100_mean']
X_train = preprocess_train_df.drop(columns=['mathball100_mean'])
rf_reg.fit(X_train,y_train)

RandomForestRegressor(max_depth=10, n_estimators=500, random_state=42)

In [26]:
rf_reg.score(X_train,y_train)

0.5517610685751313

In [27]:
y_test = preprocess_test_df['mathball100_mean']
X_test = preprocess_test_df.drop(columns=['mathball100_mean'])
rf_reg.score(X_test,y_test)

0.20544528707538579

In [28]:
from sklearn.metrics import mean_squared_error
y_pred = rf_reg.predict(X_test)
mean_squared_error(y_test, y_pred)

1379.7121979992523

In [29]:
y_pred[:5],y_test[:5]

(array([124.49472074, 108.25188707, 119.47572706, 109.5324418 ,
        105.06759694]),
 2966     87.250000
 6961    136.666667
 3214     97.842105
 437      83.428571
 263     114.000000
 Name: mathball100_mean, dtype: float64)

# Classification

In [30]:
X = df[col]
X = X.fillna(-1)

In [31]:
def encode_statuc_binary(x):
    if x > 120:
        return 1
    return 0

X['mathball_status'] = X.mathball100_mean.map(encode_statuc_binary)

In [32]:
train_df,test_df = train_test_split(X, test_size=0.2, random_state=42)

In [33]:
onehotencode_col = ["eotypename_min", "Degree of institution"]
preprocessor = Preprocessing(
    drop_strange_st_data = False,
    column_for_onehot = onehotencode_col
)

In [34]:
preprocess_train_df = preprocessor.process(
    df = train_df,
    data_type = 'train'
)
preprocess_test_df = preprocessor.process(
    df = test_df,
    data_type = 'test'
)

In [35]:
rf_cf = RandomForestClassifier(
    random_state=42,
    max_depth=10,
    n_estimators=500
)

In [36]:
y_train = preprocess_train_df['mathball_status']
X_train = preprocess_train_df.drop(columns=['mathball_status', 'mathball100_mean'])
rf_cf.fit(X_train,y_train)

RandomForestClassifier(max_depth=10, n_estimators=500, random_state=42)

In [37]:
rf_cf.score(X_train,y_train)

0.7633832976445396

In [38]:
y_test = preprocess_test_df['mathball_status']
X_test = preprocess_test_df.drop(columns=['mathball_status', 'mathball100_mean'])
rf_cf.score(X_test,y_test)

0.6423982869379015

In [39]:
pred_train_y = rf_cf.predict(X_train)
pred_test_y = rf_cf.predict(X_test)

In [40]:
print('TRAIN DATA:')
print(classification_report(y_train,pred_train_y))
print('\n\n')
print('TEST DATA:')
print(classification_report(y_test,pred_test_y))

TRAIN DATA:
              precision    recall  f1-score   support

           0       0.69      0.87      0.77      2583
           1       0.86      0.67      0.75      3021

    accuracy                           0.76      5604
   macro avg       0.78      0.77      0.76      5604
weighted avg       0.78      0.76      0.76      5604




TEST DATA:
              precision    recall  f1-score   support

           0       0.59      0.73      0.65       643
           1       0.71      0.57      0.63       758

    accuracy                           0.64      1401
   macro avg       0.65      0.65      0.64      1401
weighted avg       0.66      0.64      0.64      1401



## Subjects classification task compare

In [4]:
subject = {
    'Ukrainian': 'ukrball100_mean',
    'History': 'histball100_mean',
    'Math':'mathball100_mean',
    'Physics': 'physball100_mean',
    'Chemistry': 'chemball100_mean',
    'Biology': 'bioball100_mean',
    'Geography':'geoball100_mean',
    'English':'engball100_mean',
    'French': 'fraball100_mean',
    'German': 'deuball100_mean',
    'Spanish': 'spaball100_mean',
}

In [5]:
# col = [
#        'sexM', 'sexF', 'phys_mat', 'foreign_phyl',
#        'universal', 'ukr_phyl', 'technology', 'bio_phys', 'economy', 'history',
#        'sport', 'math', 'law', 'other', 'martial_sport', 'philosophy',
#        'ecology', 'arts', 'geography', 'physics', 'chem_tech', 'biology',
#        'phys_chem', 'young_spec', 'qual_work', 'hung_lang_max', 'ukr_lang_max',
#        'rus_lang_max', 'roman_lang_max', 'pol_lang_max', 'mold_lang_max',
#        'other_lang_max', 'eotypename_min', 'eoregname_min', 'eoareaname_min',
#        'eotername_min', 'eoparent_min',
#        'Ступінь закладу ( I, I-II, I-III)',
#        'Кількість штатних працівників педагогічного персоналу  разом',
#        'Кількість штатних працівників непедагогічного персоналу  разом',
#        'Кількість штатних працівників педагогічного персоналу  пенсійного віку',
#        'Питома вага непедагогічного персоналу у загальній кількості працівників, %**',
#        'Питома вага педагогічних працівників пенсійного віку у загальній кількості педагогічних працівників, %**',
#        'Кількість учнів разом', 'Кількість класів разом',
#        'Фактична наповнюваність класів, учнів на клас**',
#        'Показник співвідношення кількості учнів на одного вчителя, учнів на вчителя**',
#        'Видатки на функціонування закладу (загальний фонд), тис.грн.\n2018рік (касові видатки)',
#        'Видатки на функціонування закладу (загальний фонд), тис.грн.\n2019 рік (план)',
#        'Видатки на 1 учня, 2018, тис. грн.**',
#        'Видатки на 1 клас, тис. грн.** 2018 рік (каса)',
#        'Разом кількість учнів, та дітей, які здобувають дошкільну освіту',
#        'Видатки на функціонування закладу (загальний фонд), тис.грн.\n2019 рік (касові видатки)',
#        'Видатки на функціонування закладу (загальний фонд), тис.грн.\n2020 рік (план)',
#        'Видатки на 1 учня, 2019, тис. грн.**',
#        'Видатки на 1 клас, тис. грн.** 2019 рік (каса)', 'Опорна',
#        'Код бюджету', 'Форма власності', 'Код ДІСО', 'ЄДРПОУ', 'Примітка',
#        'Питома вага непедагогічного персоналу у загальній кількості працівників, %',
#        'Питома вага педагогічних працівників пенсійного віку у загальній кількості педагогічних працівників, %',
#        'Фактична наповнюваність класів, учнів на клас',
#        'Показник співвідношення кількості учнів на одного вчителя, учнів на вчителя',
#        'Видатки на функціонування закладу (загальний фонд), тис.грн.\n2020 рік (касові видатки)',
#        'Видатки на функціонування закладу (загальний фонд), тис.грн.\n2021 рік (план)',
#        'Видатки на 1 учня, 2020, тис. грн.',
#        'Видатки на 1 клас, тис. грн. 2020 рік (каса)'
# ]

In [6]:
col = [
       'eotypename_min', 'eoregname_min', 'eoareaname_min', 'eotername_min', 'eoparent_min',
       'region', 'sexM', 'sexF', 'phys_mat', 'foreign_phyl', 'universal', 'ukr_phyl', 'technology', 'bio_phys', 
       'economy', 'history', 'sport', 'math', 'law', 'other', 'martial_sport', 'philosophy', 'ecology', 'arts', 'geography', 
       'physics', 'chem_tech', 'biology', 'phys_chem', 'young_spec', 'qual_work', 'it_tech', 'bio_chem', 'bio_tech', 'hung_lang_max', 
       'ukr_lang_max', 'rus_lang_max', 'roman_lang_max', 'pol_lang_max', 'mold_lang_max', 'other_lang_max',
       'Degree of institution', 'Number of full-time teaching staff in total', 'Number of full-time non-teaching staff in total', 
       'Number of full-time teaching staff of retirement age', 'Number of students in total', 'Number of classes in total', 
       'Total number of students and children receiving preschool education', 
       'Expenditures for the operation of the institution (general fund), UAH thousand (cash expenditures)', 
       'Expenditures for the operation of the institution (general fund), UAH thousand (plan)'
       
]

In [51]:
df = pd.read_csv('school_dataset_sample.csv')

In [52]:
len(df)

18479

In [53]:
from sklearn import metrics

In [54]:
len(df)

18479

In [72]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

def encode_statuc_binary(x):
    if x > 120:
        return 1
    return 0


data_metrics = []
for year in [2018,2019,2020]:
    for subject_name, score in subject.items():
        column = col.copy()
        column.append(score)
        X = df[df['year']==year].copy()

        X = X[column]
        X = X.dropna(subset=[score])
        X = X.reset_index()
        X = X.fillna(-1)

        
        X[f'{score[:-8]}_status'] = X[score].map(encode_statuc_binary)

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        auc_scores = []
        auc_baseline_scores = []
        acc_scores = []
        acc_baseline_scores = []
        f1_scores = []
        f1_baseline_scores = []
        for train_index,test_index in skf.split(X, X[f'{score[:-8]}_status'] ):
            train_df = X.iloc[train_index,:]
            test_df = X.iloc[test_index,:]

            onehotencode_col = ["eotypename_min", "Degree of institution"]
            preprocessor = Preprocessing(
                drop_strange_st_data = False,
                column_for_onehot = onehotencode_col
            )

            preprocess_train_df = preprocessor.process(
                df = train_df,
                data_type = 'train'
            )
            preprocess_test_df = preprocessor.process(
                df = test_df,
                data_type = 'test'
            )

            rf_cf = RandomForestClassifier(
                random_state=42,
                max_depth=20,
                n_estimators=200,
        
            )
            dummy_clf = DummyClassifier(strategy="most_frequent", random_state=42)

            y_train = preprocess_train_df[f'{score[:-8]}_status']
            X_train = preprocess_train_df.drop(columns=[f'{score[:-8]}_status', score])
            rf_cf.fit(X_train,y_train)
            dummy_clf.fit(X_train,y_train)

            y_test = preprocess_test_df[f'{score[:-8]}_status']
            X_test = preprocess_test_df.drop(columns=[f'{score[:-8]}_status', score])

            y_pred_test = rf_cf.predict(X_test)
            fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_test)

            #baseline
            y_pred_test_baseline = dummy_clf.predict(X_test)
            fpr_baseline, tpr_baseline, thresholds = metrics.roc_curve(y_test, y_pred_test_baseline)

            acc_scores.append(metrics.accuracy_score(y_test, y_pred_test))
            acc_baseline_scores.append(metrics.accuracy_score(y_test, y_pred_test_baseline))
            auc_scores.append(metrics.auc(fpr, tpr))
            auc_baseline_scores.append(metrics.auc(fpr_baseline, tpr_baseline))
            f1_scores.append(metrics.f1_score(y_test, y_pred_test))
            f1_baseline_scores.append(metrics.f1_score(y_test, y_pred_test_baseline))

        data_metrics.append(
            {
                "subject": subject_name,
                "year": year,
                "metric": "f1",
                "score": sum(f1_scores)/len(f1_scores),
                "baseline_score": sum(f1_baseline_scores)/len(f1_baseline_scores)
            }
        )

        data_metrics.append(
            {
                "subject": subject_name,
                "year": year,
                "metric": "AUC",
                "score": sum(auc_scores)/len(auc_scores),
                "baseline_score": sum(auc_baseline_scores)/len(auc_baseline_scores),
            }
        )

        data_metrics.append(
            {
                "subject": subject_name,
                "year": year,
                "metric": "Accuracy",
                "score": sum(acc_scores)/len(acc_scores),
                "baseline_score": sum(acc_baseline_scores)/len(acc_baseline_scores),
            }
        )

df_metric = pd.DataFrame(data_metrics)
df_metric.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

Unnamed: 0,subject,year,metric,score,baseline_score
0,Ukrainian,2018,f1,0.803726,0.78032
1,Ukrainian,2018,AUC,0.682774,0.5
2,Ukrainian,2018,Accuracy,0.731751,0.639774
3,History,2018,f1,0.549162,0.0
4,History,2018,AUC,0.664458,0.5


In [73]:
import altair  as alt

# charts = []
# for year in [2018, 2019, 2020]:
#     charts.append(alt.Chart(df_metric[df_metric['year']==year]).mark_bar().encode(
#     column=alt.Column('subject'),
#     x=alt.X('metric', title=None),
#     y=alt.Y('score'),
#     color=alt.Color('metric', scale=alt.Scale())
#     ).properties(
#         width=50, 
#         height=100,
#         title=alt.TitleParams(
#             text=f'Classification metrics. Dataset year {year}',
#             fontSize=20,
#             anchor='middle'
#         )))

# charts[0]&charts[1]&charts[2]

In [74]:
# alt.Chart(df_metric[df_metric['year']==2020]).mark_bar().encode(
#     column=alt.Column('subject'),
#     x=alt.X('metric', title=None),
#     y=alt.Y('score'),
#     color=alt.Color('metric', scale=alt.Scale())
#     ).properties(
#         width=50, 
#         height=200,
#         title=alt.TitleParams(
#             text=f'Classification metrics. Dataset year {year}',
#             fontSize=20,
#             anchor='middle'
#         ))

In [75]:
# add acurracy, baseline dashline, Cross-validation

# Multiclass

In [76]:
plots = []
first=True
for subj in subject.keys():
    if first:
        chart = alt.Chart(df_metric[(df_metric['year']==2020)&(df_metric['subject']==subj)]).mark_bar().encode(
            x=alt.X('metric', title=None, axis=alt.Axis(labelAngle=-45)),
            y=alt.Y('score', title='score'),
            color=alt.Color('metric', scale=alt.Scale(scheme='accent'))
        ).properties(
            width=30, 
            height=200,
            title=alt.TitleParams(
                text=subj,
                fontSize=10,
                anchor='middle'
            ))
        line_y = alt.Chart(
            df_metric[(df_metric['year']==2020)&(df_metric['subject']==subj)]
            ).mark_line(strokeDash=[2, 3], stroke='red').encode(
            y=alt.Y('baseline_score', title='score'),
            x = alt.X('metric', title=None, axis=alt.Axis(labelAngle=-45)),
            size=alt.value(2)
        )
    else:
        chart = alt.Chart(df_metric[(df_metric['year']==2020)&(df_metric['subject']==subj)]).mark_bar().encode(
            x=alt.X('metric', title=None, axis=alt.Axis(labelAngle=-45)),
            y=alt.Y('score',axis=None, title='score'),
            color=alt.Color('metric', scale=alt.Scale())
        ).properties(
            width=30, 
            height=200,
            title=alt.TitleParams(
                text=subj,
                fontSize=10,
                anchor='middle'
            ))
        line_y = alt.Chart(
            df_metric[(df_metric['year']==2020)&(df_metric['subject']==subj)]
            ).mark_line(strokeDash=[2, 3], stroke='red').encode(
            y=alt.Y('baseline_score',axis=None,  title='score'),
            x = alt.X('metric', title=None, axis=alt.Axis(labelAngle=-45)),
            size=alt.value(2)
        )
    plots.append(chart+line_y)
    first=False

In [77]:
alt.hconcat(*plots).resolve_scale(
    y='shared'
).properties(
        title=alt.TitleParams(
            text=f'Classification metrics. Dataset year {year}',
            fontSize=20,
            anchor='middle'
        ))

In [78]:
years_plots = []

for year in [2018, 2019, 2020]:
    plots = []
    first=True
    for subj in subject.keys():
        if first:
            chart = alt.Chart(df_metric[(df_metric['year']==year)&(df_metric['subject']==subj)]).mark_bar().encode(
                x=alt.X('metric', title=None, axis=alt.Axis(labelAngle=-45)),
                y=alt.Y('score', title='score'),
                color=alt.Color('metric', scale=alt.Scale(scheme='accent'))
            ).properties(
                width=30, 
                height=100,
                title=alt.TitleParams(
                    text=subj,
                    fontSize=10,
                    anchor='middle'
                ))
            line_y = alt.Chart(
                df_metric[(df_metric['year']==year)&(df_metric['subject']==subj)]
                ).mark_line(strokeDash=[2, 3], stroke='red').encode(
                y=alt.Y('baseline_score', title='score'),
                x = alt.X('metric', title=None, axis=alt.Axis(labelAngle=-45)),
                size=alt.value(2)
            )
        else:
            chart = alt.Chart(df_metric[(df_metric['year']==year)&(df_metric['subject']==subj)]).mark_bar().encode(
                x=alt.X('metric', title=None, axis=alt.Axis(labelAngle=-45)),
                y=alt.Y('score',axis=None, title='score'),
                color=alt.Color('metric', scale=alt.Scale())
            ).properties(
                width=30, 
                height=100,
                title=alt.TitleParams(
                    text=subj,
                    fontSize=10,
                    anchor='middle'
                ))
            line_y = alt.Chart(
                df_metric[(df_metric['year']==year)&(df_metric['subject']==subj)]
                ).mark_line(strokeDash=[2, 3], stroke='red').encode(
                y=alt.Y('baseline_score',axis=None, title='score'),
                x = alt.X('metric', title=None, axis=alt.Axis(labelAngle=-45)),
                size=alt.value(2)
            )
        plots.append(chart+line_y)
        first=False

    years_plots.append(alt.hconcat(*plots).resolve_scale(
    y='shared'
    ).properties(
            title=alt.TitleParams(
                text=f'Classification metrics. Dataset year {year}',
                fontSize=20,
                anchor='middle'
            )))

In [79]:
years_plots[0]&years_plots[1]&years_plots[2]