# Best Algorithms
The Algorithms that will be used in the experimentation will be:

* Linear Regression + New Data powered to 2 + ShuffleSplit [4.43, 2.58]
* Decision Tree Regressor max_depth 6 + Polynomial Features degree 2 + ShuffleSplit [4.38, 75.8]
* Random Forest Regressor max_depth 10 + New Data Powered to 2 + ShuffleSplit [4.33, 8.47]

Nuevos
* Linear Regression + Old Data + PCA of 35 components + ShuffleSplit
* Decision Tree Regressor max_depth 6 + Old Data + PCA with 33 components + ShuffleSplit
* Random Forest Regressor max_depth 11 + New Data powered to 2 + ShuffleSplit

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import cross_val_predict
from sklearn.cross_validation import ShuffleSplit
from sklearn.preprocessing import PolynomialFeatures
%run -i ../../tools/predict_functions.py

In [175]:
def Prepare_data(data, c, New=False):
    """Returns the data ready to be used in a prediction.
    
    -data: full dataset
    -c: column/subject to predict
    
    returns:
        -X2: Predictive variables powered to 2
        -Poly_X: Predictive variables transformed by a PolynomialFeatures object with degree = 2
        -Y: Dataframe with the chosen target and the others"""
    data = data.sort_values(by=c)
    
    New_x_list = ['ESTU_GENERO', 'ESTU_ACT_PROX_ANNO', 'COD_INTERDISCIPLINAR', 'COLE_CARACTER', 'ESTU_RESIDE_DEPTO',
                  'FAMI_APORTANTES', 'FAMI_NUM_HERMANOS_EDUSUPERIOR', 'COLE_JORNADA', 'FAMI_OCUPA_MADRE', 'ESTU_CARRDESEADA_RAZON',
                  'FAMI_PERSONAS_HOGAR', 'ESTU_RAZONINSTITUTO', 'FAMI_OCUPA_PADRE', 'FAMI_EDUCA_PADRE', 'FAMI_NUM_HERMANOS',
                  'FAMI_EDUCA_MADRE', 'COLE_VALOR_PENSION', 'ESTU_RESIDE_MCPIO', 'ESTU_NACIMIENTO_MES',
                  'ESTU_IES_COD_DESEADA', 'ESTU_NACIMIENTO_DIA', 'ESTU_NACIMIENTO_ANNO', 'ESTU_CARRDESEADA_COD',
                  'COLE_COD_ICFES', 'FAMI_INGRESO_FMILIAR_MENSUAL']
    
    y_list = ['PUNT_BIOLOGIA', 'PUNT_MATEMATICAS', 'PUNT_FILOSOFIA', 'PUNT_FISICA', 'PUNT_HISTORIA', 'PUNT_QUIMICA', 
              'PUNT_LENGUAJE', 'PUNT_GEOGRAFIA', 'PUNT_INTERDISCIPLINAR', 'PUNT_IDIOMA']
    
    X_list = data.columns.difference(y_list)
    
    New_X_data = data.filter(items = New_x_list)
    Y = data.filter(items = y_list)
    X_data = data.filter(items = X_list)
    
    X2 = New_X_data**2
    
    pca_LR = PCA(n_components=35)
    LR_Data = pca_LR.fit_transform(X_data)
    
    pca_DT = PCA(n_components=33)
    DT_Data = pca_DT(X_data)
    
    return(X2, LR_Data, DT_Data, Y)

In [176]:
def Get_Scores(X2, LR_Data, DT_Data, Y, c):
    """Train the three models (LinearRegression, DecisionTreeRegressor, RandomForestRegressor), and 
    test them. All of this with a cross_val_score object
    
    -X2: Predictive variables powered to 2
    -Poly_X: Predictive variables transformed by a PolynomialFeatures object with degree = 2
    -Y: Dataframe with the chosen target and the others
    -c: Chosen column/subject
    
    Returns:
        -Scores of the trained models"""
    
    cv = ShuffleSplit(n = X2.shape[0], n_iter=5, test_size=0.2)
    
    LR = LinearRegression(n_jobs=4)
    LR_scores = cross_val_score(LR, LR_Data, Y[c], scoring='mean_absolute_error', cv = cv, n_jobs=4)
    
    DT = DecisionTreeRegressor(max_depth=6)
    DT_scores = cross_val_score(DT, DT_Data, Y[c], scoring='mean_absolute_error', cv = cv, n_jobs=4)
    
    RF = RandomForestRegressor(max_depth=10, n_jobs=4)
    RF_scores = cross_val_score(RF, X2, Y[c], scoring='mean_absolute_error', cv = cv, n_jobs=4)
    
    return(-LR_scores, -DT_scores, -RF_scores)

In [177]:
def Show_Score(LR_scores, DT_scores, RF_scores, i):
    """Shows the Mean Score and the Standard Deviation of the given scores."""
    print('Scores for the column:', i)
    print('Linear Regression:      ','Mean Score', np.mean(LR_scores), 'STD Scores', np.std(LR_scores))
    print('Decision Tree Regressor:', 'Mean Score', np.mean(DT_scores), 'STD Scores', np.std(DT_scores))
    print('Random Forest:          ', 'Mean Score', np.mean(RF_scores), 'STD Scores', np.std(RF_scores))

In [178]:
def Check_slice(lista):
    """Check if the Dataset contains all the required columns to train, and all the subject to predict.
    -lista: List of the columns of the dataset to be used."""
    lo = len(list(set(X_list)-set(lista)))
    Tl = len(list(set(y_list)-set(lista)))
    print('Targets Ready') if Tl == 0 else print('There are no ', Tl, 'in the DataFrame')
    print('Ready to Train') if lo == 0 else print('There are no ', lo, 'in the DataFrame: ', 
                                                  list(set(X_list)-set(lista)))

In [116]:
X_list = ['ESTU_GENERO', 'ESTU_ACT_PROX_ANNO', 'COD_INTERDISCIPLINAR', 'COLE_CARACTER', 'ESTU_RESIDE_DEPTO',
          'FAMI_APORTANTES', 'FAMI_NUM_HERMANOS_EDUSUPERIOR', 'COLE_JORNADA', 'FAMI_OCUPA_MADRE', 'ESTU_CARRDESEADA_RAZON',
          'FAMI_PERSONAS_HOGAR', 'ESTU_RAZONINSTITUTO', 'FAMI_OCUPA_PADRE', 'FAMI_EDUCA_PADRE', 'FAMI_NUM_HERMANOS',
          'FAMI_EDUCA_MADRE', 'COLE_VALOR_PENSION', 'ESTU_RESIDE_MCPIO', 'ESTU_NACIMIENTO_MES',
          'ESTU_IES_COD_DESEADA', 'ESTU_NACIMIENTO_DIA', 'ESTU_NACIMIENTO_ANNO', 'ESTU_CARRDESEADA_COD',
          'COLE_COD_ICFES', 'FAMI_INGRESO_FMILIAR_MENSUAL']
y_list = ['PUNT_BIOLOGIA', 'PUNT_MATEMATICAS', 'PUNT_FILOSOFIA', 'PUNT_FISICA', 'PUNT_HISTORIA', 'PUNT_QUIMICA', 
          'PUNT_LENGUAJE', 'PUNT_GEOGRAFIA', 'PUNT_INTERDISCIPLINAR', 'PUNT_IDIOMA']

____

# Testing of the functions

In [146]:
data_2000_2 = pd.read_csv('../../DatosFTPICFES/SABER11/SB11-BASES_DE_DATOS/READY/2000_2.csv', delimiter=';')

In [147]:
Check_slice(data_2000_2.columns)

Targets Ready
Ready to Train


In [148]:
i = 'PUNT_BIOLOGIA'
X2, Poly_X, Y = Prepare_data(data_2000_2, i)
LR_scores, DT_scores, RF_scores = Get_Scores(X2, Poly_X, Y, i)
Show_Score(LR_scores, DT_scores, RF_scores, i)

Training Linear Regression...
Training Decision Tree Regressor...
Training Random Forest Regressor...
Scores for the column: PUNT_BIOLOGIA
Linear Regression:       Mean Score 4.07021185276 STD Scores 0.0116656949613
Decision Tree Regressor: Mean Score 4.06089698179 STD Scores 0.00455511393733
Random Forest:           Mean Score 3.98533883479 STD Scores 0.00805709400176


First try... gets even a better score with the random forest

In [180]:
def Predict_all_Subjects(dataset):
    """Uses the Automation functions to train and test models for each Subject in the given Dataset.
    dataset: Dataset to be used for the models
    
    Returns:
        -One Dataset with all the Scores indexed by subject"""
    LR_List = []
    DT_List = []
    RF_List = []
    idx_List = []
    for i in y_list:
        X2, Poly_X, Y = Prepare_data(dataset, i)
        LR_scores, DT_scores, RF_scores = Get_Scores(X2, Poly_X, Y, i)
        Show_Score(LR_scores, DT_scores, RF_scores, i)
        idx_List.append(i)
        LR_List.append(np.mean(LR_scores))
        DT_List.append(np.mean(DT_scores))
        RF_List.append(np.mean(RF_scores))
    Scores_List = {'Subject': idx_List, 'LR': LR_List, 'DTR': DT_List, 'RFR': RF_List}
    Scores_DF = pd.DataFrame(Scores_List)
    return(Scores_DF)

In [152]:
LR_List = []
DT_List = []
RF_List = []
idx_List = []
for i in y_list:
    X2, Poly_X, Y = Prepare_data(data_2000_2, i)
    LR_scores, DT_scores, RF_scores = Get_Scores(X2, Poly_X, Y, i)
    Show_Score(LR_scores, DT_scores, RF_scores, i)
    idx_List.append(i)
    LR_List.append(np.mean(LR_scores))
    DT_List.append(np.mean(DT_scores))
    RF_List.append(np.mean(RF_scores))
Scores_List = {'Subject': idx_List, 'LR': LR_List, 'DTR': DT_List, 'RFR': RF_List}
Scores_2001_1 = pd.DataFrame(Scores_List)

Scores for the column: PUNT_BIOLOGIA
Linear Regression:       Mean Score 4.0598060564 STD Scores 0.00883410401984
Decision Tree Regressor: Mean Score 4.06791610287 STD Scores 0.0109249737968
Random Forest:           Mean Score 3.98404734235 STD Scores 0.00585179342112
Scores for the column: PUNT_MATEMATICAS
Linear Regression:       Mean Score 3.92437241153 STD Scores 0.00628508163154
Decision Tree Regressor: Mean Score 3.91676347222 STD Scores 0.00597959777981
Random Forest:           Mean Score 3.920904494 STD Scores 0.00923057636998
Scores for the column: PUNT_FILOSOFIA
Linear Regression:       Mean Score 4.79343867274 STD Scores 0.00539130964524
Decision Tree Regressor: Mean Score 4.76119503103 STD Scores 0.00736513241637
Random Forest:           Mean Score 4.71823034481 STD Scores 0.015255380715
Scores for the column: PUNT_FISICA
Linear Regression:       Mean Score 4.48546687208 STD Scores 0.010395521712
Decision Tree Regressor: Mean Score 4.49474340999 STD Scores 0.0141001763667
R

___

# Trying Fit with 1st Dataset and Test with 2nd Dataset

In [154]:
data_2000 = pd.read_csv('../../DatosFTPICFES/SABER11/SB11-BASES_DE_DATOS/READY/2000_1.csv', delimiter=';')

In [155]:
data_2000_2 = pd.read_csv('../../DatosFTPICFES/SABER11/SB11-BASES_DE_DATOS/READY/2000_2.csv', delimiter=';')

In [185]:
def Test_with_2sets(dataset1, dataset2, c):
    """Train three models with the first Dataset, and Test the models with the second Dataset,
    -Shows the Mean Absolute Error of each model.
    
    -dataset1: First dataset, will be used to train the models
    -dataset2: Second dataset, will be used to test the models
    -c: Column/Subject to predict
    
    Returns:
        -The trained Models
        -The Score of each Model"""
    X2, Poly_X, Y = Prepare_data(dataset1, c)
    X2_2, Poly_X_2, Y_2 = Prepare_data(dataset2, c)
    
    ##################TRAINING SIDE######################
    LR = LinearRegression(n_jobs=4);
    LR.fit(X2, Y[c]);
    
    DT = DecisionTreeRegressor(max_depth=6);
    DT.fit(Poly_X, Y[c]);
    
    RF = RandomForestRegressor(max_depth=10, n_jobs=4);
    RF.fit(X2, Y[c]);
    ####################################################
    ##################TESTING SIDE######################
    LR_Score = MAD(Y_2[c], LR.predict(X2_2))
    DT_Score = MAD(Y_2[c], DT.predict(Poly_X_2))
    RF_Score = MAD(Y_2[c], RF.predict(X2_2))
    print('Scores for the Columns/Subject:', c)
    print('Score LR:', LR_Score)
    print('Score DT:', DT_Score)
    print('Score RF:', RF_Score)
    print()
    ####################################################
    
    return(LR, LR_Score, DT, DT_Score, RF, RF_Score)

In [186]:
def Test_all_Subjects(dataset1, dataset2):
    """Uses the Test_with_2sets function to do the corresponding predictions for each Subject.
    
    Shows the Prediction of the Subject for each model. 
    
    dataset1: First dataset, will be used to predict
    dataset2: Second dataset, will be used to test
    
    Returns:
        -One DataFrame with all the Scores for each Subject."""
    LR_List = []
    DT_List = []
    RF_List = []
    idx_List = []
    for i in y_list:
        LR, LR_Score, DT, DT_Score, RF, RF_Score = Test_with_2sets(dataset1, dataset2, i)
        idx_List.append(i)
        LR_List.append(LR_score)
        DT_List.append(DT_score)
        RF_List.append(RF_score)
    Scores_List = {'Subject': idx_List, 'LR': LR_List, 'DTR': DT_List, 'RFR': RF_List}
    Score_DF = pd.DataFrame(Scores_List)
    return(Score_DF)

In [171]:
LR, DT, RF = Test_with_2sets(data_2000, data_2000_2, 'PUNT_BIOLOGIA')

Score LR: 104.44124393187163
Score DT: 5.112135524676685
Score RF: 5.164256732720748
