In [77]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import Markdown as md
from datetime import date, datetime
%matplotlib inline
#sns.set(rc={"figure.figsize":(20, 20)})

from pandas import DataFrame
from typing import List
%matplotlib inline
import numpy as np

In [78]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score
from sklearn.metrics import f1_score

In [79]:
# COLUMNS
FECHAI = 'Fecha-I'
FECHAO = 'Fecha-O'
VLOO = 'Vlo-O'
ORIO = 'Ori-O'
DESO = 'Des-O'
EMPO = 'Emp-O'
VLOI = 'Vlo-I'
DIA = 'DIA'
MES = 'MES'
ANIO = 'AÑO'
DIANOM = 'DIANOM'
TIPOVUELO = 'TIPOVUELO'
OPERA = 'OPERA'
SIGLADES = 'SIGLADES'
COHORT = 'cohort'
# 
TEMPORADA_ALTA = 'temporada_alta'
DIF_MIN = 'dif_min'
ATRASO_15 = 'atraso_15'
PERIODO_DIA = 'periodo_dia'
MORNING = 'mañana'
AFTERNOON = 'tarde'
NIGHT = 'noche'

COL_TEMPORAL = 'col_tmp'

DATE = 'Date'
TIME = 'time'
HOUR = 'hour'
MINUTE = 'minute'


COLUM_FILTER = [FECHAI,VLOI, DIA, MES, ANIO, DIANOM, 
                TIPOVUELO, OPERA, SIGLADES, TEMPORADA_ALTA, ATRASO_15, PERIODO_DIA,
                 'AnteriorDelay', 'AnteriorEarly', 'number_of_flights_same_opera',
                'number_of_flights_before', 'number_of_flights_same_dest']

COLUM_SYNTHETIC_FEATURES = [TEMPORADA_ALTA, DIF_MIN, ATRASO_15, PERIODO_DIA]




STR_WINTER_START = '2017-12-15'
STR_WINTER_END = '2017-12-31'
STR_WINTER_BIS_START = '2017-01-01'
STR_WINTER_BIS_END = '2017-03-03'
STR_JULY_START = '2017-07-15'
STR_JULY_END = '2017-07-31'
STR_SEPT_START = '2017-09-11'
STR_SEPT_END = '2017-09-30'


STR_MANANA_START= '5:00'
STR_MANANA_END = '11:59'
STR_TARDE_START = '12:00'
STR_TARDE_END = '18:59'

PARAM_GRID = {
    'n_estimators': [10, 20, 50, 10, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 8, 16, 32, 64, 128, 256, 512],
    'criterion': ['gini', 'entropy'],
    'n_jobs': [2]
}


columns = [TEMPORADA_ALTA, DIF_MIN, ATRASO_15, PERIODO_DIA]
def convert_str_to_date(str_date: str) -> datetime:
    """convert string to date format '%Y-%m-%d'

    Args:
        str_date (str): string ex: '2017-01-01'

    Returns:
        datetime: string with datetime type
    """
    return datetime.strptime(str_date, '%Y-%m-%d').date()

def convert_str_to_time(str_date: str) -> datetime:
    """convert string to time format '%H:%M'

    Args:
        str_date (str): string '23:59'

    Returns:
        datetime: string with datetime type
    """
    return datetime.strptime(str_date, '%H:%M').time()

def add_temporada_alta_flag(df: DataFrame, dates: List) -> DataFrame:
    """Add temporada alta flag according to array of dates

    Args:
        df (DataFrame): _description_
        dates (List): _description_

    Returns:
        DataFrame: _description_
    """
    df[TEMPORADA_ALTA] = 0
    for date in dates:
        df.loc[(df[DATE] >= date[0]) & (df[DATE]<=date[1]), TEMPORADA_ALTA] = 1
    return df.copy()
def add_periodo_dia_flag(df: DataFrame, times: List) -> DataFrame:
    """Add temporada alta flag according to array of dates

    Args:
        df (DataFrame): _description_
        dates (List): _description_

    Returns:
        DataFrame: _description_
    """
    # default
    df[PERIODO_DIA] = NIGHT
    df.loc[(df[TIME] >= times[0][0]) & (df[TIME]<=times[0][1]), PERIODO_DIA] = MORNING
    df.loc[(df[TIME] >= times[1][0]) & (df[TIME]<=times[1][1]), PERIODO_DIA] = AFTERNOON
    return df.copy()

In [80]:
def generate_temporada_alta_set()-> list:
    """
    This function generate arrays to generate temporada_alta elements, source from constant
    dates ahre the following, 
        15-Dic y 3-Mar, o 15-Jul y 31-Jul, o 11-Sep y 30-Sep,
    output: array of element, with the following 
    """
    str_winter_start = convert_str_to_date(STR_WINTER_START)
    str_winter_end = convert_str_to_date(STR_WINTER_END)
    str_winter_bis_start = convert_str_to_date(STR_WINTER_BIS_START)
    str_winter_bis_end = convert_str_to_date(STR_WINTER_BIS_END)
    str_July_start = convert_str_to_date(STR_JULY_START)
    str_July_end = convert_str_to_date(STR_JULY_END)
    str_Sept_start = convert_str_to_date(STR_SEPT_START)
    str_Sept_end = convert_str_to_date(STR_SEPT_END)

    dates = [[str_winter_start, str_winter_end],
             [str_winter_bis_start, str_winter_bis_end],
             [str_July_start, str_July_end],
             [str_Sept_start, str_Sept_end]]
    return dates

def generate_day_section():
    """
    split a day iin three part, morning, afternoon and night
    """
    str_mañana_start = convert_str_to_time(STR_MANANA_START)
    str_mañana_end = convert_str_to_time(STR_MANANA_END)
    str_tarde_start = convert_str_to_time(STR_TARDE_START)
    str_tarde_end = convert_str_to_time(STR_TARDE_END)
    times = [[str_mañana_start, str_mañana_end],
             [str_tarde_start, str_tarde_end],
            ]
    return times

def split_date(df:DataFrame, column_name:str)-> DataFrame:
    """_summary_

    Args:
        df (DataFrame): _description_
        column_name (str): _description_

    Returns:
        DataFrame: _description_
    """
    df[column_name] = pd.to_datetime(df[column_name])
    df[DATE] = df[column_name].dt.date
    df[TIME] = df[column_name].dt.time
    return df.copy()

def add_temporada_alta_flag(df: DataFrame, dates: List) -> DataFrame:
    """Add temporada alta flag according to array of dates

    Args:
        df (DataFrame): _description_
        dates (List): _description_

    Returns:
        DataFrame: _description_
    """
    df[TEMPORADA_ALTA] = 0
    for date in dates:
        df.loc[(df[DATE] >= date[0]) & (df[DATE]<=date[1]), TEMPORADA_ALTA] = 1
    return df.copy()

def add_periodo_dia_flag(df: DataFrame, times: List) -> DataFrame:
    """Add temporada alta flag according to array of dates

    Args:
        df (DataFrame): _description_
        dates (List): _description_

    Returns:
        DataFrame: _description_
    """
    # default
    df[PERIODO_DIA] = NIGHT
    df.loc[(df[TIME] >= times[0][0]) & (df[TIME]<=times[0][1]), PERIODO_DIA] = MORNING
    df.loc[(df[TIME] >= times[1][0]) & (df[TIME]<=times[1][1]), PERIODO_DIA] = AFTERNOON
    return df.copy()

def add_dif_min_and_atraso_15(df:DataFrame) ->DataFrame:
    df[DIF_MIN] = df[FECHAO]-df[FECHAI]
    df[DIF_MIN] = df[DIF_MIN].dt.total_seconds()//60
    df[ATRASO_15] = 0
    df.loc[df[DIF_MIN]>15, ATRASO_15] = 1
    return df



In [81]:
def delete_operational_columns(df:DataFrame)-> DataFrame:
    del df[FECHAO]
    del df[VLOO]
    del df[ORIO]
    del df[DESO]
    del df[EMPO]
    return df

def filter_column(df:DataFrame, columns:str =  COLUM_FILTER)->DataFrame:
    return df[columns]


def previous_flight (df):
    df.sort_values(FECHAI, inplace = True)
    df["Early"] = df[DIF_MIN].shift(1)
    df['AnteriorEarly'] = 0
    df.loc[df["Early"]<0, 'AnteriorEarly'] = 1
    df.loc[df['AnteriorEarly'].isna(), 'AnteriorEarly'] = 0
    df["Delay"] = df[DIF_MIN].shift(1)
    df['AnteriorDelay'] = 0
    df.loc[df["Delay"]>0, 'AnteriorDelay'] = 1
    del df['Delay']
    del df['Early']
    return df

def number_of_flights_before (df):

    df.sort_values(FECHAI, inplace = True)
    dft = df.groupby([FECHAI])[FECHAI].count().rename("col_tmp")
    dft = dft.reset_index()
    dft["number_of_flights_before"] = dft["col_tmp"].shift(1)
    del dft['col_tmp']
    df = pd.merge(df,dft)
    df.loc[df['number_of_flights_before'].isna(), 'number_of_flights_before'] = 0

    return df


def number_of_flights_same_opera(df):
    dft = df.groupby([FECHAI, 'OPERA'])[FECHAI].count().rename("number_of_flights_same_opera")
    dft = dft.reset_index()
    dft['number_of_flights_same_opera'] = dft['number_of_flights_same_opera'] -1 
    df = pd.merge(df,dft)
    df.loc[df['number_of_flights_same_opera'].isna(), 'number_of_flights_same_opera'] = 0
    return df

def number_of_flights_same_dest(df):
    dft = df.groupby([FECHAI, 'SIGLADES'])[FECHAI].count().rename("number_of_flights_same_dest")
    dft = dft.reset_index()
    dft['number_of_flights_same_dest'] = dft['number_of_flights_same_dest'] -1 
    df = pd.merge(df,dft)
    df.loc[df['number_of_flights_same_dest'].isna(), 'number_of_flights_same_dest'] = 0
    return df

def one_hot_encoder(df:DataFrame)-> DataFrame:
    
    df = pd.get_dummies(df, columns = [OPERA])
    df = pd.get_dummies(df, columns = [SIGLADES])
    df = pd.get_dummies(df, columns = [TEMPORADA_ALTA]) 
    df = pd.get_dummies(df, columns = [PERIODO_DIA]) 
    df = pd.get_dummies(df, columns = [VLOI])
    df = pd.get_dummies(df, columns = [DIANOM])
    return df 


def to_bool_tipo_vuelo(df:DataFrame)-> DataFrame:
    df[f"{TIPOVUELO}_"] = df[TIPOVUELO]
    df.loc[df[f"{TIPOVUELO}_"]== 'I', TIPOVUELO] = 1
    df.loc[df[f"{TIPOVUELO}_"]== 'N', TIPOVUELO] = 0
    del df[f"{TIPOVUELO}_"]
    return df

def fit_grid_model(
    rfc: RandomForestClassifier,
    param_grid,
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
):  # pragma: no cover
    """GridSeach

    Args:
        rfc (RandomForestClassifier): ML Model
        param_grid (dict): JSON parameters
        X_train (pd.DataFrame): train df
        y_train (pd.DataFrame): goal df
        model_type (str): model name

    Returns:
        [type]: model trained
    """
    model = GridSearchCV(
        estimator=rfc,
        param_grid=param_grid,
        cv=3,
    )
    model.fit(X_train, y_train)
    return model

In [141]:
def fit_compute_importance(clf, X, y, X_test, y_test, k = 4):
    kf = KFold(n_splits=4)
    kf.get_n_splits(X)
    for i in range(0,5):
        print(f"iteration: {i}")
        for train_index, test_index in kf.split(X_train):
            X_train_, X_test_ = X.iloc[train_index], X.iloc[test_index]
            y_train_, y_test_ = y.iloc[train_index], y.iloc[test_index]
            clf.fit(X_train_, y_train_)
            y_pred = clf.predict(X_test_)
    
        y_pred = clf.predict(X_test)
        print(f"f1 score test set: {f1_score(y_test, y_pred, zero_division=1)*100:.2f}%\n")

In [142]:
dates = generate_temporada_alta_set()
times = generate_day_section()


no sirven
* Ori-I y Pri-o un sólo lugar de origen
* Des-I y SIGLADES son lo mismo
- OPERA == Emp-I

modificadores
- Vlo-I != Vlo-O, es diferente hubo atrasosi el número de vuelo cambio o el origen cambio, significa un retraso? Des-I
- Des-I != Des-O, es diferente, atraso
# no se usaran para evitar sesgos
- Fecha-O


# All features

In [143]:
df = pd.read_csv("../sources/raw/dataset_SCL.csv", dtype={'Vlo-O':str, 'Vlo-I':str})
df[FECHAO] = pd.to_datetime(df[FECHAO])
df = (df.pipe(split_date, FECHAI)
        .pipe(add_temporada_alta_flag, dates)
        .pipe(add_periodo_dia_flag, times)
        #.pipe(apply_cohort)
        .pipe(add_dif_min_and_atraso_15)
        .pipe(delete_operational_columns)
        .pipe(previous_flight)
        .pipe(number_of_flights_before)
        .pipe(number_of_flights_same_opera)
        .pipe(number_of_flights_same_dest)
        .pipe(filter_column, COLUM_FILTER)
        .pipe(to_bool_tipo_vuelo)
        .pipe(one_hot_encoder) # generate one_hot_encoder
     )

In [144]:
columns = df.columns
columns = columns.tolist()
columns.remove(FECHAI)
df = df[columns]
columns.remove(ATRASO_15)

In [157]:
df[ATRASO_15].unique().tolist()

[0, 1]

In [160]:
df[columns]

Unnamed: 0,DIA,MES,AÑO,TIPOVUELO,AnteriorDelay,AnteriorEarly,number_of_flights_same_opera,number_of_flights_before,number_of_flights_same_dest,OPERA_Aerolineas Argentinas,...,Vlo-I_993,Vlo-I_9955,Vlo-I_9956,DIANOM_Domingo,DIANOM_Jueves,DIANOM_Lunes,DIANOM_Martes,DIANOM_Miercoles,DIANOM_Sabado,DIANOM_Viernes
0,1,1,2017,1,0,0,0,0.0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,1,2017,1,0,1,0,1.0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,1,2017,1,0,1,0,1.0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1,1,2017,1,1,0,0,1.0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,1,2017,1,1,0,0,1.0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68201,31,12,2017,1,0,1,0,1.0,0,0,...,0,0,0,1,0,0,0,0,0,0
68202,1,1,2018,1,1,0,0,2.0,0,0,...,0,0,0,0,0,1,0,0,0,0
68203,31,12,2017,1,1,0,0,1.0,0,0,...,0,0,0,1,0,0,0,0,0,0
68204,1,1,2018,1,0,1,0,1.0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [167]:
X_train, X_test, y_train, y_test = train_test_split(
        df[columns],
        df[ATRASO_15],
        test_size=.8,
        random_state=42,
        stratify = df[ATRASO_15]
    )

In [168]:
# Select 4 new classifiers
clfs = {#"Logistic Regression": LogisticRegression(solver = 'saga',  random_state=0,  max_iter=99 ),
        "Support Vector rbf": SVC(kernel='rbf'),
        "Support Vector sigmoid": SVC(kernel='sigmoid'),
        "KNN":KNeighborsClassifier(n_neighbors=15),
        "Decision Tree": DecisionTreeClassifier(),
        "GradientBoostingClassifier": GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=42)
       }

In [169]:
X_train.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

In [170]:
from imblearn.over_sampling import SMOTE

In [172]:
#Print results
for name, clf in clfs.items():
    print("====="*20)
    print(f"{name} classifier\n")
    
    sm = SMOTE(k_neighbors=5)
    X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train, y_train)
    
    #fit_compute_importance(clf, X_train, y_train, X_test, y_test, k = 4)
    fit_compute_importance(clf, X_train_oversampled, y_train_oversampled, X_test, y_test)

Support Vector rbf classifier

iteration: 0
f1 score test set: 0.00%

iteration: 1
f1 score test set: 0.00%

iteration: 2
f1 score test set: 0.00%

iteration: 3
f1 score test set: 0.00%

iteration: 4
f1 score test set: 0.00%

Support Vector sigmoid classifier

iteration: 0
f1 score test set: 0.00%

iteration: 1
f1 score test set: 0.00%

iteration: 2
f1 score test set: 0.00%

iteration: 3
f1 score test set: 0.00%

iteration: 4
f1 score test set: 0.00%

KNN classifier

iteration: 0
f1 score test set: 9.94%

iteration: 1
f1 score test set: 9.94%

iteration: 2
f1 score test set: 9.94%

iteration: 3
f1 score test set: 9.94%

iteration: 4
f1 score test set: 9.94%

Decision Tree classifier

iteration: 0
f1 score test set: 28.51%

iteration: 1
f1 score test set: 28.42%

iteration: 2
f1 score test set: 28.56%

iteration: 3
f1 score test set: 28.52%

iteration: 4
f1 score test set: 28.81%

GradientBoostingClassifier classifier

iteration: 0
f1 score test set: 18.43%

iteration: 1
f1 score test s

In [173]:
X_train, X_test, y_train, y_test = train_test_split(
        df[columns],
        df[ATRASO_15],
        test_size=.8,
        random_state=42
    )

In [174]:
# Select 4 new classifiers
clfs = {#"Logistic Regression": LogisticRegression(solver = 'saga',  random_state=0,  max_iter=99 ),
        "Support Vector rbf": SVC(kernel='rbf'),
        "Support Vector sigmoid": SVC(kernel='sigmoid'),
        "KNN":KNeighborsClassifier(n_neighbors=15),
        "Decision Tree": DecisionTreeClassifier(),
        "GradientBoostingClassifier": GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=42)
       }

In [175]:
X_train.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

In [170]:
from imblearn.over_sampling import SMOTE

In [176]:
#Print results
for name, clf in clfs.items():
    print("====="*20)
    print(f"{name} classifier\n")
    
    sm = SMOTE(k_neighbors=5)
    X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train, y_train)
    
    #fit_compute_importance(clf, X_train, y_train, X_test, y_test, k = 4)
    fit_compute_importance(clf, X_train_oversampled, y_train_oversampled, X_test, y_test)

Support Vector rbf classifier

iteration: 0
f1 score test set: 0.00%

iteration: 1
f1 score test set: 0.00%

iteration: 2
f1 score test set: 0.00%

iteration: 3
f1 score test set: 0.00%

iteration: 4
f1 score test set: 0.00%

Support Vector sigmoid classifier

iteration: 0
f1 score test set: 0.00%

iteration: 1
f1 score test set: 0.00%

iteration: 2
f1 score test set: 0.00%

iteration: 3
f1 score test set: 0.00%

iteration: 4
f1 score test set: 0.00%

KNN classifier

iteration: 0
f1 score test set: 9.96%

iteration: 1
f1 score test set: 9.96%

iteration: 2
f1 score test set: 9.96%

iteration: 3
f1 score test set: 9.96%

iteration: 4
f1 score test set: 9.96%

Decision Tree classifier

iteration: 0
f1 score test set: 27.76%

iteration: 1
f1 score test set: 27.65%

iteration: 2
f1 score test set: 27.99%

iteration: 3
f1 score test set: 28.01%

iteration: 4
f1 score test set: 28.29%

GradientBoostingClassifier classifier

iteration: 0
f1 score test set: 17.81%

iteration: 1
f1 score test s

In [167]:
X_train, X_test, y_train, y_test = train_test_split(
        df[columns],
        df[ATRASO_15],
        test_size=.8,
        random_state=42,
        stratify = df[ATRASO_15]
    )

In [168]:
# Select 4 new classifiers
clfs = {#"Logistic Regression": LogisticRegression(solver = 'saga',  random_state=0,  max_iter=99 ),
        "Support Vector rbf": SVC(kernel='rbf'),
        "Support Vector sigmoid": SVC(kernel='sigmoid'),
        "KNN":KNeighborsClassifier(n_neighbors=15),
        "Decision Tree": DecisionTreeClassifier(),
        "GradientBoostingClassifier": GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=42)
       }

In [169]:
X_train.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

In [170]:
from imblearn.over_sampling import SMOTE

In [177]:
#Print results
for name, clf in clfs.items():
    print("====="*20)
    print(f"{name} classifier\n")
    
    
    #fit_compute_importance(clf, X_train, y_train, X_test, y_test, k = 4)
    fit_compute_importance(clf, X_train, y_train, X_test, y_test)

Support Vector rbf classifier

iteration: 0
f1 score test set: 0.00%

iteration: 1
f1 score test set: 0.00%

iteration: 2
f1 score test set: 0.00%

iteration: 3
f1 score test set: 0.00%

iteration: 4
f1 score test set: 0.00%

Support Vector sigmoid classifier

iteration: 0
f1 score test set: 0.00%

iteration: 1
f1 score test set: 0.00%

iteration: 2
f1 score test set: 0.00%

iteration: 3
f1 score test set: 0.00%

iteration: 4
f1 score test set: 0.00%

KNN classifier

iteration: 0
f1 score test set: 9.96%

iteration: 1
f1 score test set: 9.96%

iteration: 2
f1 score test set: 9.96%

iteration: 3
f1 score test set: 9.96%

iteration: 4
f1 score test set: 9.96%

Decision Tree classifier

iteration: 0
f1 score test set: 28.03%

iteration: 1
f1 score test set: 28.24%

iteration: 2
f1 score test set: 28.21%

iteration: 3
f1 score test set: 27.56%

iteration: 4
f1 score test set: 28.34%

GradientBoostingClassifier classifier

iteration: 0
f1 score test set: 17.81%

iteration: 1
f1 score test s

In [None]:
import numpy as np
from sklearn.decomposition import PCA

In [47]:
pca = PCA(n_components=10)
pca.fit(X_train)

PCA(n_components=10)

In [48]:
print(pca.explained_variance_ratio_)

[0.81266497 0.12878776 0.00430308 0.00403015 0.00397122 0.00369971
 0.00353327 0.00328659 0.00260366 0.0018815 ]
[1033.13013499  411.27902755   75.17771933   72.75451877   72.22058661
   69.70803813   68.12208643   65.70101133   58.47786571   49.71093624]


In [50]:
scaler = StandardScaler()

# Fit on training set only.
scaler.fit(X_train)

# Apply transform to both the training set and the test set.
X_train_pca = scaler.transform(X_train)
X_test_pca = scaler.transform(X_test)

In [56]:
# Print results
for name, clf in clfs.items():
    print("====="*20)
    print(f"{name} classifier\n")
    fit_compute_importance(clf, X_train_pca, y_train, X_test_pca, y_test)

Decision Tree classifier

Mean accuracy score on the test set: 74.09%

GradientBoostingClassifier classifier

Mean accuracy score on the test set: 81.43%



In [59]:
y_train.unique()

array([0, 1])

In [61]:
rfc_st = RandomForestClassifier(random_state=42)
model = fit_grid_model(rfc_st, PARAM_GRID, X_train_pca, y_train)
y_pred = model.predict(X_test_pca)

1

In [63]:
t = model.predict(X_test)

In [67]:
np.unique(t)

array([0])

In [70]:
f1_score(y_test, y_pred,labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')

0.11846565566458521

In [None]:
import matplotlib.pyplot as plt

feat_importances = pd.DataFrame(clf.feature_importances_, index=X_train.columns, columns=["Importance"])
feat_importances.sort_values(by='Importance', ascending=False, inplace=True)
feat_importances.plot(kind='bar', figsize=(8,6))