In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [None]:
#mount Google Drive inside the Colab runtime at /content/gdrive
#(only works when the notebook is executed in Google Colab)
from google.colab import drive 
drive.mount('/content/gdrive')

In [None]:
#single place to edit when the input files live somewhere else
DATA_DIR = '/content/gdrive/MyDrive/Data Science Pro/Projekt końcowy'

#features and targets of the train / test split, stored as separate CSV files
X_train = pd.read_csv(f'{DATA_DIR}/input_data_train_X')
y_train = pd.read_csv(f'{DATA_DIR}/input_data_train_y')
X_test = pd.read_csv(f'{DATA_DIR}/input_data_test_X')
y_test = pd.read_csv(f'{DATA_DIR}/input_data_test_y')

In [None]:
#encode the target labels as integer class codes; the mapping is learned on
#the training labels and the same mapping is then applied to the test labels
#NOTE(review): y_train / y_test come straight from pd.read_csv, so they are
#DataFrames at this point - presumably single-column; confirm, because
#LabelEncoder expects 1-D input
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [None]:
def ts_col_names(first_ts_col, last_ts_col, frame=None):
  '''
  Return the column names found at positions [first_ts_col, last_ts_col).

  first_ts_col: position of the first column to include.
  last_ts_col: position just past the last column to include.
  frame: DataFrame whose columns are inspected; when omitted the global
    X_train is used.

  Only the column index is sliced, so no data is copied.
  '''
  source = X_train if frame is None else frame
  return source.columns[first_ts_col:last_ts_col].tolist()

In [None]:
#every group below is a window of 24 consecutive columns of X_train,
#addressed by the position of its first column
#NOTE: positions refer to X_train as loaded (before 'main_working_id' is dropped);
#positions 253-276 and 301-324 are not assigned to any group here
_TS_WIDTH = 24


def _hourly_window(start):
  'Names of the 24 columns starting at position `start`.'
  return ts_col_names(start, start + _TS_WIDTH)


#count_eX
counts_e2 = _hourly_window(13)
counts_e3 = _hourly_window(37)
counts_e4 = _hourly_window(61)
counts_e5 = _hourly_window(85)
counts_e6plus = _hourly_window(109)
#sum_eX
sum_e2 = _hourly_window(133)
sum_e3 = _hourly_window(157)
sum_e4 = _hourly_window(181)
sum_e5 = _hourly_window(205)
sum_e6plus = _hourly_window(229)
#number_of_rock_bursts
num_rock_bursts = _hourly_window(277)
#highest_bump_energy
h_bump_energy = _hourly_window(325)

#g-activity / g-energy series and their differences
max_gactivities = _hourly_window(349)
max_genergies = _hourly_window(373)
avg_gactivities = _hourly_window(397)
avg_genergies = _hourly_window(421)
max_diff_gactivities = _hourly_window(445)
max_diff_genergies = _hourly_window(469)
avg_diff_gactivities = _hourly_window(493)
avg_diff_genergies = _hourly_window(517)

In [None]:
#groups of time series columns used in the later calculations
#series that get descriptive statistics
ts_to_agg = [
    max_gactivities,
    max_genergies,
    avg_gactivities,
    avg_genergies,
]
#difference series: statistics plus absolute-value statistics
ts_to_agg_diff = [
    max_diff_gactivities,
    max_diff_genergies,
    avg_diff_gactivities,
    avg_diff_genergies,
]
#series that are summed over the whole window
ts_to_sum = [
    counts_e2, counts_e3, counts_e4, counts_e5, counts_e6plus,
    sum_e2, sum_e3, sum_e4, sum_e5, sum_e6plus,
    num_rock_bursts, h_bump_energy,
]
ts_to_agg_combined = [*ts_to_agg, *ts_to_agg_diff]


def _flatten(groups):
  'Concatenate a list of column-name lists into one flat list.'
  flat = []
  for group in groups:
    flat.extend(group)
  return flat


flattened_ts_to_agg = _flatten(ts_to_agg)
flattened_ts_to_agg_diff = _flatten(ts_to_agg_diff)
flattened_ts_to_sum = _flatten(ts_to_sum)

In [None]:
#dropping the non-required column
#rebinding (instead of inplace=True) together with errors='ignore' keeps this
#cell safe to re-run: a second execution is a no-op rather than a KeyError
X_train = X_train.drop(columns='main_working_id', errors='ignore')
X_test = X_test.drop(columns='main_working_id', errors='ignore')

In [None]:
#Custom transformer: adds one row-wise total per time series group
class SumTimeSeries(BaseEstimator, TransformerMixin):
  '''
  Append a 'sum_<name>' column for every group listed in ts_to_sum.

  flattened_ts_to_sum: flat list of all column names; stored as a parameter,
    not read by transform.
  ts_to_sum: list of groups, each group being a list of column names.
  '''

  def __init__ (self, flattened_ts_to_sum, ts_to_sum):
    self.ts_to_sum = ts_to_sum
    self.flattened_ts_to_sum = flattened_ts_to_sum

  def fit(self, X, y=None):
    # stateless: nothing is learned from the data
    return self

  def transform(self, X):
    '''Return a copy of X with one extra total column per group.'''
    result = X.copy()
    for group in self.ts_to_sum:
      # the new name is the group's first column name minus its last two characters
      stem = group[0][: -2]
      row_totals = result[group].sum(axis=1)
      result['sum_' + stem] = row_totals
    return result

In [None]:
class TimeSeriesStatistics(BaseEstimator, TransformerMixin):
    """Append summary statistics for groups of time series columns.

    Parameters
    ----------
    flattened_ts_to_agg, flattened_ts_to_agg_diff : list of str
        Flat lists of column names; stored as parameters, not read by
        ``transform``.
    ts_to_agg : list of list of str
        Column groups that receive mean / std / max, the mean / std of the
        last ``last_X_hours`` columns and the slope of a straight line fitted
        to those last columns.
    ts_to_agg_diff : list of list of str
        Column groups that receive the same statistics plus the mean and max
        of the absolute values.
    last_X_hours : int, default 5
        Number of trailing columns of each group used for the "recent" statistics.

    Attributes
    ----------
    transformed_column_names : pandas.Index or None
        Columns of the most recent ``transform`` output (None before the first call).
    """

    def __init__(self, flattened_ts_to_agg, flattened_ts_to_agg_diff, ts_to_agg, ts_to_agg_diff, last_X_hours=5):
        self.flattened_ts_to_agg = flattened_ts_to_agg
        self.flattened_ts_to_agg_diff = flattened_ts_to_agg_diff
        self.ts_to_agg = ts_to_agg
        self.ts_to_agg_diff = ts_to_agg_diff
        self.last_X_hours = last_X_hours
        self.transformed_column_names = None

    def fit(self, X, y=None):
        """Stateless: nothing is learned from the data."""
        return self

    def ts_compute_stats(self, X, ts, abs_, last_X_hours):
        """Add the statistics of one column group to ``X`` in place.

        X : DataFrame that is modified (new columns are appended).
        ts : column names of the group, in time order.
        abs_ : when true, also add statistics of the absolute values.
        last_X_hours : number of trailing columns used for the recent statistics.
        """
        # New column names reuse the first column name minus its last two characters.
        base = ts[0][:-2]
        series = X[ts]

        # whole-window statistics:
        X['avg_' + base] = series.mean(axis=1)
        X['std_' + base] = series.std(axis=1)
        X['max_' + base] = series.max(axis=1)
        if abs_:
            magnitudes = series.abs()
            X['abs_avg_' + base] = magnitudes.mean(axis=1)
            X['max_abs_' + base] = magnitudes.max(axis=1)

        # last X hours statistics:
        recent = X[ts[-last_X_hours:]]
        X[f'avg_{last_X_hours}h_' + base] = recent.mean(axis=1)
        X[f'std_{last_X_hours}h_' + base] = recent.std(axis=1)

        # Slope of a least-squares line through the recent values of each row.
        # Working on a plain float array avoids building a Series per row and
        # the label-vs-position ambiguity of indexing a Series with ``[0]``.
        recent_values = recent.to_numpy(dtype=np.float64)
        hours = np.arange(recent_values.shape[1])
        slopes = []
        for row in recent_values:
            if np.all(row == row[0]):
                # a constant row has zero slope; skip the fit
                slopes.append(0)
            else:
                slopes.append(np.polyfit(hours, row, 1, rcond=1e-8)[0])
        X['slope_of_lr_' + base] = slopes

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the statistics columns appended."""
        X_copy = X.copy()

        for groups, with_abs in ((self.ts_to_agg, False), (self.ts_to_agg_diff, True)):
            for ts_list in groups:
                # Pick the frame's columns (in frame order) whose name starts
                # with any of the group's column names.
                prefixes = tuple(ts_list)
                ts = [col_name for col_name in X_copy.columns if col_name.startswith(prefixes)]
                self.ts_compute_stats(X_copy, ts, with_abs, self.last_X_hours)

        #column name update after time series aggregations
        self.transformed_column_names = X_copy.columns

        return X_copy

In [None]:
def cor_selector(X, y, num_feats: int):
  '''
  Rank features by the absolute Pearson correlation with the target and
  keep the num_feats strongest ones.

  X: DataFrame of numeric features.
  y: target values, one per row of X.
  num_feats: number of features to keep; values <= 0 keep nothing.

  Returns (cor_support, cor_feature): a list of booleans aligned with
  X.columns and the list of kept feature names ordered by increasing
  absolute correlation.
  '''
  features = X.columns.tolist()
  # A constant column has zero variance, so its correlation is NaN (0/0);
  # silence the floating point warning and treat it as "no correlation" below.
  with np.errstate(divide='ignore', invalid='ignore'):
    cor_list = [np.corrcoef(X[feature], y)[0, 1] for feature in features]
  #Nan -> 0
  cor_list = [0 if np.isnan(cor) else cor for cor in cor_list]

  # Guard the slice: [-0:] would select every feature instead of none.
  if num_feats > 0:
    top_positions = np.argsort(np.abs(cor_list))[-num_feats:]
  else:
    top_positions = []
  cor_feature = [features[position] for position in top_positions]

  #feature selection
  selected = set(cor_feature)
  cor_support = [feature in selected for feature in features]

  return cor_support, cor_feature


In [None]:
def RFE_selector(X, y, num_feats):
  '''
  Select num_feats features by recursive feature elimination around a
  logistic regression, discarding 10 features per elimination round.

  Returns (rfe_support, rfe_feature): the boolean mask over X's columns and
  the names of the columns that were kept.
  '''
  eliminator = RFE(LogisticRegression(), n_features_to_select=num_feats, step=10)
  eliminator.fit(X, y)

  rfe_support = eliminator.get_support()
  rfe_feature = X.columns[rfe_support].tolist()
  return rfe_support, rfe_feature

In [None]:
def Lasso_selector(X, y, num_feats):
  '''
  Select at most num_feats features with SelectFromModel wrapped around an
  L1-penalised (lasso-style) logistic regression.

  The L1 penalty drives the coefficients of uninformative features to zero,
  which is what makes the coefficients usable for selection. The 'liblinear'
  solver is set explicitly because the default solver does not support the
  L1 penalty.

  Returns (lr_support, lr_feature): the boolean mask over X's columns and
  the names of the columns that were kept.
  '''

  lr_selector = SelectFromModel(
      LogisticRegression(penalty='l1', solver='liblinear'),
      max_features=num_feats)
  lr_selector.fit(X, y)
  lr_support = lr_selector.get_support()
  lr_feature = X.loc[:, lr_support].columns.tolist()

  return lr_support, lr_feature

In [None]:
def RanFor_selector(X, y, num_feats, random_state=42):
  '''
  Select at most num_feats features with SelectFromModel wrapped around a
  60-tree random forest, using the forest's feature importances.

  random_state: seed of the forest, so repeated runs select the same
    features; pass None for an unseeded forest.

  Returns (rf_support, rf_feature): the boolean mask over X's columns and
  the names of the columns that were kept.
  '''

  forest = RandomForestClassifier(n_estimators=60, random_state=random_state)
  rf_selector = SelectFromModel(forest, max_features=num_feats)
  rf_selector.fit(X, y)
  rf_support = rf_selector.get_support()
  rf_feature = X.loc[:, rf_support].columns.tolist()

  return rf_support, rf_feature

In [None]:
def LGBMC_optimizer(X, y, num_feats, selector=False):
  '''
  Randomised hyperparameter search for the LGBMClassifier used in
  SelectFromModel feature selection.

  Candidates are scored by ROC AUC on a seeded, shuffled 3-fold stratified
  split. num_feats and selector are accepted for call compatibility but are
  not used by the search.

  Returns (best_params, best_estimator) of the search.

  NOTE(review): the notebook defines this function again in a later cell;
  the later definition is the one in effect after Run All.
  '''

  # num_leaves starts at 2: LightGBM requires more than one leaf, so a
  # candidate with num_leaves=1 can only produce a failed fit.
  params = {
      'max_depth': [-1, 0, 1, 5, 20],
      'num_leaves': [2, 16, 32],
      'reg_lambda': [0, 1, 5, 10, 25, 100],
      'learning_rate': [0.01, 0.05, 0.1, 1],
      'min_gain_to_split': [0, 0.1, 1],
      'min_child_weight': [1e-3, 1e-1, 0, 10, 40],
       }

  kfold = StratifiedKFold(3, shuffle=True, random_state=42)
  # seed the sampler as well, so the same candidates are drawn on every run
  optimizer = RandomizedSearchCV(estimator=LGBMClassifier(),
                                 param_distributions=params,
                                 scoring='roc_auc',
                                 cv=kfold,
                                 random_state=42)
  optimizer.fit(X, y)

  return optimizer.best_params_, optimizer.best_estimator_


In [None]:
def LGBMC_selector(X, y, num_feats, optimization=False):
  '''
  Select at most num_feats features with SelectFromModel wrapped around an
  LGBMClassifier.

  optimization=False uses a default LGBMClassifier; optimization=True takes
  the best estimator returned by LGBMC_optimizer instead.

  Returns (lgbc_support, lgbc_feature): the boolean mask over X's columns
  and the names of the columns that were kept.

  NOTE(review): the notebook defines this function again in a later cell.
  '''
  if not optimization:
    estimator = LGBMClassifier()
  else:
    # only the tuned estimator is needed; the parameter dict is discarded
    _, estimator = LGBMC_optimizer(X, y, num_feats, selector=True)

  picker = SelectFromModel(estimator, max_features=num_feats)
  picker.fit(X, y)

  lgbc_support = picker.get_support()
  lgbc_feature = X.columns[lgbc_support].tolist()
  return lgbc_support, lgbc_feature

In [None]:
#third transformer designed for feature selection: a vote between Pearson
#correlation, RFE (logistic regression) and SelectFromModel (random forest
#and LightGBM)
class FeatureSelectionTransformer(BaseEstimator, TransformerMixin):
  '''
  Keep the features chosen by at least two of the four selectors.

  num_feats: number of features each individual selector may pick.
  optimization: forwarded to LGBMC_selector; when true the LightGBM
    selector is tuned with LGBMC_optimizer first.

  After fit, imp_features holds the names of the kept features, ordered by
  descending vote count and then descending name.
  '''

  def __init__(self, num_feats=50, optimization=False):
    self.num_feats = num_feats
    self.optimization = optimization

  def fit(self, X, y):
    '''Run the four selectors on (X, y) and store the winning features.'''
    cor_support, _ = cor_selector(X, y, self.num_feats)
    rfe_support, _ = RFE_selector(X, y, self.num_feats)
    rf_support, _ = RanFor_selector(X, y, self.num_feats)
    lgbc_support, _ = LGBMC_selector(
        X, y, self.num_feats, optimization=self.optimization)

    feature_selection_df = pd.DataFrame({
        'Feature': list(X.columns),
        'Pearson': cor_support,
        'RFE': rfe_support,
        'Random_forest': rf_support,
        'LightGBM': lgbc_support
        })

    # Count votes over the boolean columns only; summing the whole row would
    # also include the string 'Feature' column.
    vote_columns = ['Pearson', 'RFE', 'Random_forest', 'LightGBM']
    feature_selection_df['Total'] = feature_selection_df[vote_columns].sum(axis=1)
    feature_selection_df = feature_selection_df.sort_values(
        ['Total', 'Feature'],
         ascending=False)
    self.imp_features = feature_selection_df.loc[
        feature_selection_df['Total'] > 1, 'Feature'
        ].values

    return self

  def transform(self, X, y=None):
    '''Return the columns of X selected during fit.'''
    return X[self.imp_features]

In [None]:
def LGBMC_optimizer(X, y, num_feats, selector=False):
  '''
  Randomised hyperparameter search for the LGBMClassifier used in
  SelectFromModel feature selection.

  Candidates are scored by ROC AUC on a seeded, shuffled 3-fold stratified
  split. num_feats and selector are accepted for call compatibility but are
  not used by the search.

  Returns (best_params, best_estimator) of the search.

  NOTE(review): this cell re-defines a function that an earlier cell already
  defines; this later definition is the one in effect after Run All.
  Consider keeping a single copy.
  '''

  # num_leaves starts at 2: LightGBM requires more than one leaf, so a
  # candidate with num_leaves=1 can only produce a failed fit.
  params = {
      'max_depth': [-1, 0, 1, 5, 20],
      'num_leaves': [2, 16, 32],
      'reg_lambda': [0, 1, 5, 10, 25, 100],
      'learning_rate': [0.01, 0.05, 0.1, 1],
      'min_gain_to_split': [0, 0.1, 1],
      'min_child_weight': [1e-3, 1e-1, 0, 10, 40],
       }

  kfold = StratifiedKFold(3, shuffle=True, random_state=42)
  # seed the sampler as well, so the same candidates are drawn on every run
  optimizer = RandomizedSearchCV(estimator=LGBMClassifier(),
                                 param_distributions=params,
                                 scoring='roc_auc',
                                 cv=kfold,
                                 random_state=42)
  optimizer.fit(X, y)

  return optimizer.best_params_, optimizer.best_estimator_


In [None]:
def LGBMC_selector(X, y, num_feats, optimization=False):
  '''
  Select at most num_feats features with SelectFromModel wrapped around an
  LGBMClassifier.

  optimization=False uses a default LGBMClassifier; optimization=True takes
  the best estimator returned by LGBMC_optimizer instead.

  Returns (lgbc_support, lgbc_feature): the boolean mask over X's columns
  and the names of the columns that were kept.

  NOTE(review): this cell re-defines a function that an earlier cell already
  defines; consider keeping a single copy.
  '''
  if not optimization:
    estimator = LGBMClassifier()
  else:
    # only the tuned estimator is needed; the parameter dict is discarded
    _, estimator = LGBMC_optimizer(X, y, num_feats, selector=True)

  picker = SelectFromModel(estimator, max_features=num_feats)
  picker.fit(X, y)

  lgbc_support = picker.get_support()
  lgbc_feature = X.columns[lgbc_support].tolist()
  return lgbc_support, lgbc_feature

In [None]:
class ArrayToDataFrame(BaseEstimator, TransformerMixin):
    """Wrap the incoming data in a DataFrame whose columns are ``column_names``."""

    def __init__(self, column_names):
        self.column_names = column_names

    def fit(self, X, y=None):
        # stateless: nothing is learned from the data
        return self

    def transform(self, X, y=None):
        labelled = pd.DataFrame(data=X, columns=self.column_names)
        return labelled

In [None]:
#columns handled as categorical variables
cat_features = [
    'latest_seismic_assessment',
    'latest_seismoacoustic_assessment',
    'latest_comprehensive_assessment',
    'latest_hazards_assessment',
]

#1st transformer - ordinal encoding of the categorical variables
cat_trf = Pipeline([('cat_encode', OrdinalEncoder())])

#2nd transformer - standard scaling
scaler_trf = Pipeline([('scaler', StandardScaler())])

In [None]:
#feature-engineering transformers
sum_ts_trf = SumTimeSeries(
    flattened_ts_to_sum=flattened_ts_to_sum,
    ts_to_sum=ts_to_sum,
)
ts_stats_trf = TimeSeriesStatistics(
    flattened_ts_to_agg=flattened_ts_to_agg,
    flattened_ts_to_agg_diff=flattened_ts_to_agg_diff,
    ts_to_agg=ts_to_agg,
    ts_to_agg_diff=ts_to_agg_diff,
    last_X_hours=5,
)

#run both transformers once on the training data to learn which columns they produce
X_train_transformed_sum_ts = sum_ts_trf.fit_transform(X_train)
X_train_transformed_ts_stats = ts_stats_trf.fit_transform(X_train_transformed_sum_ts)

#every produced column that is not categorical is treated as numeric
num_features = list(filter(
    lambda col: col not in cat_features,
    X_train_transformed_ts_stats.columns,
))

In [None]:
#data preprocessing using all transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('ts_sum_trf', sum_ts_trf, flattened_ts_to_sum),
        ('ts_stats_trf', ts_stats_trf, flattened_ts_to_agg + flattened_ts_to_agg_diff),
        ('cat_enc_trf', cat_trf, cat_features),
        ('scaler', scaler_trf, num_features)
    ], remainder='passthrough')

In [None]:
#class balance
sm = SMOTE(sampling_strategy='auto', k_neighbors=8, random_state=42)

In [None]:
selector_trf = ColumnTransformer([
        ('selector', FeatureSelectionTransformer(optimization=True), slice(None))   
])

In [None]:
#model LigtGBM
model = LGBMClassifier(
    num_leaves=1500, num_iterations=80, 
    min_data_in_leaf=750, max_depth=22,
    learning_rate=0.03, lambda_l2=10,
    lambda_l1=1, boosting_type='gbdt'
)

In [None]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', sm),
    ('to_df', ArrayToDataFrame(X_train_transformed_ts_stats.columns)),
    ('selector', selector_trf),
    ('model', model)
])

In [None]:
#fit all pipeline steps, in order, on the training features and encoded labels
pipeline.fit(X_train, y_train)

In [None]:
#show estimators as diagrams when they are displayed in the notebook
set_config(display='diagram')