# Data Processing

In [None]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Silence NumPy's divide-by-zero warnings; the np.log-based features derived
# further down hit log(0) and replace the resulting infinities with 0.
np.seterr(divide = 'ignore') 

In [None]:
# Mount Google Drive at /content/drive; every path read or written by this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Single root for the project files on the mounted Drive, so the location only
# has to be changed in one place.
DATA_DIR = '/content/drive/MyDrive/Data Science Pro/Projekt końcowy'

# Train / test inputs (X) and targets (y).
data_train_X = pd.read_csv(f'{DATA_DIR}/input_data_train_X')
data_train_y = pd.read_csv(f'{DATA_DIR}/input_data_train_y')
data_test_X = pd.read_csv(f'{DATA_DIR}/input_data_test_X')
data_test_y = pd.read_csv(f'{DATA_DIR}/input_data_test_y')
# Additional per-working-site metadata (source of height and hazard assessment).
working_site_data = pd.read_csv(f'{DATA_DIR}/DATA/working_site_metadata.csv')

In [None]:
# Quick look at the (rows, columns) size of the raw training features.
data_train_X.shape

## This is the third version of the dataset processing.

New series were created:
* log_max_avg_diff_genergy = log(max_genergy - avg_genergy)
* log_max_avg_diff_diff_genergy = log(max_difference_in_genergy - avg_difference_in_genergy)
* log_max_avg_diff_gactivity = log(max_gactivity - avg_gactivity)
* log_max_avg_diff_diff_gactivity = log(max_difference_in_gactivity - avg_difference_in_gactivity)
* log_ave_genergy = log(sum / count)
---
The main idea was to extract features like:
* minimum
* maximum
* std
* quantiles: 0.25, 0.5, 0.75
* information if there is non-zero value in the test series (True/ False),
* number of positive values in a series,
* hours from last non-zero measurement,
* number of times a series increased in comparison to the previous hour's recording,
These statistics were computed over the last 4, 8 and 24 hours.

The above extraction methodology was applied to the following time series:
* count_e, 
* sum_e, 
* number_of_rock_bursts
* number_of_destressing_blasts
* avg_gactivity
* avg_genergy
----
For series:
* avg_difference_in_gactivity 
* avg_difference_in_genergy 

the following were computed:
* max abs value
* number of positive values in a series,

over the last 4, 8 and 24 hours.
---
Categorical variables:
* latest_seismic_assessment
* latest_seismoacoustic_assessment
* latest_comprehensive_assessment
* latest_hazards_assessment
* mining_hazard_assessment (from the additional working_site_metadata dataset)

were encoded with one-hot encoding.
----
---------------------------------------------
From the extra dataset, the following features were used:
* mining_hazard_assessment (as mentioned above)
* main_working_height 

In [None]:
def ts_col_names(first_ts_col, last_ts_col):
  """Return, as a list, the names of the ``data_train_X`` columns located at
  positions ``first_ts_col`` (inclusive) to ``last_ts_col`` (exclusive)."""
  selected = data_train_X.columns[first_ts_col:last_ts_col]
  return list(selected)

In [None]:
# Every series occupies 24 consecutive columns; the first series starts at
# column 13 and the others follow back to back.
_FIRST_TS_COL = 13
_SERIES_WIDTH = 24

def _nth_series_cols(n):
  """Column names of the n-th (0-based) 24-column series block."""
  start = _FIRST_TS_COL + n * _SERIES_WIDTH
  return ts_col_names(start, start + _SERIES_WIDTH)

#count_eX
counts_e2 = _nth_series_cols(0)
counts_e3 = _nth_series_cols(1)
counts_e4 = _nth_series_cols(2)
counts_e5 = _nth_series_cols(3)
counts_e6plus = _nth_series_cols(4)
#sum_eX
sum_e2 = _nth_series_cols(5)
sum_e3 = _nth_series_cols(6)
sum_e4 = _nth_series_cols(7)
sum_e5 = _nth_series_cols(8)
sum_e6plus = _nth_series_cols(9)
#total_number_of_bumps
total_number_of_bumps = _nth_series_cols(10)
#number_of_rock_bursts
num_rock_bursts = _nth_series_cols(11)
#number_of_destressing_blasts
num_des_blasts = _nth_series_cols(12)
#highest_bump_energy
h_bump_energy = _nth_series_cols(13)
#other time series
max_gactivities = _nth_series_cols(14)
max_genergies = _nth_series_cols(15)
avg_gactivities = _nth_series_cols(16)
avg_genergies = _nth_series_cols(17)
max_diff_gactivities = _nth_series_cols(18)
max_diff_genergies = _nth_series_cols(19)
avg_diff_gactivities = _nth_series_cols(20)
avg_diff_genergies = _nth_series_cols(21)

In [None]:
# Raw series whose original columns are dropped from both frames in the
# "Removing features" section (they are flattened into ts_to_remove there).
ts_to_agg = [counts_e2, counts_e3, counts_e4, counts_e5, counts_e6plus, 
             sum_e2, sum_e3, sum_e4, sum_e5, sum_e6plus,
             num_rock_bursts, num_des_blasts,
             max_gactivities, max_genergies, avg_gactivities, avg_genergies,
             max_diff_gactivities, max_diff_genergies, 
             avg_diff_gactivities, avg_diff_genergies]

# New series deriving

## log_max_avg series
* log_max_avg_diff_genergy = log(max_genergy - avg_genergy)
* log_max_avg_diff_diff_genergy = log(max_difference_in_genergy - avg_difference_in_genergy)
* log_max_avg_diff_gactivity = log(max_gactivity - avg_gactivity)
* log_max_avg_diff_diff_gactivity = log(max_difference_in_gactivity - avg_difference_in_gactivity)

In [None]:
def log_max_avg(df, max_feature, avg_feature, save_col_names=0):
  """Add ``log(max - avg)`` columns to ``df`` for paired max/avg series.

  For every position ``i`` a column named
  ``'log__<max name>-<avg name>_<i+1>'`` is written into ``df``, where the
  names are the source column names without their last two characters.
  Infinite results (``log(0)``) are replaced with 0; NaN results (``log`` of
  a negative difference) are left as they are.

  Args:
    df: DataFrame holding the source columns; it is modified in place.
    max_feature: list of column names of the "max" series.
    avg_feature: list of column names of the "avg" series, in the same order
      as ``max_feature``.
    save_col_names: if truthy, the created column names are collected.

  Returns:
    ``(df, new_col_names)``; ``new_col_names`` stays empty unless
    ``save_col_names`` is truthy.
  """
  new_col_names = []

  for i, max_col in enumerate(max_feature):
    avg_col = avg_feature[i]
    col_name = f'log__{max_col[:-2]}-{avg_col[:-2]}_{i + 1}'

    # log(0) is expected here and handled right below, so keep it quiet.
    with np.errstate(divide='ignore'):
      log_diff = np.log(df[max_col] - df[avg_col])

    # Assign the cleaned Series instead of calling replace(inplace=True) on
    # df[col_name]: the chained in-place form works on a temporary object and
    # does not update df when pandas Copy-on-Write is active.
    df[col_name] = log_diff.replace([np.inf, -np.inf], 0)

    if save_col_names:
      new_col_names.append(col_name)

  return df, new_col_names

In [None]:
#new series in train dataset; the created column names are kept because the
#statistics step later works on them
data_train_X, log_max_avg_genergies = log_max_avg(
    data_train_X, max_genergies, avg_genergies, save_col_names=1)
data_train_X, log_diff_max_avg_genergies = log_max_avg(
    data_train_X, max_diff_genergies, avg_diff_genergies, save_col_names=1)
data_train_X, log_max_avg_gactivities = log_max_avg(
    data_train_X, max_gactivities, avg_gactivities, save_col_names=1)
data_train_X, log_diff_max_avg_gactivities = log_max_avg(
    data_train_X, max_diff_gactivities, avg_diff_gactivities, save_col_names=1)

In [None]:
#same new series in test dataset. log_max_avg modifies the frame in place and
#the column names were already collected from the train frame, so the return
#value is not needed; looping also keeps the cell from displaying the whole
#(DataFrame, []) tuple of the last call as its output.
for max_cols, avg_cols in [
    (max_genergies, avg_genergies),
    (max_diff_genergies, avg_diff_genergies),
    (max_gactivities, avg_gactivities),
    (max_diff_gactivities, avg_diff_gactivities),
]:
  log_max_avg(data_test_X, max_cols, avg_cols)

## log_ave_genergy = log(sum / count)

In [None]:
def log_sum_count(df, sum_feature, count_feature, save_col_names=0):
  """Add ``log(sum / count)`` columns to ``df`` for paired sum/count series.

  For every position ``i`` a column named
  ``'log__<sum name>/<count name>_<i+1>'`` is written into ``df``, where the
  names are the source column names without their last two characters.
  A NaN ratio (e.g. 0 / 0) is treated as 0 before the log, and infinite
  results (``log(0)`` or ``log(inf)``) are replaced with 0.

  Args:
    df: DataFrame holding the source columns; it is modified in place.
    sum_feature: list of column names of the "sum" series (numerators).
    count_feature: list of column names of the "count" series
      (denominators), in the same order as ``sum_feature``.
    save_col_names: if truthy, the created column names are collected.

  Returns:
    ``(df, new_col_names)``; ``new_col_names`` stays empty unless
    ``save_col_names`` is truthy.
  """
  new_col_names = []

  for i, sum_col in enumerate(sum_feature):
    count_col = count_feature[i]
    col_name = f'log__{sum_col[:-2]}/{count_col[:-2]}_{i + 1}'

    ratio = df[sum_col].div(df[count_col]).fillna(0)
    # log(0) is expected here and handled right below, so keep it quiet.
    with np.errstate(divide='ignore'):
      log_ratio = np.log(ratio)

    # Assign the cleaned Series instead of calling replace(inplace=True) on
    # df[col_name]: the chained in-place form works on a temporary object and
    # does not update df when pandas Copy-on-Write is active.
    df[col_name] = log_ratio.replace([-np.inf, np.inf], 0)

    if save_col_names:
      new_col_names.append(col_name)

  return df, new_col_names

In [None]:
#new series in train dataset
data_train_X, sum_count_cols_e2 = log_sum_count(data_train_X, counts_e2, sum_e2, 1)
data_train_X, sum_count_cols_e3 = log_sum_count(data_train_X, counts_e3, sum_e3, 1)
data_train_X, sum_count_cols_e4 = log_sum_count(data_train_X, counts_e4, sum_e4, 1)
data_train_X, sum_count_cols_e5 = log_sum_count(data_train_X, counts_e5, sum_e5, 1)
data_train_X, sum_count_cols_e6plus = log_sum_count(data_train_X, counts_e6plus, sum_e6plus, 1)

In [None]:
#same new series in test dataset
log_sum_count(data_test_X, counts_e2, sum_e2)
log_sum_count(data_test_X, counts_e3, sum_e3)
log_sum_count(data_test_X, counts_e4, sum_e4)
log_sum_count(data_test_X, counts_e5, sum_e5)
log_sum_count(data_test_X, counts_e6plus, sum_e6plus)

# Statistics computing

* minimum
* maximum
* std
* quantiles: 0.25, 0.5, 0.75
* information if there is non-zero value in the test series (True/ False),
* number of positive values in a series,
* hours from last non-zero measurement,
* number of times a series increased in comparison to the previous hour's recording, 



In [None]:
def last_nonzero_idx(x):
  """Return how many entries follow the last non-zero entry of ``x``.

  The result is 0 when the final entry is non-zero. When ``x`` holds no
  non-zero entry at all, ``len(x) + 1`` is returned instead.
  """
  nonzero_mask = x != 0
  if not any(nonzero_mask):
    return len(x) + 1
  last_position = np.where(nonzero_mask)[0][-1]
  return len(x) - last_position - 1

In [None]:
def num_times_increased(feature):
  """Count how many entries of ``feature`` are greater than their predecessor.

  ``feature`` may be any one-dimensional sequence (list, NumPy array or a
  pandas Series such as a DataFrame row). The values are compared by
  position, so a label-indexed Series does not depend on integer keys being
  interpreted positionally by ``Series.__getitem__``.

  Returns:
    The number of increases as a plain ``int`` (0 for fewer than two entries).
  """
  values = np.asarray(feature)
  # values[1:] > values[:-1] is the element-wise "greater than previous" test;
  # comparisons involving NaN are False, as with the subtraction-based check.
  return int(np.count_nonzero(values[1:] > values[:-1]))

In [None]:
# ts_1: series passed to ts_compute_stats with abs=False in the train cell
# (they get the "hours from last non-zero" statistic).
ts_1 = [sum_count_cols_e2, sum_count_cols_e3, sum_count_cols_e4, sum_count_cols_e5,
        sum_count_cols_e6plus, num_rock_bursts, h_bump_energy]
# ts_2: the derived log(max - avg) series, passed with abs=True in the train
# cell (they get the "max abs" statistic).
ts_2 = [log_max_avg_genergies, log_diff_max_avg_genergies, 
        log_max_avg_gactivities, log_diff_max_avg_gactivities] 

In [None]:
def ts_compute_stats(df, ts, abs, last_X_hours):
    """Append row-wise window statistics of one time series to ``df``.

    ``ts`` is the ordered list of column names of a single series. For every
    ``hour`` in ``last_X_hours`` the statistics are computed over the last
    ``hour`` columns of ``ts`` and stored as ``'<hour>_<stat>_<base>'``, where
    ``<base>`` is ``ts[0]`` without its last two characters.

    Always added: ``min``, ``max``, ``std``, the quantiles ``q0.25`` /
    ``q0.5`` / ``q0.75``, ``num_incr`` (number of columns whose value is
    greater than the previous column's) and ``num_pos`` (number of strictly
    positive values).
    Added when ``abs`` is truthy: ``max_abs`` (largest absolute value).
    Added otherwise: ``h_from_l_nonzero`` (number of columns after the last
    non-zero value, or window length + 1 if the whole window is zero).

    Args:
      df: DataFrame holding the ``ts`` columns; it is modified in place.
      ts: list of column names of the series, oldest first.
      abs: selects between ``max_abs`` and ``h_from_l_nonzero`` (see above).
      last_X_hours: iterable of window lengths (numbers of trailing columns).

    Returns:
      The same ``df`` object.
    """
    col_name = ts[0][:-2]

    for hour in last_X_hours:
        # Select the window once and reuse it for every statistic.
        window = df[ts[-hour:]]

        df[f'{hour}_min_' + col_name] = window.min(axis=1)
        df[f'{hour}_max_' + col_name] = window.max(axis=1)
        df[f'{hour}_std_' + col_name] = window.std(axis=1)
        for q in (0.25, 0.5, 0.75):
            df[f'{hour}_q{q}_' + col_name] = window.quantile(q=q, axis=1)

        # Number of times the series increased compared with the previous
        # column: a positive column-to-column difference. The first column
        # has no predecessor (difference NaN) and is never counted.
        df[f'{hour}_num_incr_' + col_name] = window.diff(axis=1).gt(0).sum(axis=1)

        # Number of positive values in the window. The series name has to be
        # part of the column name; without it every series writes to the same
        # '<hour>_num_pos_' column and only the last one survives.
        df[f'{hour}_num_pos_' + col_name] = window.gt(0).sum(axis=1)

        if abs:
            df[f'{hour}_max_abs_' + col_name] = window.abs().max(axis=1)
        else:
            # Hours from the last non-zero observation: position of the first
            # non-zero value when the window is read backwards.
            nonzero = window.ne(0).to_numpy()
            df[f'{hour}_h_from_l_nonzero_' + col_name] = np.where(
                nonzero.any(axis=1),
                nonzero[:, ::-1].argmax(axis=1),
                nonzero.shape[1] + 1,
            )

    return df

In [None]:
# Window lengths (number of trailing columns) used for every statistic.
hours = [4, 8, 24]
# Computing stats for X_train for the series that do NOT need the ABS
# calculation (abs=False).
for ts in ts_1:
  ts_compute_stats(data_train_X, ts, False, hours)

In [None]:
# Computing stats for X_train for the series that need the ABS calculation
# (abs=True).
for ts in ts_2:
  ts_compute_stats(data_train_X, ts, True, hours)

In [None]:
# Computing stats for X_test for the series that do NOT need the ABS
# calculation (abs=False).
for ts in ts_1:
  ts_compute_stats(data_test_X, ts, False, hours)

In [None]:
# Computing stats for X_test for the series that need the ABS calculation.
# abs must be True, exactly as for X_train: with False the test frame gets
# 'h_from_l_nonzero' columns instead of 'max_abs', and the later reindex to
# the train columns would fill every 'max_abs' feature of the test set with 0.
for ts in ts_2:
  ts_compute_stats(data_test_X, ts, True, hours)

## Two features from working_site_metadata added to the base dataframe

In [None]:
def find_height(x, dictionary):
    """Return the first element of ``dictionary[x]``, or ``None`` if ``x`` is
    not a key.

    Uses a direct dictionary lookup instead of comparing ``x`` with every key,
    so the cost per call does not grow with the size of ``dictionary``.
    """
    entry = dictionary.get(x)
    if entry is None:
        return None
    return entry[0]

In [None]:
def find_h_assessment(x, dictionary):
    """Return the second element of ``dictionary[x]``, or ``None`` if ``x`` is
    not a key.

    Uses a direct dictionary lookup instead of comparing ``x`` with every key,
    so the cost per call does not grow with the size of ``dictionary``.
    """
    entry = dictionary.get(x)
    if entry is None:
        return None
    return entry[1]

In [None]:
# main_working_id -> (main_working_height, mining_hazard_assessment)
dict_id = {
    site['main_working_id']: (site['main_working_height'], site['mining_hazard_assessment'])
    for _, site in working_site_data.iterrows()
}

In [None]:
# Working-site height of every row, looked up by its main_working_id.
data_train_X['height'] = data_train_X['main_working_id'].apply(find_height, args=(dict_id,))
data_test_X['height'] = data_test_X['main_working_id'].apply(find_height, args=(dict_id,))

In [None]:
# Mining hazard assessment of every row, looked up by its main_working_id.
data_train_X['mining_hazard_assessment'] = data_train_X['main_working_id'].apply(
    find_h_assessment, args=(dict_id,))
data_test_X['mining_hazard_assessment'] = data_test_X['main_working_id'].apply(
    find_h_assessment, args=(dict_id,))

# Categorical variables

latest_seismoacoustic_assessment 

In [None]:
# Category frequencies of the seismoacoustic assessment in the train frame.
data_train_X['latest_seismoacoustic_assessment'].value_counts()

In [None]:
#'d' category contains only 48 instances. What's even more important, in the test data there is no category 'd'.
#'d' category will be replaced with 'c'.
# The result is assigned back to the column: replace(..., inplace=True) on
# data_train_X[...] is a chained in-place call that acts on a temporary object
# and leaves the frame unchanged when pandas Copy-on-Write is active.
data_train_X['latest_seismoacoustic_assessment'] = (
    data_train_X['latest_seismoacoustic_assessment'].replace('d', 'c')
)

In [None]:
# One-hot encode the train frame; drop_first=True removes the first level of
# every encoded column.
data_train_X = pd.get_dummies(data_train_X, drop_first=True)

In [None]:
# Encode the test frame WITHOUT drop_first. Which level drop_first removes
# depends on the levels present in the frame being encoded, so an independent
# drop_first on the test frame can drop a different level than on the train
# frame. Keeping every level here and letting the reindex to the train columns
# (next cell) discard the surplus ones guarantees the train encoding is used.
data_test_X = pd.get_dummies(data_test_X)

In [None]:
# Give the test frame exactly the train columns, in the same order; columns
# missing from the test frame are created and filled with 0.
data_test_X = data_test_X.reindex(columns = data_train_X.columns, fill_value=0)

# Removing features

In [None]:
import itertools

In [None]:
#removing features that were used for the time series transformation:
#flatten the list of per-series column lists into one list of column names
ts_to_remove = [column for series_cols in ts_to_agg for column in series_cols]

In [None]:
# Drop the raw series columns from both frames.
data_train_X = data_train_X.drop(columns=ts_to_remove)
data_test_X = data_test_X.drop(columns=ts_to_remove)

In [None]:
#main_working_id will be dropped (it was only needed for the metadata lookup)
data_train_X = data_train_X.drop(columns=['main_working_id'])
data_test_X = data_test_X.drop(columns=['main_working_id'])

In [None]:
#total_number_of_bumps will be dropped as well: it carries exactly the same
#information as the summed counts of bumps (count_e*), i.e. the values summed
#over all detectors
data_train_X = data_train_X.drop(columns=total_number_of_bumps)
data_test_X = data_test_X.drop(columns=total_number_of_bumps)

# New dataset shape

In [None]:
# Final (rows, columns) of the processed train and test frames.
data_train_X.shape, data_test_X.shape

In [None]:
#storing the processed frames (without the index column)
for frame, destination in (
    (data_train_X, '/content/drive/MyDrive/Data Science Pro/Projekt końcowy/EXTRACTED DATA/X_train3'),
    (data_test_X, '/content/drive/MyDrive/Data Science Pro/Projekt końcowy/EXTRACTED DATA/X_test3'),
):
  frame.to_csv(destination, index=False)
