# Data Processing

In [None]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import random

In [None]:
# Mount Google Drive so that the '/content/drive/...' paths used by the
# read/write cells of this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Project folder on the mounted Drive; change it here when the data moves.
DATA_DIR = '/content/drive/MyDrive/Data Science Pro/Projekt końcowy'

# Train / test inputs (the *_X and *_y files) as stored in the project folder.
data_train_X = pd.read_csv(DATA_DIR + '/input_data_train_X')
data_train_y = pd.read_csv(DATA_DIR + '/input_data_train_y')
data_test_X = pd.read_csv(DATA_DIR + '/input_data_test_X')
data_test_y = pd.read_csv(DATA_DIR + '/input_data_test_y')

# Working-site metadata; main_working_height and mining_hazard_assessment are
# looked up from it by main_working_id further down.
working_site_data = pd.read_csv(DATA_DIR + '/DATA/working_site_metadata.csv')

In [None]:
# Sanity check of the loaded training features: (number of rows, number of columns).
data_train_X.shape

# Data processing

This is the second version of the dataset processing (it produces the second version of the final dataset).


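The main idea was to extract features such as:
* minimum,
* maximum,
* standard deviation,
* whether there is any non-zero value in the time series (True/False),
* hours since the last non-zero measurement.

These statistics were computed over the last 2, 4, 8 and 24 hours of each series; the hours-since-last-non-zero feature is always computed over the full 24-hour series.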

The extraction described above was applied to the following time series:
* count_e, 
* sum_e, 
* number_of_rock_bursts
* number_of_destressing_blasts
* avg_gactivity
* avg_genergy

For the series:
* avg_difference_in_gactivity
* avg_difference_in_genergy

the maximum absolute value over the last 2 and 24 hours was computed.

Categorical variables:
* latest_seismic_assessment
* latest_seismoacoustic_assessment
* latest_comprehensive_assessment
* latest_hazards_assessment
* mining_hazard_assessment (from the additional working_site_metadata dataset)

were encoded with one-hot encoding.

---------------------------------------------
From the extra dataset, the following features were used:
* mining_hazard_assessment (as mentioned above)
* main_working_height 
---------------------------------------------
Time series describing maximum statistics were <b>dropped</b> from the dataset, as they are probably highly correlated with the average statistics:
* max_gactivity.X
* max_genergy.X
* max_difference_in_gactivity.X
* max_difference_in_genergy.X

In [None]:
def max_nonzero_index(row):
    """Return the position of the last non-zero element of ``row``.

    Parameters
    ----------
    row : array-like
        One-dimensional sequence of numbers (NumPy array, list, pandas
        Series, ...).

    Returns
    -------
    int
        Zero-based position of the last non-zero entry, or ``-1`` when every
        entry is zero or ``row`` is empty.
    """
    # Go through NumPy instead of calling ``row.nonzero()`` on the argument:
    # lists have no such method and pandas removed ``Series.nonzero`` in 1.0.
    non_zero_cols = np.asarray(row).nonzero()[0]

    if len(non_zero_cols) == 0:
        return -1
    return non_zero_cols[-1]

In [None]:
def _hours_since_last_nonzero(series_block, hours_per_series):
    """Per row: ``hours_per_series - (j + 1)`` for the last non-zero column ``j``
    of ``series_block``, or ``hours_per_series + 1`` when the row has no
    non-zero value."""
    is_nonzero = series_block.to_numpy() != 0
    width = is_nonzero.shape[1]
    if width == 0:
        return np.full(len(series_block), hours_per_series + 1)
    # argmax over the reversed mask finds the first True counted from the
    # right, i.e. how far the last non-zero column is from the end of the block.
    last_position = width - 1 - np.argmax(is_nonzero[:, ::-1], axis=1)
    return np.where(
        is_nonzero.any(axis=1),
        hours_per_series - (last_position + 1),
        hours_per_series + 1,
    )


def features_extracting(data, windows_size, coord1, coord2):
    """Build per-row summary features for consecutive 24-column time series.

    The column positions are cut with ``np.arange(coord1, coord2 + 1, 24)``;
    every pair of neighbouring cut points delimits one series. A trailing
    remainder shorter than 24 columns is ignored, so ``coord2`` must be the
    position right after the last series for that series to be processed.

    For every series and every ``window`` in ``windows_size`` the last
    ``window`` columns of the series yield:

    * ``<window>h_min_<base>``      row-wise minimum
    * ``<window>h_max_<base>``      row-wise maximum
    * ``<window>h_std_<base>``      row-wise standard deviation
    * ``<window>h_nonzero_<base>``  result of ``DataFrame.any(axis=1)``

    and ``last_non_0_<base>``, computed over the whole series: the number of
    columns after the last non-zero value (0 when the final column is
    non-zero) or 25 when the row contains no non-zero value.

    ``<base>`` is the label of the first column of the window with its last
    two characters removed.
    NOTE(review): if column suffixes differ in length between windows (for
    example ``.1`` versus ``.17``) the base names differ too, and the same
    ``last_non_0_`` values are stored under more than one name - confirm
    against the real column labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the series as blocks of 24 adjacent columns.
    windows_size : iterable of int
        Window lengths, counted in columns from the end of each series.
    coord1, coord2 : int
        Zero-based column positions of the first and the last cut point.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``data``; empty when no series was processed.
    """
    hours_per_series = 24
    coords = np.arange(coord1, coord2 + 1, hours_per_series)
    features = {}

    for series_start, series_end in zip(coords[:-1], coords[1:]):
        # Does not depend on the window: computed once per series, on first use.
        last_non_zero = None

        for window in windows_size:
            recent = data.iloc[:, series_end - window: series_end]
            base_col_name = recent.columns[0][:-2]
            prefix = str(window)

            features[prefix + 'h_min_' + base_col_name] = recent.min(axis=1)
            features[prefix + 'h_max_' + base_col_name] = recent.max(axis=1)
            features[prefix + 'h_std_' + base_col_name] = recent.std(axis=1)
            # is there a non-zero value in the window
            features[prefix + 'h_nonzero_' + base_col_name] = recent.any(axis=1)

            # how many hours from the last non-zero observation
            if last_non_zero is None:
                last_non_zero = pd.Series(
                    _hours_since_last_nonzero(
                        data.iloc[:, series_start:series_end], hours_per_series
                    ),
                    index=data.index,
                )
            features['last_non_0_' + base_col_name] = last_non_zero

    if not features:
        return pd.DataFrame()
    # Assemble in one go; inserting columns one by one fragments the frame.
    return pd.DataFrame(features)

In [None]:
def max_abs_value(data, windows_size, coord1, coord2):
    """Row-wise maximum absolute value over the tail of each 24-column series.

    Column positions are cut with ``np.arange(coord1, coord2 + 1, 24)``; each
    pair of neighbouring cut points is one series. For every series and every
    ``window`` in ``windows_size`` the result gets a column
    ``<window>h_max_abs_val<base>`` with the largest absolute value among the
    last ``window`` columns of that series, where ``<base>`` is the label of
    the first window column without its last two characters.

    Returns a DataFrame with one row per row of ``data``.
    """
    boundaries = np.arange(coord1, coord2 + 1, 24)
    result = pd.DataFrame()

    # Every boundary except the first one closes a series.
    for series_end in boundaries[1:]:
        for hours in windows_size:
            tail = data.iloc[:, series_end - hours: series_end]
            label = tail.columns[0][:-2]
            result[str(hours) + 'h_max_abs_val' + label] = tail.abs().max(axis=1)

    return result

In [None]:
# Arguments 3 and 4 are 0-based column positions in the raw frames.
# features_extracting cuts at np.arange(start, stop + 1, 24), so `stop` has to
# be the position right AFTER the last series.
#
# Ten series occupy columns 13..252 (the drop cell further down removes exactly
# these 240 columns, as 12:252 once main_working_id is gone). With stop=252 the
# cut points ended at 229 and the tenth series was silently skipped; 253 closes it.
# NOTE(review): assumes the 24-column series are contiguous from column 13 and
# that main_working_id sits before them - confirm against the input header.
df1_train = features_extracting(data_train_X, (24, 8, 4, 2), 13, 253)
df1_test = features_extracting(data_test_X, (24, 8, 4, 2), 13, 253)

# The two series removed later as columns 36:84 sit at 277..324 in the raw
# frame (same 13 + 24*k grid). Starting at 276 shifted both windows one column
# to the left, mixing in the last column of the preceding series.
df2_train = features_extracting(data_train_X, (24, 8, 4, 2), 277, 325)
df2_test = features_extracting(data_test_X, (24, 8, 4, 2), 277, 325)

In [None]:
# Maximum absolute value over the last 24 and last 2 columns of the two series
# at raw columns 397..444 (cut points 397, 421, 445), for train and test alike.
# NOTE(review): the notebook text says this feature is meant for the
# avg_difference_in_* series - confirm that columns 397..444 are those series.
df3_abs, df3_test_abs = (
    max_abs_value(raw_frame, (24, 2), 397, 445)
    for raw_frame in (data_train_X, data_test_X)
)

In [None]:
# Append the engineered columns to the right of the raw ones (column-wise
# concatenation, rows matched on the index).
df_train = pd.concat(
    [data_train_X, df1_train, df2_train, df3_abs],
    axis='columns',
)
df_test = pd.concat(
    [data_test_X, df1_test, df2_test, df3_test_abs],
    axis='columns',
)

Adding two features from the extra dataset: `main_working_height` (stored as `height`) and `mining_hazard_assessment`.

In [None]:
def find_height(x, dictionary):
    """Return the first element of the tuple stored under key ``x``.

    Parameters
    ----------
    x : hashable
        Key to look up (a ``main_working_id`` in this notebook).
    dictionary : dict
        Maps keys to ``(height, hazard_assessment)`` tuples.

    Returns
    -------
    object or None
        ``dictionary[x][0]``, or ``None`` when ``x`` is not a key.
    """
    # Direct hash lookup instead of scanning every key for each call.
    if x in dictionary:
        return dictionary[x][0]
    return None

In [None]:
def find_h_assessment(x, dictionary):
    """Return the second element of the tuple stored under key ``x``.

    Parameters
    ----------
    x : hashable
        Key to look up (a ``main_working_id`` in this notebook).
    dictionary : dict
        Maps keys to ``(height, hazard_assessment)`` tuples.

    Returns
    -------
    object or None
        ``dictionary[x][1]``, or ``None`` when ``x`` is not a key.
    """
    # Direct hash lookup instead of scanning every key for each call.
    if x in dictionary:
        return dictionary[x][1]
    return None

In [None]:
# main_working_id -> (main_working_height, mining_hazard_assessment), one entry
# per metadata row; a later row with the same id replaces the earlier one.
dict_id = {
    site['main_working_id']: (
        site['main_working_height'],
        site['mining_hazard_assessment'],
    )
    for _, site in working_site_data.iterrows()
}

In [None]:
# Attach the working-site height to every row through its main_working_id;
# ids that are missing from dict_id end up as None.
for frame in (df_train, df_test):
    frame['height'] = frame['main_working_id'].apply(
        lambda site_id: find_height(site_id, dict_id)
    )

In [None]:
# Attach the site's mining hazard assessment to every row through its
# main_working_id; ids that are missing from dict_id end up as None.
for frame in (df_train, df_test):
    frame['mining_hazard_assessment'] = frame['main_working_id'].apply(
        lambda site_id: find_h_assessment(site_id, dict_id)
    )

# Dropping features

In [None]:
# First: the site identifier from the general features. Removing it shifts
# every following column one position to the left.
for frame in (df_train, df_test):
    frame.drop(columns=['main_working_id'], inplace=True)

In [None]:
# Remove the raw time series used for aggregation (240 columns).
# Positions refer to the frames as left by the previous cell, so this cell
# must run exactly once and in order.
for frame in (df_train, df_test):
    frame.drop(columns=frame.columns[12:252], inplace=True)

In [None]:
# Remove the next raw time series used for aggregation (48 columns).
# Positions refer to the frames as left by the previous drop.
for frame in (df_train, df_test):
    frame.drop(columns=frame.columns[36:84], inplace=True)

In [None]:
# Drop the maximum statistics: max_gactivity and max_genergy (48 columns).
# Positions refer to the frames as left by the previous drop.
for frame in (df_train, df_test):
    frame.drop(columns=frame.columns[60:108], inplace=True)

In [None]:
# Drop max_difference_in_gactivity and max_difference_in_genergy (48 columns).
# Positions refer to the frames as left by the previous drop.
for frame in (df_train, df_test):
    frame.drop(columns=frame.columns[108:156], inplace=True)

In [None]:
# One-hot encode the categorical columns; the training frame defines the
# final column set (first level of every variable dropped as baseline).
df_train = pd.get_dummies(df_train, drop_first=True)

# Encode ALL levels on the test frame and let reindex pick the training
# columns. Dropping the first level here as well would remove whichever level
# happens to sort first in the test split; when that differs from the training
# baseline, reindex would re-create the column filled with 0 and mis-encode
# those rows as the baseline.
df_test = pd.get_dummies(df_test, drop_first=False)
# Levels unseen in training are discarded, levels absent from test become 0.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

In [None]:
# Store the processed feature sets (without the index column).
for frame, target_path in (
    (df_train, '/content/drive/MyDrive/Data Science Pro/Projekt końcowy/EXTRACTED DATA/X_train_v2'),
    (df_test, '/content/drive/MyDrive/Data Science Pro/Projekt końcowy/EXTRACTED DATA/X_test_v2'),
):
    frame.to_csv(target_path, index=False)