In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from helpers import create_cut_and_dummies, get_splits, scatter_plot, random_seed, target_feature


# Seed NumPy's global RNG up front; random_seed is imported from helpers.
np.random.seed(random_seed)

## 4. Categorization

In [None]:
def check_correctness(train, test, col_name):
    """Print sanity checks for a feature that has just been categorized.

    Prints, in order: the shapes of both frames, whether both frames carry the
    same column labels in the same order, and - for each frame - whether the
    number of rows not flagged in ``{col_name}_missing`` equals the number of
    non-null values in ``col_name``.

    Args:
        train: Training DataFrame holding ``col_name`` and ``{col_name}_missing``.
        test: Test DataFrame holding the same two columns.
        col_name: Name of the feature whose missing-flag is being verified.

    Returns:
        None. Each check is printed as True/False; a failed check is reported,
        not raised.
    """
    missing_col = f'{col_name}_missing'

    print(train.shape, test.shape)
    # Index.equals returns False when the frames have a different number of
    # columns; an element-wise `==` raises ValueError in that case, which hides
    # exactly the mismatch this check is meant to report.
    print('train and test cols:', train.columns.equals(test.columns))
    print(f'{col_name}_missing correctness (train): ', train.shape[0] - train[missing_col].sum() == train[col_name].notnull().sum())
    print(f'{col_name}_missing correctness (test): ', test.shape[0] - test[missing_col].sum() == test[col_name].notnull().sum())

def update_datasets_with_categories(train, test, col_name, splits):
    """Categorize ``col_name`` in both datasets and flag its missing values.

    Both frames are passed through ``create_cut_and_dummies`` (a helper from
    ``helpers``) with the same ``splits``, then receive a boolean
    ``{col_name}_missing`` column that is True where ``col_name`` is null.
    The result is printed through ``check_correctness`` before returning.

    Args:
        train: Training DataFrame containing ``col_name``.
        test: Test DataFrame containing ``col_name``.
        col_name: Feature to categorize.
        splits: Bin edges forwarded unchanged to ``create_cut_and_dummies``.

    Returns:
        Tuple ``(train, test)`` of the updated frames.
    """
    missing_flag = f'{col_name}_missing'

    # Train is processed before test, and the same splits are used for both.
    binned_train, binned_test = (
        create_cut_and_dummies(frame, col_name, splits) for frame in (train, test)
    )

    # The original column must still be present in the helper's output,
    # because the flag is derived from it here.
    for frame in (binned_train, binned_test):
        frame[missing_flag] = frame[col_name].isnull()

    check_correctness(binned_train, binned_test, col_name)

    return binned_train, binned_test

def train_tree_and_plot(train, col_name, target_var, max_depth):
    """Fit a single-feature regression tree and draw it.

    Only rows where ``col_name`` is not null are used. The shape of that
    subset is printed before fitting.

    Args:
        train: DataFrame containing ``col_name`` and ``target_var``.
        col_name: The one feature the tree is allowed to split on.
        target_var: Column the tree regresses on.
        max_depth: Maximum depth passed to ``DecisionTreeRegressor``.

    Returns:
        The fitted ``DecisionTreeRegressor``.
    """
    observed = train.loc[train[col_name].notnull()]
    print(f'{col_name} filled: {observed.shape}')

    # Both inputs are reshaped to column vectors of shape (n_rows, 1).
    feature_column = np.array(observed[col_name]).reshape(-1, 1)
    target_column = np.array(observed[target_var]).reshape(-1, 1)
    regressor = tree.DecisionTreeRegressor(max_depth=max_depth).fit(feature_column, target_column)

    plt.figure(figsize=(12, 12))
    tree.plot_tree(regressor, fontsize=8)
    plt.show()

    return regressor

In [None]:
# Load the train/test CSVs; the first column of each file becomes the index.
train_df = pd.read_csv('../data/barely_processed_train.csv', index_col=0)
test_df = pd.read_csv('../data/barely_processed_test.csv', index_col=0)

In [None]:
# Column dtypes and non-null counts of the training set as loaded.
train_df.info()

## Categorization before imputation

Some of the features are missing a lot of values. In addition, some features have a roughly bimodal distribution, and traditional imputation techniques cannot reproduce such a distribution. To preserve at least some of the structure of the original data, we categorize these features first and perform imputation afterwards. In each case, missing values get a separate "missing" class.

#### R_Depth

Let's categorize this feature.

In [None]:
# Look at how R_Depth relates to the target before binning it (scatter_plot comes from helpers).
scatter_plot(train_df['R_Depth'], train_df[target_feature])

In [None]:
# Fit a depth-3 regression tree on the non-null R_Depth values and draw it.
depth_clf = train_tree_and_plot(train_df, 'R_Depth', target_feature, 3)

In [None]:
# get_splits is a helpers function; presumably it returns the fitted tree's
# thresholds as bin edges - TODO confirm.
splits = get_splits(depth_clf)
# Overwrite the first entry with 0.
# NOTE(review): looks like this pins the lowest bin edge at zero - confirm.
splits[0] = 0
# Bare expression so the notebook displays the resulting edges.
splits

In [None]:
# Apply the same splits to train and test and add the R_Depth_missing flag; both names are rebound to the results.
train_df, test_df = update_datasets_with_categories(train_df, test_df, 'R_Depth', splits)

#### R_SIO3

Let's categorize this feature.

In [None]:
# Look at how R_SIO3 relates to the target before binning it (scatter_plot comes from helpers).
scatter_plot(train_df['R_SIO3'], train_df[target_feature])

In [None]:
# Fit a depth-2 regression tree on the non-null R_SIO3 values and draw it.
sio3_clf = train_tree_and_plot(train_df, 'R_SIO3', target_feature, 2)

In [None]:
# get_splits is a helpers function; presumably it returns the fitted tree's
# thresholds as bin edges - TODO confirm.
splits = get_splits(sio3_clf)
# Overwrite the first entry with 0.
# NOTE(review): looks like this pins the lowest bin edge at zero - confirm.
splits[0] = 0
# Bare expression so the notebook displays the resulting edges.
splits

In [None]:
# Apply the same splits to train and test and add the R_SIO3_missing flag; both names are rebound to the results.
train_df, test_df = update_datasets_with_categories(train_df, test_df, 'R_SIO3', splits)

#### R_PO4

Again, let's classify it.

In [None]:
# Look at how R_PO4 relates to the target before binning it (scatter_plot comes from helpers).
scatter_plot(train_df['R_PO4'], train_df[target_feature])

In [None]:
# Fit a depth-2 regression tree on the non-null R_PO4 values and draw it.
po4_clf = train_tree_and_plot(train_df, 'R_PO4', target_feature, 2)

In [None]:
# get_splits is a helpers function; presumably it returns the fitted tree's
# thresholds as bin edges - TODO confirm.
splits = get_splits(po4_clf)
# Overwrite the first entry with 0.
# NOTE(review): looks like this pins the lowest bin edge at zero - confirm.
splits[0] = 0
# Bare expression so the notebook displays the resulting edges.
splits

In [None]:
# Apply the same splits to train and test and add the R_PO4_missing flag; both names are rebound to the results.
train_df, test_df = update_datasets_with_categories(train_df, test_df, 'R_PO4', splits)

#### R_NO2

Again, let's classify it with a tree.

In [None]:
# Look at how R_NO2 relates to the target before binning it (scatter_plot comes from helpers).
scatter_plot(train_df['R_NO2'], train_df[target_feature])

In [None]:
# Fit a depth-2 regression tree on the non-null R_NO2 values and draw it.
no2_clf = train_tree_and_plot(train_df, 'R_NO2', target_feature, 2)

In [None]:
# get_splits is a helpers function; presumably it returns the fitted tree's
# thresholds as bin edges - TODO confirm.
splits = get_splits(no2_clf)
# Overwrite the first entry with 0.
# NOTE(review): looks like this pins the lowest bin edge at zero - confirm.
splits[0] = 0
# Bare expression so the notebook displays the resulting edges.
splits

In [None]:
# Apply the same splits to train and test and add the R_NO2_missing flag; both names are rebound to the results.
train_df, test_df = update_datasets_with_categories(train_df, test_df, 'R_NO2', splits)

#### R_NO3

Again, let's classify it with a tree.

In [None]:
# Look at how R_NO3 relates to the target before binning it (scatter_plot comes from helpers).
scatter_plot(train_df['R_NO3'], train_df[target_feature])

In [None]:
# Fit a depth-2 regression tree on the non-null R_NO3 values and draw it.
no3_clf = train_tree_and_plot(train_df, 'R_NO3', target_feature, 2)

In [None]:
# get_splits is a helpers function; presumably it returns the fitted tree's
# thresholds as bin edges - TODO confirm.
splits = get_splits(no3_clf)
# Overwrite the first entry with 0.
# NOTE(review): looks like this pins the lowest bin edge at zero - confirm.
splits[0] = 0
# Bare expression so the notebook displays the resulting edges.
splits

In [None]:
# Apply the same splits to train and test and add the R_NO3_missing flag; both names are rebound to the results.
train_df, test_df = update_datasets_with_categories(train_df, test_df, 'R_NO3', splits)

#### R_NH4

Again, let's classify it with a tree.

In [None]:
# Look at how R_NH4 relates to the target before binning it (scatter_plot comes from helpers).
scatter_plot(train_df['R_NH4'], train_df[target_feature])

In [None]:
# Fit a depth-2 regression tree on the non-null R_NH4 values and draw it.
nh4_clf = train_tree_and_plot(train_df, 'R_NH4', target_feature, 2)

In [None]:
# get_splits is a helpers function; presumably it returns the fitted tree's
# thresholds as bin edges - TODO confirm.
splits = get_splits(nh4_clf)
# Overwrite the first entry with 0.
# NOTE(review): looks like this pins the lowest bin edge at zero - confirm.
splits[0] = 0
# Bare expression so the notebook displays the resulting edges.
splits

In [None]:
# Apply the same splits to train and test and add the R_NH4_missing flag; both names are rebound to the results.
train_df, test_df = update_datasets_with_categories(train_df, test_df, 'R_NH4', splits)

#### R_CHLA

Again, let's classify it with a tree.

In [None]:
# Look at how R_CHLA relates to the target before binning it (scatter_plot comes from helpers).
scatter_plot(train_df['R_CHLA'], train_df[target_feature])

In [None]:
# Fit a depth-2 regression tree on the non-null R_CHLA values and draw it.
chla_clf = train_tree_and_plot(train_df, 'R_CHLA', target_feature, 2)

In [None]:
# get_splits is a helpers function; presumably it returns the fitted tree's
# thresholds as bin edges - TODO confirm.
splits = get_splits(chla_clf)
# Overwrite the first entry with 0.
# NOTE(review): looks like this pins the lowest bin edge at zero - confirm.
splits[0] = 0
# Bare expression so the notebook displays the resulting edges.
splits

In [None]:
# Apply the same splits to train and test and add the R_CHLA_missing flag; both names are rebound to the results.
train_df, test_df = update_datasets_with_categories(train_df, test_df, 'R_CHLA', splits)

#### R_PHAEO

Again, let's classify it with a tree.

In [None]:
# Look at how R_PHAEO relates to the target before binning it (scatter_plot comes from helpers).
scatter_plot(train_df['R_PHAEO'], train_df[target_feature])

In [None]:
# Fit a depth-2 regression tree on the non-null R_PHAEO values and draw it.
phaeo_clf = train_tree_and_plot(train_df, 'R_PHAEO', target_feature, 2)

In [None]:
# get_splits is a helpers function; presumably it returns the fitted tree's
# thresholds as bin edges - TODO confirm.
splits = get_splits(phaeo_clf)
# Overwrite the first entry with 0.
# NOTE(review): looks like this pins the lowest bin edge at zero - confirm.
splits[0] = 0
# Bare expression so the notebook displays the resulting edges.
splits

In [None]:
# Apply the same splits to train and test and add the R_PHAEO_missing flag; both names are rebound to the results.
train_df, test_df = update_datasets_with_categories(train_df, test_df, 'R_PHAEO', splits)

#### R_PRES

Again, let's classify it with a tree.

In [None]:
# Look at how R_PRES relates to the target before binning it (scatter_plot comes from helpers).
scatter_plot(train_df['R_PRES'], train_df[target_feature])

In [None]:
# Fit a depth-2 regression tree on the non-null R_PRES values and draw it.
pres_clf = train_tree_and_plot(train_df, 'R_PRES', target_feature, 2)

In [None]:
# get_splits is a helpers function; presumably it returns the fitted tree's
# thresholds as bin edges - TODO confirm.
splits = get_splits(pres_clf)
# Overwrite the first entry with 0.
# NOTE(review): looks like this pins the lowest bin edge at zero - confirm.
splits[0] = 0
# Bare expression so the notebook displays the resulting edges.
splits

In [None]:
# Apply the same splits to train and test and add the R_PRES_missing flag; both names are rebound to the results.
train_df, test_df = update_datasets_with_categories(train_df, test_df, 'R_PRES', splits)

## Save the data

Also convert all boolean variables to int.

In [None]:
# Cast boolean columns (e.g. the *_missing flags created from isnull()) to 0/1 integers.
# A blanket replace({False: 0, True: 1}) relies on pandas silently downcasting the
# object-dtype result back to int, which pandas 2.2 deprecates; astype names the
# target dtype explicitly and avoids inplace mutation.
# NOTE(review): only bool-dtype columns are converted; True/False values stored in
# object-dtype columns are left untouched - confirm none exist in these frames.
train_df = train_df.astype({col: 'int64' for col in train_df.select_dtypes(include='bool').columns})
test_df = test_df.astype({col: 'int64' for col in test_df.select_dtypes(include='bool').columns})

In [None]:
# Check the training set's final columns and dtypes before saving.
train_df.info()

In [None]:
# Check the test set's final columns and dtypes before saving.
test_df.info()

In [None]:
# Write the categorized train/test frames next to the input files.
train_df.to_csv('../data/categorized_train.csv')
test_df.to_csv('../data/categorized_test.csv')