# Libraries

In [1]:
import numpy as np
import seaborn as sns
from pprint import pprint
from mlpocket.tabular import *
from sklearn.model_selection import train_test_split

sns.set_style('darkgrid')

In [None]:
%%javascript
    // Raise the output-area auto-scroll threshold to 9999 (presumably so long
    // cell outputs are rendered in full instead of inside a scroll box).
    IPython.OutputArea.auto_scroll_threshold = 9999

# Config

In [3]:
class Config:
    """Notebook-wide settings; edit the placeholders before running the cells."""

    # Dataset folder name; the loading cell reads ../data/raw/<data_dir>/*.csv.
    data_dir = "folder"
    # Name of the target column in the training data.
    label = "label"
    # Derived from data_dir so that renaming the dataset folder keeps the
    # submission directory in sync instead of requiring two manual edits.
    output_dir = f"outputs/submissions/{data_dir}"
    # Hold-out fraction (presumably for train_test_split — not used in the
    # visible cells).
    test_size = 0.33
    # Random seed (presumably for reproducible splits/models — not used in the
    # visible cells).
    seed = 42

# Data

## Loading

In [None]:
# Load the training CSV, passing the label column name; only the third value
# returned by load_data is kept, as `train`.
_, _, train = load_data(path=f"../data/raw/{Config.data_dir}/train.csv", labels=[Config.label])
# NOTE(review): the call above unpacks three values while this one keeps the
# result whole — confirm load_data returns a single dataframe without `labels`.
test = load_data(path=f"../data/raw/{Config.data_dir}/test.csv")

## Visualization

In [None]:
# Print the shape of the train and test data.
print(train.shape)
print(test.shape)

In [None]:
# Preview the first rows of each dataset.
display(train.head())
display(test.head())

In [None]:
# Data type of each column in train.
train.dtypes

In [None]:
# Column names available in train.
train.columns

In [None]:
# Summary statistics of train.
train.describe()

### NaN statistics

In [None]:
# Report missing values for both splits (printing and plotting enabled).
# The "test" entry must reference the test frame: it previously reused
# `train`, so the test statistics were only a copy of the train ones.
get_nan_stats(
    {
        "train": train,
        "test": test,
    },
    print_nan_stats=True,
    plot_nan_stats=True,
)

### Class Imbalance <span style="color: green;font-weight: bold;">(classification only)</span>

In [None]:
# Tabulate the label column of train with val_count_df (helper presumably
# provided by the mlpocket star import — TODO confirm its output schema).
class_values_count = val_count_df(train, Config.label)
display(class_values_count)
# Pie chart of the "Value Count" column, with the label name as the y-axis label.
class_values_count.plot.pie(y="Value Count", figsize=(5,5), legend=False, ylabel=Config.label)

### Features  <span style="color: green;font-weight: bold;">(data-specific visualization)</span>

#### [Categorical Feature]

In [None]:
# Placeholder: name of the categorical column examined by the cells below.
feature="cat"

##### Who is highest?

In [None]:
# Horizontal count plot: number of train rows per value of `feature`.
sns.countplot(y=feature, data=train, palette="husl")

##### Who contributes the most to the labels?

In [None]:
# Count of train rows per value of `feature`, split by the label column.
sns.countplot(data=train, x=feature, hue=Config.label)

##### What is the distribution of categories for each dataframe?

In [None]:
# Compare the value counts of `feature` between train and test.
# The "test" entry must reference the test frame: it previously reused
# `train`, so the comparison showed train against itself.
display(
    compare_value_counts(
        {
            "train": train,
            "test": test,
        },
        column_name=feature,
    )
)

### Investigate category to category relationship

In [None]:
# Placeholders: the column to group by, and the list of columns whose distinct
# values are counted per group in the next cell.
main_category = ""
other_categories = []

In [None]:
# For train and test stacked together: number of distinct values of each
# `other_categories` column within every `main_category` group.
# NOTE(review): `pd` is not imported explicitly in the visible cells — presumably
# it arrives via the mlpocket star import; confirm.
pd.concat([train,test]).groupby([main_category])[other_categories].nunique()

In [None]:
# Only run this if every main category maps to a single combination of the
# "other categories" columns. In other words, every value in the previous table is 1.

# pd.concat([train,test]).groupby([main_category])[other_categories].first()

### Data distributions

In [None]:
# Plot column distributions for train and test, skipping the "id" column;
# grid_size=(7, 3) is presumably the subplot grid (rows, columns) — TODO confirm.
plot_columns_dist({
    "train": train,
    "test": test
}, 
    exclude=["id"],
    grid_size=(7, 3))

### Correlation of values

In [None]:
# Correlation plot for train (plot_corr is a project helper — presumably a
# heatmap of pairwise column correlations; confirm).
plot_corr(train)

### Mutual Information <span style="color: green;font-weight: bold;"> (continuous features)</span>

In [None]:
# Keep only float-typed columns of train and drop every row containing a NaN.
# NOTE(review): if the label column is itself float it is kept here as a
# feature — confirm that is not the case for this dataset.
df_for_mi = train.select_dtypes(include=['float16', 'float32', 'float64']).dropna()

# Score the surviving rows against their labels (aligned through the index).
# The third argument is a per-column boolean mask (presumably "is discrete");
# with float-only columns selected above it is False for every column.
mi_scores = get_mi_scores(df_for_mi, 
                          train.loc[df_for_mi.index, Config.label], 
                          df_for_mi.dtypes == int)

# Show which columns were scored.
print(df_for_mi.columns)

In [None]:
# Create a wide figure, then draw the scores as horizontal bars (one per index
# entry of mi_scores). The trailing semicolon suppresses the notebook's echo of
# the returned object.
# NOTE(review): `plt` is not imported explicitly in the visible cells — confirm
# it is provided by the mlpocket star import.
f,ax = plt.subplots(figsize=(20,10))
sns.barplot(y=mi_scores.index, x=mi_scores.values);

### Mean label <span style="color: green;font-weight: bold;"> (discrete features)</span>

In [None]:
# Discrete candidates: the integer- and object-typed columns of train, minus
# the identifier and the label.
# errors='ignore' keeps the cell working when "id" or the label column is not
# among the selected dtypes (e.g. a float label or a dataset without an id
# column); a plain drop raises KeyError in that case.
features_to_include = (
    train.select_dtypes(include=['int16', 'int32', 'int64', 'object'])
    .drop(columns=['id', Config.label], errors='ignore')
    .columns.tolist()
)

features_to_include

In [None]:
# Plot the label against each selected discrete feature (per the section
# heading, the mean label per feature value — helper semantics not shown here).
plot_mean_label_per_feature(
    train,
    columns=features_to_include,
    label=Config.label)

## Preprocessing

In [None]:
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer

In [11]:
# Columns to remove before modeling. A set literal is required here:
# set("id") iterates the string and yields {"i", "d"} instead of {"id"}.
to_drop = {"id"}

### Fill missing values

In [6]:
def fill_missing(train, val):
    """Template hook for missing-value handling.

    Works on copies so the caller's objects are never modified. As written it
    applies no imputation and simply returns the two copies.

    Returns:
        A ``(train, val)`` pair of copies.
    """
    train_filled = train.copy()
    val_filled = val.copy()

    # Add the missing-value handling here, operating on the two copies.
    # CAUTION: both returned objects should still be dataframes!

    return train_filled, val_filled

#### To drop if any

In [12]:
# Placeholder: list any columns to drop after the missing-value step inside the
# brackets; union() returns a new set that includes them.
to_drop = to_drop.union([])

### Scale features

In [None]:
def scale(train, val):
    """Template hook for feature scaling.

    Works on copies so the caller's objects are never modified. As written it
    applies no scaling and simply returns the two copies.

    Returns:
        A ``(train, val)`` pair of copies.
    """
    train_scaled = train.copy()
    val_scaled = val.copy()

    # Add the feature-scaling steps here, operating on the two copies.
    # CAUTION: both returned objects should still be dataframes!

    return train_scaled, val_scaled

### Categorical to Dummy

In [None]:
# Placeholder: per the section heading, presumably the categorical columns to
# convert to dummy variables — not used in the visible cells.
columns = []

#### To drop if any

In [13]:
# Placeholder: list any columns to drop after the dummy-encoding step inside
# the brackets; union() returns a new set that includes them.
to_drop = to_drop.union([])

### Drop Columns

In [None]:
# Show the accumulated set of columns marked for removal.
print(to_drop)

# Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import GroupKFold

In [None]:
# Split the prepared training frame into features (X) and target (y).
# NOTE(review): `train_final` and `test_df` are not defined in the visible
# cells — they presumably come from applying the preprocessing steps above;
# define them before running this cell.
X = train_final.drop(columns=Config.label)
y = train_final[Config.label]
X_test = test_df

## Baseline algorithms

# Testing