# Libraries

In [1]:
import numpy as np
import seaborn as sns
from pprint import pprint
from mlpocket.tabular import *
from sklearn.model_selection import train_test_split

# Models
from sklearn.linear_model import LogisticRegression

sns.set_style('darkgrid')

# Config

In [3]:
class Config:
    """Notebook-wide settings, read directly as class attributes."""

    # --- Validation split / reproducibility ---
    seed = 42  # forwarded as random_state to train_test_split
    test_size = 0.33  # forwarded as test_size to train_test_split

    # --- Dataset ---
    label = "label"  # name of the target column
    data_dir = "folder"  # sub-folder of ../data/raw/ holding train.csv and test.csv

    # --- Outputs ---
    # Not referenced by the visible cells; presumably where submissions are written.
    output_dir = "outputs/submissions/folder"

# Data

## Loading

In [None]:
# load_data comes from mlpocket.tabular. When `labels` is given it returns three
# values here; only the last one is kept as the training frame.
# NOTE(review): the meaning of the two discarded values is not visible in this
# notebook — confirm against the mlpocket documentation.
_, _, train = load_data(
    path=f"../data/raw/{Config.data_dir}/train.csv",
    labels=[Config.label],
)
# The test file is loaded without `labels`, and the result is used as a single frame.
test = load_data(path=f"../data/raw/{Config.data_dir}/test.csv")

## Visualization

In [None]:
# One shape per line: training frame first, then test frame.
print(train.shape, test.shape, sep="\n")

In [None]:
display(train.head())
display(test.head())

In [None]:
train.dtypes

In [None]:
train.columns

In [None]:
train.describe()

### NaN statistics

In [None]:
get_nan_stats(train)

### Data distributions

In [None]:
plot_columns_dist(train)

### Correlation of values

In [None]:
plot_corr(train)

### Class Imbalance

In [None]:
# normalize=True reports relative frequencies instead of raw counts
# (assuming `train` is a pandas DataFrame — TODO confirm load_data's return type).
train[Config.label].value_counts(normalize=True)

In [None]:
sns.countplot(x=train[Config.label])

### Features  <span style="color: green;font-weight: bold;">(data-specific visualization)</span>

#### [Categorical Feature]

In [None]:
# Template placeholder: name of the categorical column inspected by the plots below.
feature = "cat"

##### Which category is the most frequent?

In [None]:
# Horizontal count plot: one bar per distinct value of `feature`.
# NOTE(review): seaborn >= 0.13 emits a deprecation warning when `palette` is
# passed without `hue`; once the installed seaborn version is confirmed,
# consider adding hue=feature, legend=False.
sns.countplot(data=train, y=feature, palette="husl")

##### Which category contributes the most to each label?

In [None]:
sns.countplot(data=train, x=feature, hue=Config.label)

## Preprocessing

In [11]:
# Accumulates the names of columns to remove; later preprocessing cells extend
# it and the final step passes it to drop() for both train and test.
to_drop = set()

### Fill missing values

In [6]:
def fill_missing(df):
    """Return a copy of ``df`` with missing values handled.

    The input is never modified: all work happens on a copy obtained via
    ``df.copy()``. As shipped, this template applies no imputation, so the
    result is simply that copy.
    """
    filled = df.copy()
    # Add all missing-value handling here, operating on `filled`.
    return filled

In [None]:
# Run the same missing-value handling on train and test so both stay aligned.
train_no_missing, test_no_missing = map(fill_missing, (train, test))

#### Columns to drop (if any)

In [12]:
# Template placeholder: put the names of columns to discard after missing-value
# handling inside the list (an empty list leaves to_drop unchanged).
to_drop = to_drop.union([])

### Categorical to Dummy

In [None]:
# Template placeholder: names of the categorical columns handed to to_dummies().
columns = []

In [None]:
# Encode train and test in one call to the mlpocket helper.
# NOTE(review): presumably this keeps both frames on the same set of dummy
# columns — confirm against the mlpocket implementation.
train_dummies, test_dummies = to_dummies(train_no_missing, test_no_missing, columns)

#### Columns to drop (if any)

In [13]:
# Template placeholder: put the names of columns to discard after dummy
# encoding inside the list (an empty list leaves to_drop unchanged).
to_drop = to_drop.union([])

### Drop Columns

In [None]:
# Remove the accumulated to_drop columns from both frames, train first.
train_final, test_final = (
    drop(frame, to_drop) for frame in (train_dummies, test_dummies)
)

### Result

In [None]:
len(train_final.columns)

In [None]:
train_final.columns

In [None]:
train_final.head()

# Modeling

## Baseline algorithms

In [None]:
# Hold out a Config.test_size fraction of the rows for validation;
# Config.seed makes the shuffle reproducible.
# NOTE(review): the split is not stratified. If the class-imbalance check
# earlier in this notebook shows a skewed label, consider passing
# stratify=train_final[Config.label].
train_final_split, val_final_split = train_test_split(
    train_final,
    test_size=Config.test_size,
    random_state=Config.seed,
)

In [None]:
# Separate the training features from the target column.
X_train = drop(train_final_split, [Config.label])
y_train = train_final_split[Config.label]

In [None]:
# Separate the validation features from the target column.
X_val = drop(val_final_split, [Config.label])
y_val = val_final_split[Config.label]

### Logistic Regression

# Testing