## Setup

In [2]:
import os 

# True when the KAGGLE_KERNEL_RUN_TYPE environment variable is set.
is_kaggle = os.getenv('KAGGLE_KERNEL_RUN_TYPE') is not None

In [9]:
%%capture
if is_kaggle:
    !pip install ../input/sklearn-1-0/scikit_learn-1.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl

In [None]:
from pathlib import Path
from typing import Tuple

import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Random seed, passed as random_state to the train/validation split and to the model.
seed = 32

In [None]:
# Same Kaggle check as in the first setup cell, re-evaluated here:
# the flag is True when KAGGLE_KERNEL_RUN_TYPE is present in the environment.
is_kaggle = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ

In [None]:
# Pick the input directory, output directory and submission file name
# depending on whether the notebook runs on Kaggle or locally.
if is_kaggle:
    data_path = Path('/kaggle') / 'input' / 'jigsaw-unintended-bias-in-toxicity-classification'
    output_path = Path("/kaggle") / "working"
    output_file_name = 'submission.csv'
else:
    data_path = Path("..") / "data" / "raw"
    output_path = Path("..") / 'data' / "submissions"
    output_file_name = 'lg.csv'

In [None]:
# Display the resolved data directory.
data_path

## Load Data

In [None]:
# Read train.csv from the data directory into a DataFrame.
df = pd.read_csv(data_path / 'train.csv')

In [None]:
# Preview the first two rows.
df.head(2)

In [None]:
# Binary label: 1 where target is at least 0.5, otherwise 0.
df['label'] = df['target'].ge(0.5).astype(int)

In [None]:
# Text column and label column of the full training frame.
x, y = df['comment_text'], df['label']

In [None]:
def get_x_y(
    df: pd.DataFrame, input_col: str = 'comment_text', label_col: str = 'label'
) -> Tuple[pd.Series, pd.Series]:
    """Split a frame into its model-input column and its label column.

    Args:
        df: Frame containing both ``input_col`` and ``label_col``.
        input_col: Name of the column holding the model input.
        label_col: Name of the column holding the label.

    Returns:
        An ``(x, y)`` tuple of the two selected columns. Selecting a single
        column by name yields a ``pd.Series`` (not a ``pd.DataFrame``).

    Raises:
        KeyError: If either column is missing from ``df``.
    """
    return df[input_col], df[label_col]

In [None]:
# Hold out 20% of the rows for validation; random_state=seed fixes the shuffle.
df_train, df_valid = train_test_split(df, test_size=0.2, random_state=seed)

In [None]:
# Pull the text column and the label column out of each split.
x_train, y_train = get_x_y(df_train, input_col='comment_text', label_col='label')
x_valid, y_valid = get_x_y(df_valid, input_col='comment_text', label_col='label')

## Preprocess Data

In [None]:
# Convert each text column into a plain Python list for the vectorizer.
train_text = x_train.tolist()
valid_text = x_valid.tolist()

In [None]:
# Bag-of-words counts. The vocabulary is learned from the training text only
# and then applied unchanged to the validation text.
vectorizer = CountVectorizer(
    stop_words="english",
    max_features=5000,
    min_df=0.001,
    max_df=0.99,
)
x_train_prepared = vectorizer.fit_transform(train_text)
x_valid_prepared = vectorizer.transform(valid_text)


In [None]:
# Shape of the vectorized training matrix.
x_train_prepared.shape

In [None]:
# Shape of the vectorized validation matrix.
x_valid_prepared.shape

In [None]:
# Feature names (vocabulary terms) kept by the fitted vectorizer.
vectorizer.get_feature_names_out()

## Train Model

In [None]:
# Logistic regression with the liblinear solver; random_state=seed for reproducibility.
mod_lg = LogisticRegression(random_state=seed, solver='liblinear')

In [None]:
# Fit on the training matrix. Binding the return value to `_` keeps the
# estimator repr out of the cell output.
_ = mod_lg.fit(x_train_prepared, y_train)

## Evaluate Model

In [None]:
def get_freq_table(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Tabulate how often each distinct value of ``col`` occurs.

    The result is indexed by the distinct values and has two columns:
    "count" (absolute frequency) and "percentage" (relative frequency,
    as a fraction of the counted rows).
    """
    values = df[col]
    return pd.concat(
        [values.value_counts(), values.value_counts(normalize=True)],
        axis=1,
        keys=["count", "percentage"],
    )

In [None]:
def evaluate_model(
    df: pd.DataFrame,
    label_col: str = "label",
    pred_col: str = "y_pred",
    proba_col: str = "y_pred_proba",
) -> pd.DataFrame:
    """Score binary predictions stored next to the true labels.

    Args:
        df: Frame holding the true labels, the predicted classes and the
            predicted scores, one row per example.
        label_col: Column with the true labels.
        pred_col: Column with the predicted class labels (used for
            accuracy and F1).
        proba_col: Column with the predicted scores (used for ROC AUC).

    Returns:
        A frame with columns "metrics" and "value" and one row each for
        "accuracy", "f1" and "auc_roc".

    Raises:
        KeyError: If any of the three columns is missing from ``df``.
    """
    y_true = df[label_col].values
    y_pred = df[pred_col].values
    y_proba = df[proba_col].values

    # Dict insertion order fixes the row order of the result table.
    scores = {
        "accuracy": metrics.accuracy_score(y_true, y_pred),
        "f1": metrics.f1_score(y_true, y_pred),
        "auc_roc": metrics.roc_auc_score(y_true, y_proba),
    }
    return pd.DataFrame(
        {"metrics": list(scores.keys()), "value": list(scores.values())}
    )


In [None]:
# Separate evaluation frame with a fresh default index, so the prediction
# columns added to it do not end up on df_valid.
df_eval = df_valid.reset_index(drop=True).copy()

In [None]:
# Predicted class for every validation row.
df_eval['y_pred'] = mod_lg.predict(x_valid_prepared)
# Column index 1 of predict_proba; with the 0/1 labels used for training this
# should be the probability of label 1 (see mod_lg.classes_ to confirm).
df_eval['y_pred_proba'] = mod_lg.predict_proba(x_valid_prepared)[:, 1]

In [None]:
# Preview the first two rows, including the new prediction columns.
df_eval.head(2)

In [None]:
# Count and share of each label value in the validation split.
get_freq_table(df_valid, col='label')

In [None]:
# Accuracy, F1 and ROC AUC of the validation predictions.
evaluate_model(df_eval)

## Explain Model

In [None]:
# Pair every feature name with its fitted coefficient (first row of coef_).
df_coef = pd.DataFrame({"name": vectorizer.get_feature_names_out(), "coef": mod_lg.coef_[0]})
# Display the table sorted by coefficient, ascending; df_coef itself stays unsorted.
df_coef.sort_values('coef', ignore_index=True)

## Make Submission

In [None]:
# Read test.csv from the data directory.
df_test = pd.read_csv(data_path / 'test.csv')

In [None]:
# Preview the first two test rows.
df_test.head(2)

In [None]:
# Vectorize the test text with the vocabulary already fitted on the training text.
x_test_prepared = vectorizer.transform(df_test['comment_text'].tolist())

In [None]:
# Column index 1 of predict_proba for every test row (same column used for validation).
prediction = mod_lg.predict_proba(x_test_prepared)[:, 1]

In [None]:
# Display the array of test predictions.
prediction

In [None]:
# Build the submission table: one predicted probability per test id.
df_submit = pd.DataFrame({'id': df_test.id.values, 'prediction': prediction})
# to_csv does not create missing parent directories, so make sure the output
# directory exists first (a no-op when it is already there).
output_path.mkdir(parents=True, exist_ok=True)
df_submit.to_csv(output_path / output_file_name, index=False)