In [65]:
import pandas as pd
import numpy as np
from functools import partial

from sklearn.model_selection import train_test_split, StratifiedGroupKFold, StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin, clone, ClassifierMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight

from catboost import CatBoostClassifier, Pool

import textstat
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

import wandb

## Загрузка данных

In [None]:
# Locations of the raw and feature-engineered splits.
RAW_DATA_DIR = "data/my/before_feature_engineering"
FEATURES_DATA_DIR = "data/my/after_feature_engineering"
# Number of trailing training rows excluded from the training frames below.
# NOTE(review): the reason these rows are held out is not shown in this notebook -- confirm.
N_TRAIN_TAIL_ROWS_DROPPED = 300


def _read_frame(path):
    """Read a comma-separated file and drop its ``Unnamed: 0`` column.

    NOTE(review): ``Unnamed: 0`` is presumably a saved pandas index -- confirm.
    """
    return pd.read_csv(path, sep=",", header=0).drop(columns="Unnamed: 0")


def _drop_tail(frame, n_rows):
    """Return ``frame`` without its last ``n_rows`` rows (``n_rows == 0`` keeps all rows)."""
    # An explicit stop index avoids the ``iloc[:-0]`` pitfall, which would return an empty frame.
    return frame.iloc[:len(frame) - n_rows]


np.random.seed(59)

X_train = _drop_tail(_read_frame(f"{RAW_DATA_DIR}/train.csv"), N_TRAIN_TAIL_ROWS_DROPPED)
y_train = _drop_tail(_read_frame(f"{RAW_DATA_DIR}/train_labels.csv"), N_TRAIN_TAIL_ROWS_DROPPED)
X_train_tr = _drop_tail(_read_frame(f"{FEATURES_DATA_DIR}/train.csv"), N_TRAIN_TAIL_ROWS_DROPPED)
# Keep exactly the group rows that correspond to the retained training rows.
# Previously the groups frame was left untrimmed and stayed aligned only because
# the permutation below never indexes past len(X_train).
groups_train = _read_frame(f"{RAW_DATA_DIR}/train_groups.csv").iloc[:len(X_train)]

# All training frames are reordered with the same positional permutation,
# so they must describe the same rows before shuffling.
assert len(X_train) == len(y_train) == len(X_train_tr) == len(groups_train), (
    "training features, labels and groups must have the same number of rows"
)

# One shared permutation keeps features, labels and groups row-aligned.
permutation = np.random.permutation(X_train.shape[0])
X_train = X_train.iloc[permutation]
X_train_tr = X_train_tr.iloc[permutation]
y_train = y_train.iloc[permutation]
groups_train = groups_train.iloc[permutation]

X_test = _read_frame(f"{RAW_DATA_DIR}/test.csv")
X_test_tr = _read_frame(f"{FEATURES_DATA_DIR}/test.csv")
y_test = _read_frame(f"{RAW_DATA_DIR}/test_labels.csv")

# Reduce the label frames to the target column.
y_train = y_train["fraudulent"]
y_test = y_test["fraudulent"]

## Эксперименты с моделями без тюнинга гиперпараметров

### Вспомогательная функция

In [108]:
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    """Hard-voting wrapper around a collection of classifiers.

    The class defines no ``fit``: ``predict`` only calls ``predict`` on each
    stored estimator, so the estimators must already be fitted when passed in.
    """

    def __init__(self, estimators):
        # Stored as given, without copying or validation.
        self.estimators = estimators

    def predict(self, X):
        """Return 1 where more than half of the estimators vote 1, otherwise 0.

        Votes are summed per sample, so this assumes every estimator emits
        0/1 labels -- TODO confirm for estimators other than those used here.
        A tie (possible with an even number of estimators) resolves to 0
        because the comparison is strict.
        """
        votes = np.array([member.predict(X) for member in self.estimators])
        positive_votes = votes.T.sum(axis=1)
        threshold = len(self.estimators) / 2
        return (positive_votes > threshold).astype(int)

def cross_validate_and_log_metrics(model, X, y, groups, cv_class=StratifiedGroupKFold, n_splits=5):
    """Cross-validate ``model``, log per-fold and mean metrics to wandb, and build a voting ensemble.

    Args:
        model: Unfitted estimator; a fresh ``clone`` of it is fitted on every fold.
        X: Feature table, indexed positionally via ``.iloc``.
        y: Target, indexed positionally via ``.iloc``.
        groups: Group labels forwarded to ``cv.split``.
        cv_class: Splitter class, constructed with ``n_splits``, ``shuffle=True``
            and ``random_state=42``.
        n_splits: Number of folds.

    Returns:
        A tuple ``(mean_metrics, ensemble)`` where ``mean_metrics`` maps
        ``mean_f1`` / ``mean_precision`` / ``mean_recall`` to the fold averages and
        ``ensemble`` is a ``MajorityVoteClassifier`` over the per-fold models.
    """
    scorers = {'f1': f1_score, 'precision': precision_score, 'recall': recall_score}
    splitter = cv_class(n_splits=n_splits, shuffle=True, random_state=42)
    per_fold = {name: [] for name in scorers}
    fitted_models = []

    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y, groups=groups), start=1):
        estimator = clone(model)
        estimator.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        fitted_models.append(estimator)

        y_holdout = y.iloc[holdout_rows]
        y_hat = estimator.predict(X.iloc[holdout_rows])
        # zero_division=0 scores a fold with an undefined metric as 0.
        for name, scorer in scorers.items():
            per_fold[name].append(scorer(y_holdout, y_hat, zero_division=0))

        # One wandb record per fold.
        fold_record = {'fold': fold_no}
        fold_record.update({f'fold_{name}': per_fold[name][-1] for name in scorers})
        wandb.log(fold_record)

    mean_metrics = {f'mean_{name}': np.mean(scores) for name, scores in per_fold.items()}

    # Final record: the averages plus a completion marker.
    summary_record = dict(mean_metrics)
    summary_record['status'] = 'completed'
    wandb.log(summary_record)

    return mean_metrics, MajorityVoteClassifier(fitted_models)

def print_train_metrics(metrics):
    """Print every entry of ``metrics`` as ``Train <name>: <value>`` with three decimals."""
    for name in metrics:
        value = metrics[name]
        print(f"Train {name}: {value:.3f}")

def calc_and_print_metrics(model, X, y, is_test=True):
    """Predict with ``model`` on ``X`` and print F1, precision and recall against ``y``.

    Args:
        model: Fitted object exposing ``predict(X)``.
        X: Features passed to ``model.predict``.
        y: True labels to score the predictions against.
        is_test: Selects the line prefix -- ``"Test"`` when true, ``"Train"`` otherwise.
    """
    y_pred = model.predict(X)
    calc_set_string = "Test" if is_test else "Train"
    scorers = (
        ("F1-Score", f1_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    )
    for label, scorer in scorers:
        # zero_division=0 matches cross_validate_and_log_metrics: an undefined
        # metric is reported as 0 without an UndefinedMetricWarning.
        score = scorer(y, y_pred, zero_division=0)
        print(f"{calc_set_string} {label}: {score:.3f}")
            

### Эксперименты

#### Случайный лес

В качестве бейзлайна мы брали логистическую регрессию, поэтому в качестве первой модели сейчас можно попробовать случайный лес, так как он позволит уловить нелинейные закономерности в данных.

In [117]:
# Random forest with default hyperparameters apart from balanced class weights.
n_folds = 5
model_rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# One wandb run covers the whole cross-validation.
with wandb.init(
    project="job-fake-prediction",
    config={
        "model_type": "random_forest",
        "validation": "stratified_group_kfold",
        "k_folds": n_folds,
    },
):
    metrics, averaged_model_rf = cross_validate_and_log_metrics(
        model_rf,
        X_train_tr,
        y_train,
        groups_train,
        n_splits=n_folds,
    )
    # Score the majority-vote ensemble of fold models on the training frame, then on the test set.
    calc_and_print_metrics(averaged_model_rf, X_train_tr, y_train, is_test=False)
    calc_and_print_metrics(averaged_model_rf, X_test_tr, y_test, is_test=True)

Train F1-Score: 1.000
Train Precision: 1.000
Train Recall: 1.000
Test F1-Score: 0.112
Test Precision: 1.000
Test Recall: 0.059


0,1
fold,▁▃▅▆█
fold_f1,▁▁█▄▂
fold_precision,▁████
fold_recall,▁▁█▃▂
mean_f1,▁
mean_precision,▁
mean_recall,▁

0,1
fold,5
fold_f1,0.10526
fold_precision,1
fold_recall,0.05556
mean_f1,0.19034
mean_precision,0.8
mean_recall,0.12475
status,completed


Видно, что модель очень сильно переобучилась и показывает себя хуже, чем логистическая регрессия (бейзлайн). Может быть, тюнинг гиперпараметров поможет её регуляризовать. Хотя у этой модели есть небольшой плюс -- у неё точность (precision) 100% на трейне и тесте. Она выявляет всего около 6% фрода на тесте (recall 0.059), но зато не даёт ложноположительных результатов.

Скрины метрик (step здесь имеет смысл очередного сплита кросс-валидации):
![](images/random_forest/f1.png)
![](images/random_forest/precision.png)
![](images/random_forest/recall.png)

#### Градиентный бустинг

Известно, что catboost хорошо работает с текстовыми признаками, поэтому обучим его на исходных текстовых признаках после их предобработки. За счет нативной работы с текстовыми признаками мы ожидаем, что результат может превзойти предыдущие эксперименты по качеству.

In [None]:
# Text columns handed to CatBoost as text features.
raw_text_features = [
    "title_processed",
    "description_processed",
    "company_profile_processed",
]
# Keep only the text columns and replace missing values with empty strings.
X_train_cb, X_test_cb = (
    frame[raw_text_features].fillna('') for frame in (X_train, X_test)
)

In [112]:
class MetricsLoggerCallback:
    """Training callback that sends the latest validation metrics to wandb."""

    def after_iteration(self, info):
        """Log the most recent F1, precision and recall, then return True.

        Reads ``info.metrics["validation"]`` and takes the last element of each
        metric's history.
        NOTE(review): CatBoost presumably treats the True return value as
        "continue training" -- confirm against the CatBoost callback docs.
        """
        history = info.metrics["validation"]
        # wandb key -> metric name in the validation history
        source_keys = {
            'f1': 'F1',
            'precision': 'Precision:use_weights=false',
            'recall': 'Recall:use_weights=false',
        }
        latest = {name: history[key][-1] for name, key in source_keys.items()}
        wandb.log(latest)
        return True

In [113]:
# CatBoost trained directly on the preprocessed text columns.
model_cb = CatBoostClassifier(
    iterations=100,
    loss_function='Logloss',
    eval_metric='F1',
    custom_metric=['Precision', 'Recall'],
    class_weights=[1, 5],
    text_features=raw_text_features,
    verbose=False,
    use_best_model=False,
    early_stopping_rounds=None,
)

with wandb.init(project="job-fake-prediction", config={"model_type": "catboost"}):
    # NOTE(review): eval_set is the test split. With use_best_model=False and
    # early_stopping_rounds=None it is presumably used only for the
    # per-iteration metrics that the callback logs -- confirm.
    model_cb.fit(
        X_train_cb,
        y_train,
        callbacks=[MetricsLoggerCallback()],
        eval_set=[Pool(X_test_cb, y_test, text_features=raw_text_features)],
    )
    calc_and_print_metrics(model_cb, X_train_cb, y_train, is_test=False)
    calc_and_print_metrics(model_cb, X_test_cb, y_test, is_test=True)


Train F1-Score: 0.801
Train Precision: 0.705
Train Recall: 0.928
Test F1-Score: 0.446
Test Precision: 1.000
Test Recall: 0.287


0,1
f1,▁▁██▇▇▇▇▆▁▃▃▃▃▃▅▅▆▅▅▅▅▅▅▅▅▅▄▄▄▄▄▄▃▃▄▅▄▅▄
precision,▁▄██████████████████████████████████████
recall,▁█▅▇▇▂▂▅▃▃▁▅▅▅▅▆▆▅▅▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▄▅▄▅▄▄

0,1
f1,0.44615
precision,1.0
recall,0.28713


Скрины метрик на тесте:

![](images/catboost/f1.png)
![](images/catboost/precision.png)
![](images/catboost/recall.png)

Видно, что модель переобучилась, но при этом качество все равно существенно улучшилось по сравнению со всеми предыдущими экспериментами.