## Import library

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import yaml
import re
import matplotlib.pyplot as plt

# pd.set_option('future.no_silent_downcasting', True)

## Load data and tokens

In [2]:
with open('data/entities.json', 'r') as file:
    data = json.load(file)

with open('data/tokens.yml', 'r') as file:
    tokens = yaml.safe_load(file)
    
token_dict = {}
for col in tokens.keys():
    token = tokens[col]['start']
    token_dict[token] = col

token_dict

{'Ⓐ': 'age',
 'Ⓑ': 'birth_date',
 'Ⓒ': 'civil_status',
 'Ⓓ': 'education_level',
 'Ⓔ': 'employer',
 'Ⓕ': 'firstname',
 'Ⓗ': 'link',
 'Ⓘ': 'lob',
 'Ⓙ': 'maiden_name',
 'Ⓚ': 'nationality',
 'Ⓛ': 'observation',
 'Ⓜ': 'occupation',
 'Ⓞ': 'surname',
 'Ⓟ': 'surname_household'}

In [3]:
## Some Useful functions in my own .py

In [4]:
from src.useful_functions import *

## Preprocessing

In [5]:
df = get_preprocessing_done(data, tokens,  token_dict)
df = df.rename(columns={"surname_household": "Is_household"})
X = df.drop(columns=['Is_household'])
y = df['Is_household'].apply(
                                lambda x: 0 if pd.isna(x) else 1
                            )

100%|██████████| 1218/1218 [00:00<00:00, 3335.77it/s]
  df = pd.DataFrame().from_dict(df_dict, orient='index').fillna(value=np.nan)


In [6]:
X

Unnamed: 0,age,birth_date,civil_status,education_level,employer,firstname,link,lob,maiden_name,nationality,observation,occupation,surname
0,25,,Garçon,,,Cyrille,,,,française,,menuisier,Breton
1,30,,Garçon,,,Auguste,,,,Piémontaise,,vitrier,
2,24,,Garçon,,,Pierre,,,,Piémontaise,,vitrier,
3,48,,Homme marié,,,Alexandre,,,,française,,prop re,
4,30,,,,,Zélie,sa fe,,,française,,prop re,Vignat
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25433,,1869,,,,Marie,chef,Pailharès,,idem,,,
25434,,1863,,,Cara,Marie,chef,St Naz en Royans,,idem,,ouv chaus res,
25435,,1886,,,Baretto,Nello,chef,Castel,,italienne,,manoeuvre,
25436,,1887,,,,Annunziata,épouse,idem,,idem,,,Berni-Laureti


In [7]:
y

0        0
1        1
2        1
3        1
4        0
        ..
25433    1
25434    1
25435    1
25436    0
25437    0
Name: Is_household, Length: 25074, dtype: int64

## Gradient Boosting

In [8]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import log_loss

#### Ordinal Encoding

In [9]:
enc = OrdinalEncoder()
X_encoded = enc.fit_transform(X)

In [10]:
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.20, stratify=y)

### Gradient Boosting

In [11]:
param_grid = {
    "learning_rate": [1e-1, 1e-2, 1e-3],
    "max_leaf_nodes": [15, 31, 100],
    "max_depth": [None, 5, 10, 20],
    "min_samples_leaf": [10, 20, 50],
    "l2_regularization": [0.0, 0.1, 1.0]
}

model = HistGradientBoostingClassifier(max_iter=10_000, early_stopping=True, class_weight='balanced', validation_fraction=0.2)
grid_search = GridSearchCV(model, param_grid, cv=3, scoring="accuracy", n_jobs=-1, verbose=10)

grid_search.fit(X_train_enc, y_train_enc)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV 1/3; 7/324] START l2_regularization=0.0, learning_rate=0.1, max_depth=None, max_leaf_nodes=100, min_samples_leaf=10
[CV 1/3; 7/324] END l2_regularization=0.0, learning_rate=0.1, max_depth=None, max_leaf_nodes=100, min_samples_leaf=10;, score=0.996 total time=   1.0s
[CV 1/3; 32/324] START l2_regularization=0.0, learning_rate=0.1, max_depth=20, max_leaf_nodes=31, min_samples_leaf=20
[CV 1/3; 32/324] END l2_regularization=0.0, learning_rate=0.1, max_depth=20, max_leaf_nodes=31, min_samples_leaf=20;, score=0.996 total time=   0.4s
[CV 1/3; 42/324] START l2_regularization=0.0, learning_rate=0.01, max_depth=None, max_leaf_nodes=31, min_samples_leaf=50
[CV 1/3; 42/324] END l2_regularization=0.0, learning_rate=0.01, max_depth=None, max_leaf_nodes=31, min_samples_leaf=50;, score=0.995 total time=   3.5s
[CV 3/3; 52/324] START l2_regularization=0.0, learning_rate=0.01, max_depth=5, max_leaf_nodes=100, min_samples_leaf=10
[CV 3/3

In [12]:
# Print the best parameters found
print("Best parameters:", grid_search.best_params_)

Best parameters: {'l2_regularization': 0.0, 'learning_rate': 0.1, 'max_depth': 10, 'max_leaf_nodes': 100, 'min_samples_leaf': 10}


In [13]:
# Get the best model from the grid search
clf = grid_search.best_estimator_
clf.fit(X_train_enc, y_train_enc)

In [14]:
print(f"Train accuracy: {clf.score(X_train_enc, y_train_enc):.3f}")
print(f"Test accuracy: {clf.score(X_test_enc, y_test_enc):.3f}")

Train accuracy: 0.999
Test accuracy: 0.996


In [15]:
print(f'Train CE loss = {log_loss(y_train_enc, clf.predict_proba(X_train_enc)):.4f}')
print(f'Test CE loss = {log_loss(y_test_enc, clf.predict_proba(X_test_enc)):.4f}')

Train CE loss = 0.0038
Test CE loss = 0.0179


## Encode through pre-trained model

In [16]:
from transformers import BartTokenizer, BartModel

ModuleNotFoundError: No module named 'transformers'

In [None]:
X_str = np.array([' '.join([str(x) for x in X.iloc[i].dropna(inplace=False).values]) for i in range(len(X))], dtype=str)

In [None]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartModel.from_pretrained('facebook/bart-base')

In [None]:
inputs

In [None]:
inputs = tokenizer(X_str[10], return_tensors="pt")
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

In [None]:
last_hidden_states.cpu().detach().numpy().squeeze().shape