In [1]:
import os
import sys
import pandas as pd
import numpy as np
import base64
import datetime
import difflib
import re
import json
from itertools import chain
import matplotlib.pyplot as plt
%load_ext autoreload
%load_ext blackcellmagic
%autoreload 2

%matplotlib inline

In [2]:
from dotenv import load_dotenv
# Populate the process environment via python-dotenv before reading settings.
load_dotenv()
# Put the directory named by `lib_path` at the front of the import path
# (presumably where the project's read_data / filter_data modules live).
# An unset variable must not be inserted (os.getenv would return None), and
# re-running this cell must not stack duplicate entries.
lib_path = os.getenv('lib_path')
if not lib_path:
    print("lib_path is not set; project modules may not be importable", file=sys.stderr)
elif lib_path not in sys.path:
    sys.path.insert(0, lib_path)

In [3]:
from read_data import read_data

In [4]:
from filter_data import filter_data

In [5]:
# Input data location, taken from the `data_path` environment variable (None when unset).
DATA_PATH = os.getenv("data_path")

In [None]:
# Load the dataset with both merge flags switched on (medications and patient
# genes, going by the flag names; see read_data for the exact semantics).
read_options = {"merge_with_meds": True, "merge_with_patient_genes": True}
data_900 = read_data(DATA_PATH, **read_options)
# The frame exposed by the loader; every later cell works from this.
data_900_df = data_900.input_df

In [7]:
from sklearn.ensemble import IsolationForest

In [8]:
# Candidate columns handed to the outlier-detection step below.
outliers_cols = [
    "test_amh_r",
    "patient_age",
    "cause_pco",
    "ds1_pech_licz_10_pon",
    "prev_proc-cumulus_denuded",
    "prev_proc-day_0_mii",
    "day_0_mii",
]

In [9]:
# Keep only the rows where both test_amh_r and day_0_mii are present; copy so
# the column rewrite below does not touch data_900_df.
has_amh_and_mii = data_900_df["test_amh_r"].notna() & data_900_df["day_0_mii"].notna()
data_900_modelling = data_900_df.loc[has_amh_and_mii].copy()
# Swap the categorical cause_pco column for its integer category codes.
data_900_modelling["cause_pco"] = data_900_modelling["cause_pco"].cat.codes

# Hard rules - remove outliers

In [10]:
# Hard exclusion rules, applied one after another. Each mask is built from the
# frame as it stands after the previous filter, so the order of the calls matters.
EXCLUDED_PROCESS_TYPES = ["DAWKJ", "BIOKJ", "DD", "DS"]
DS1_3_DAILY_DOSE_UPPER_BOUND = 1250  # exclusive; rows must be strictly below
TEST_AMH_R_UPPER_BOUND = 15.0  # exclusive; rows must be strictly below

data_900_modelling = filter_data(
    data_900_modelling,
    ~data_900_modelling["process_type"].isin(EXCLUDED_PROCESS_TYPES),
)
# na=False makes a missing medication entry count as "does not contain Elonva";
# without it the match yields NaN for such rows and the `~` negation fails.
# NOTE(review): confirm that rows with a missing entry should indeed be kept.
data_900_modelling = filter_data(
    data_900_modelling,
    ~data_900_modelling["lek_Gonadotropiny"].str.contains("Elonva", na=False),
)
data_900_modelling = filter_data(
    data_900_modelling,
    data_900_modelling["ds1_3_dawka_dzienna"] < DS1_3_DAILY_DOSE_UPPER_BOUND,
)
data_900_modelling = filter_data(
    data_900_modelling,
    data_900_modelling["test_amh_r"] < TEST_AMH_R_UPPER_BOUND,
)
print(data_900_modelling.shape)
# Renumber rows 0..n-1; reassignment instead of inplace=True keeps the step explicit.
data_900_modelling = data_900_modelling.reset_index(drop=True)

(516, 1618)


In [11]:
# Largest day_0_mii value left after the hard-rule filters.
data_900_modelling["day_0_mii"].max()

27.0

# Isolation Forest

In [12]:
# Feature matrix for the outlier model: the candidate columns, minus every
# column that contains at least one missing value.
# NOTE(review): dropna on the column axis removes whole features, not rows, so
# the model may be fitted on fewer columns than outliers_cols lists. Confirm
# this is intended.
model_data = data_900_modelling[outliers_cols].dropna(axis="columns").copy()
# Fixed random_state keeps the fitted forest reproducible between runs.
iforest_model = IsolationForest(
    n_estimators=100,
    max_samples=256,
    random_state=2,
).fit(model_data)
# Negate score_samples so that a larger value means a more anomalous row.
score_iforest_model = -iforest_model.score_samples(model_data)

In [13]:
import plotly.graph_objects as go

In [14]:
# One marker per row of model_data, placed at its anomaly score; the marker
# colour repeats the score so high-scoring rows stand out.
fig = go.Figure(
    data=go.Scatter(
        y=score_iforest_model,
        mode="markers",
        marker=dict(
            size=16,
            color=score_iforest_model,
            colorscale="Viridis",
            showscale=True,
        ),
    )
)
# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title="Isolation Forest anomaly score per observation",
    xaxis_title="Row position in model_data",
    yaxis_title="Anomaly score (-score_samples)",
)

fig.show()

In [15]:
# Rows whose anomaly score is above the 0.6 cut-off.
model_data[score_iforest_model > 0.6]

Unnamed: 0,test_amh_r,patient_age,cause_pco,day_0_mii
70,9.6,30,1,18.0
71,12.15,30,1,24.0
72,12.15,34,1,25.0
98,5.84,30,1,18.0
99,8.69,32,1,14.0
100,8.69,33,1,18.0
101,8.69,35,1,8.0
102,8.69,35,1,17.0
103,8.69,36,1,15.0
117,8.33,31,0,25.0


In [16]:
# Number of rows whose anomaly score is above the 0.6 cut-off.
(score_iforest_model > 0.6).sum()

25

We decided to run the experiments on two datasets: one with the observations whose Isolation Forest score exceeds 0.6 removed, and the full dataset.

After consulting gynecologists, our final conclusion was to keep these observations: "They are not typical, but certainly can be observed".