In [None]:
import sys
from pathlib import Path

# 1. Define the current notebook's directory.
#    Path().resolve() is really the kernel's working directory; it equals the
#    notebook's folder only when the kernel is started there.
NOTEBOOK_DIR = Path().resolve()

# 2. Define the Project Root: the nearest directory at or above NOTEBOOK_DIR
#    that contains the `src` package imported by the following cell. Searching
#    upward keeps those imports working when the kernel starts somewhere other
#    than 'notebooks/eda' (e.g. at the repository root). If no `src` directory
#    is found, fall back to the original assumption of two levels up.
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (NOTEBOOK_DIR, *NOTEBOOK_DIR.parents)
        if (candidate / 'src').is_dir()
    ),
    NOTEBOOK_DIR.parent.parent,
)

# 3. Add the Project Root to Python's search path (only once, so re-running
#    this cell does not append duplicates).
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))
    print(f"Added project root to path: {PROJECT_ROOT}")

In [None]:
from src.data.load_wordbank_data import read_and_clean_wordbank_data
from src.data.wordbank_logistic_fits import compute_curve_fits, plot_curve_fits, compute_and_export_curve_fits

In [None]:
# Every raw Wordbank CSV used below sits in the same folder under the project root.
WORDBANK_RAW_DIR = PROJECT_ROOT.joinpath('data', 'raw', 'Wordbank')

# WS inventory: 'Produces' data and its uni-lemma table
DATA_PATH_WS_PRODUCES_EN_US = WORDBANK_RAW_DIR / 'wordbank_data_WS_Produces_en_US.csv'
UNILEMMA_PATH_WS_EN_US = WORDBANK_RAW_DIR / 'uni_lemmas_WS_en_US.csv'

# WG inventory: 'Produces' data and its uni-lemma table
DATA_PATH_WG_PRODUCES_EN_US = WORDBANK_RAW_DIR / 'wordbank_data_WG_Produces_en_US.csv'
UNILEMMA_PATH_WG_EN_US = WORDBANK_RAW_DIR / 'uni_lemmas_WG_en_US.csv'

# WG inventory: 'Understands' data (loaded with the WG uni-lemma table below)
DATA_PATH_WG_UNDERSTANDS_EN_US = WORDBANK_RAW_DIR / 'wordbank_data_WG_Understands_en_US.csv'

# Spec handed to the loader's `items_to_split`: the old token 'inside/in'
# together with the replacement tokens and lemmas 'inside' and 'in'.
split_dict_WS = {
    'old_token': 'inside/in',
    'new_tokens': ['inside', 'in'],
    'new_lemmas': ['inside', 'in'],
}

# WS / Produces
df_WS = read_and_clean_wordbank_data(
    DATA_PATH_WS_PRODUCES_EN_US,
    UNILEMMA_PATH_WS_EN_US,
    lang='en',
    inventory='WS',
    measure='Produces',
    items_to_split=[split_dict_WS],
    cols_to_drop=['28'],  # drop 28 month outlier
)

# WG / Produces
df_WG = read_and_clean_wordbank_data(
    DATA_PATH_WG_PRODUCES_EN_US,
    UNILEMMA_PATH_WG_EN_US,
    lang='en',
    inventory='WG',
    measure='Produces',
)

# WG / Understands
df_WG_U = read_and_clean_wordbank_data(
    DATA_PATH_WG_UNDERSTANDS_EN_US,
    UNILEMMA_PATH_WG_EN_US,
    lang='en',
    inventory='WG',
    measure='Understands',
)

In [None]:
# Sanity check on each loaded frame: print its dimensions, then show the first rows.
for frame in (df_WS, df_WG, df_WG_U):
    print(frame.shape)
    display(frame.head())

In [None]:
# Run the curve fitting on both 'Produces' frames (WS and WG) in one call.
# The call yields two objects, unpacked here as the fits and the "ambiguous" frame.
dfs = [df_WS, df_WG]
(df_curve_fits_produces, df_ambiguous_produces) = compute_curve_fits(dfs)

In [None]:
# Plot the fits computed from the 'Produces' frames; no `colors` argument is
# passed, so whatever default the plotting helper defines is used.
plot_curve_fits(df_curve_fits_produces)

In [None]:
# Same fitting step for the single 'Understands' frame (WG only).
dfs_U = [df_WG_U]
(df_curve_fits_understands, df_ambiguous_understands) = compute_curve_fits(dfs_U)

In [None]:
# Plot the 'Understands' fits, overriding the colors with a single hex color.
plot_curve_fits(
    df_curve_fits_understands,
    colors=['#9e1cd6'],
)

In [None]:
# The 20 'Produces' rows with the largest median_aoa, restricted to the key columns.
(
    df_curve_fits_produces
    .sort_values(by='median_aoa', ascending=False)
    [['uni_lemma', 'token', 'token_clean', 'growth_rate', 'median_aoa']]
    .head(20)
)

In [None]:
# The 20 'Understands' rows with the largest median_aoa, restricted to the key columns.
(
    df_curve_fits_understands
    .sort_values(by='median_aoa', ascending=False)
    [['uni_lemma', 'token', 'token_clean', 'growth_rate', 'median_aoa']]
    .head(20)
)

In [None]:
# Export the fits for the 'Produces' frames (WS, WG) and the 'Understands' frame (WG).
# The '~' is expanded here instead of relying on the helper (or whatever it
# writes with) to do so; the path is still passed as a plain string, and it
# resolves to the same Desktop location, so the read-back cell keeps working.
compute_and_export_curve_fits(
    str(Path('~/Desktop/wordbank_en_logistic_fits.csv').expanduser()),
    [df_WS, df_WG],
    [df_WG_U],
)

In [None]:
import pandas as pd

# Read the exported CSV back from the same path used by the export cell and
# preview the first 25 rows as a check on what was written.
fits = pd.read_csv('~/Desktop/wordbank_en_logistic_fits.csv')
fits.head(25)