In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import curve_fit
import math

In [None]:
# Load the WS table and discard its 'downloaded' column in one chained expression
df_WS = pd.read_csv(
    '~/Desktop/LingPredict data/wordbank_data_WS_Produces_es_MX.csv'
).drop(columns=['downloaded'])
display(df_WS.head())
print(df_WS.shape)

# Same treatment for the WG table
df_WG = pd.read_csv(
    '~/Desktop/LingPredict data/wordbank_data_WG_Produces_es_MX.csv'
).drop(columns=['downloaded'])
display(df_WG.head())
df_WG.shape

In [None]:
# Make all item definitions lowercase for consistency (applied to both tables)
for inventory_df in (df_WS, df_WG):
    inventory_df['item_definition'] = inventory_df['item_definition'].str.lower()

In [None]:
# Not all words in WG are in WS. Check which ones are missing.
print(df_WS.shape, df_WG.shape)

# True for every WG row whose item_definition also appears in WS
mask = df_WG['item_definition'].isin(df_WS['item_definition'])
print(mask.all())

missing = df_WG.loc[~mask]
print(f"Number of WG words not in WS: {(len(missing))}")
missing

In [None]:
# Look up WS rows whose item_definition matches a handful of word fragments
ws_definitions = df_WS['item_definition']

display(df_WS[ws_definitions.str.contains('ver', na=False)])
# 'te' is matched as a prefix only, unlike the substring searches around it
display(df_WS[ws_definitions.str.startswith('te', na=False)])
for fragment in ('qué', 'mano', 'cochera', 'bolsa', 'uno'):
    display(df_WS[ws_definitions.str.contains(fragment, na=False)])

In [None]:
# Let's try a 2-parameter sigmoid, fixing L=1, instead
def sigmoid(age, k, x0):
    """
    Two-parameter logistic curve with its upper asymptote fixed at 1.

    age: Age value(s) at which the curve is evaluated.
    k: Growth rate.
    x0: Inflection point / Median AoA (the curve equals 0.5 at age == x0).
    """
    exponent = -k * (age - x0)
    denominator = 1 + np.exp(exponent)
    return 1 / denominator

In [None]:
# --- 2. Helper Functions (Reused from previous steps) ---

def row_to_df_for_fit(row_data):
    """
    Transforms a single wide-format row into a clean long-format DataFrame.

    row_data: a pandas Series (one row) or a DataFrame. Every column other
        than 'item_id', 'item_definition' and 'category' is treated as an age
        column, so its label must be convertible to int.

    Returns a DataFrame with columns 'Age' (int) and 'Proportion Acquired'
    (float), sorted by 'Age', with missing proportions removed and a fresh
    0..n-1 index.
    """
    if isinstance(row_data, pd.Series):
        df_row = row_data.to_frame().T
    else:
        df_row = row_data

    EXCLUDE_COLS = ['item_id', 'item_definition', 'category']
    age_cols = df_row.columns.difference(EXCLUDE_COLS)

    row_df = pd.melt(
        df_row[age_cols],
        value_vars=age_cols,
        var_name='Age',
        value_name='Proportion Acquired'
    )

    # Work on an explicit copy so the column assignments below never touch a view
    row_df = row_df.dropna(subset=['Proportion Acquired']).copy()
    row_df['Age'] = row_df['Age'].astype(int)
    # A Series row that mixes ids, strings and numbers transposes to an
    # object-dtype frame; cast so the proportions are real floats downstream.
    row_df['Proportion Acquired'] = row_df['Proportion Acquired'].astype(float)

    return row_df.sort_values(by='Age').reset_index(drop=True)

def calculate_sigmoid_params(df_combined):
    """
    Fits the sigmoid curve to the combined long-format data.

    df_combined: DataFrame with 'Age' and 'Proportion Acquired' columns.

    Returns a Series with 'Growth Rate' and 'Median AoA'. Both entries are NaN
    when there are fewer observations than the two fitted parameters, when the
    data contain non-finite values, or when the optimizer does not converge.
    """
    failed_fit = pd.Series({'Growth Rate': np.nan, 'Median AoA': np.nan})

    X = df_combined['Age'].to_numpy(dtype=float)
    Y = df_combined['Proportion Acquired'].to_numpy(dtype=float)

    # Two parameters need at least two points; curve_fit raises (rather than
    # returning) on empty or under-determined input, so bail out up front.
    if X.size < 2:
        return failed_fit

    # Start from a moderate slope with the inflection at the mean observed age
    p0 = [0.5, X.mean()]

    try:
        popt, _ = curve_fit(sigmoid, X, Y, p0=p0, maxfev=5000)
    except (RuntimeError, ValueError):
        # RuntimeError: no convergence within maxfev; ValueError: NaN/inf in the data
        return failed_fit

    return pd.Series({'Growth Rate': popt[0], 'Median AoA': popt[1]})

# --- 3. The New Plotting Function (Refactored to use an ax object) ---

def plot_acquisition_curve(ax, word, df_data, k_fit, x0_fit):
    """
    Draw one word's acquisition panel on the supplied Axes.

    ax: Matplotlib Axes to draw on.
    word: Text used as the panel title.
    df_data: Long-format frame with 'Age', 'Proportion Acquired' and
        'Inventory' columns (the raw points).
    k_fit, x0_fit: Fitted sigmoid parameters (growth rate and median AoA).

    The panel shows the raw points coloured by inventory, the fitted sigmoid,
    a vertical line at x0_fit and a horizontal line at 0.5. Any legend created
    on the Axes is removed before returning.
    """
    ages = df_data['Age']
    youngest, oldest = ages.min(), ages.max()

    # Raw observations: blue for WS, orange for WG; small markers suit a dense grid
    sns.scatterplot(
        data=df_data,
        x='Age',
        y='Proportion Acquired',
        hue='Inventory',
        palette={'WS': 'blue', 'WG': 'orange'},
        s=40,
        edgecolor='black',
        alpha=0.7,
        zorder=3,
        ax=ax,
    )

    # Fitted curve, evaluated 5 months beyond the observed ages on either side
    curve_ages = np.linspace(youngest - 5, oldest + 5, 100)
    ax.plot(
        curve_ages,
        sigmoid(curve_ages, k_fit, x0_fit),
        color='green',
        linewidth=1.5,
        label=f'Fitted Curve (k={k_fit:.2f})',
    )

    # Both reference lines share the same dashed styling
    dashed = dict(linestyle='--', linewidth=1, alpha=0.7)
    ax.axvline(x=x0_fit, color='green', label=f'Median AoA ({x0_fit:.1f} mos)', **dashed)
    ax.axhline(y=0.5, color='red', label='50% Threshold', **dashed)

    # Titles, labels, limits and grid
    ax.set_title(f'{word}', fontsize=10)
    ax.set_xlabel('Age (Months)', fontsize=8)
    ax.set_ylabel('Prop. Acquired', fontsize=8)
    ax.set_ylim(0, 1.05)
    ax.set_xlim(youngest - 2, oldest + 2)
    ax.grid(axis='both', linestyle=':', alpha=0.5)

    # Per-panel legends would clutter the grid, so drop the one seaborn added
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    
# Create the fast lookup dictionary for df_WG:
# item_definition -> Series holding that row's remaining columns
wg_indexed = df_WG.set_index('item_definition')
wg_dict = wg_indexed.T.to_dict('series')

# --- 5. EXECUTION & PLOTTING LOOP (Updated for Grid Layout) ---

# Step A: Compute the Fits (refactored to return plot data explicitly)
def combined_logistic_regression(ws_row, wg_dict):
    """
    Fit one sigmoid to a word's WS data, pooled with its WG data when present.

    ws_row: One row of the WS table; must contain 'item_definition'.
    wg_dict: Mapping from item_definition to the matching WG row.

    Returns the Series produced by calculate_sigmoid_params, extended with a
    '__plot_data__' entry holding the long-format frame used for the fit
    (tagged with an 'Inventory' column of 'WS' / 'WG').
    """
    word = ws_row['item_definition']

    # WS data is always part of the fit, so tag it once up front
    ws_long = row_to_df_for_fit(ws_row)
    ws_long['Inventory'] = 'WS'

    if word not in wg_dict:
        pooled = ws_long
    else:
        wg_long = row_to_df_for_fit(wg_dict[word])
        wg_long['Inventory'] = 'WG'
        # WG rows first, then WS, with a fresh index
        pooled = pd.concat([wg_long, ws_long], ignore_index=True)

    fit = calculate_sigmoid_params(pooled)
    # Carry the plotted data along so .apply() hands it back with the parameters
    fit['__plot_data__'] = pooled
    return fit

# Run the fit and store results (results now includes all three keys)
results = df_WS.apply(combined_logistic_regression, axis=1, wg_dict=wg_dict)

# Initialize df_curve_fits and assign columns from the 'results' DataFrame.
# Each result row also carries a DataFrame, so the parameter columns may come
# back as object dtype; cast them to float for downstream numeric use.
df_curve_fits = df_WS[['item_definition']].copy()
df_curve_fits['Growth Rate'] = results['Growth Rate'].astype(float)
df_curve_fits['Median AoA'] = results['Median AoA'].astype(float)

# Assign the plot data directly from the collected 'results'
df_curve_fits['__plot_data__'] = results['__plot_data__']

print("--- Generated Plots (Displayed in a 6-Column Grid) ---")

# Step B: Setup Grid and Plot (only items whose fit produced parameters)
valid_fits = df_curve_fits[df_curve_fits['Growth Rate'].notna()]
num_plots = len(valid_fits)
COLS = 6  # Number of columns in the grid
# plt.subplots rejects a zero-row grid, so keep one row even if no fit succeeded
ROWS = max(1, math.ceil(num_plots / COLS))

# squeeze=False always yields a 2D array of Axes, so ravel() is safe for any grid shape
fig, axes = plt.subplots(ROWS, COLS, figsize=(COLS * 3.5, ROWS * 3), squeeze=False)
axes = axes.ravel()

# One panel per successfully fitted word, filled row by row
for ax, (_, row) in zip(axes, valid_fits.iterrows()):
    plot_acquisition_curve(
        ax,
        row['item_definition'],
        row['__plot_data__'],
        row['Growth Rate'],
        row['Median AoA'],
    )

# Hide any unused subplots
for ax in axes[num_plots:]:
    ax.axis('off')

# Add a title for the entire figure and adjust layout
fig.suptitle('Combined Acquisition Curve Fits', fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.98])  # Leave room at the top for the suptitle
plt.show()

if len(df_curve_fits) != len(valid_fits):
    print(f"\nSkipped {len(df_curve_fits) - len(valid_fits)} items due to failed curve fit (NaN parameters).")

In [None]:
# Let's redo the analysis dropping the outlier 30-month data point
# (df_WS itself is left untouched; the result is a new frame)
df_WS_dropped = df_WS.drop('30', axis='columns')
print(df_WS_dropped.shape)

In [None]:
# Create the fast lookup dictionary for df_WG:
# item_definition -> Series holding that row's remaining columns
wg_indexed = df_WG.set_index('item_definition')
wg_dict = wg_indexed.T.to_dict('series')

# --- 5. EXECUTION & PLOTTING LOOP (Updated for Grid Layout) ---

# Step A: Compute the Fits (refactored to return plot data explicitly)
# NOTE(review): a function with this same name is already defined in an earlier
# cell; this definition shadows it. Consider keeping a single definition.
def combined_logistic_regression(ws_row, wg_dict):
    """
    Fit one sigmoid to a word's WS data, pooled with its WG data when present.

    ws_row: One row of the WS table; must contain 'item_definition'.
    wg_dict: Mapping from item_definition to the matching WG row.

    Returns the Series produced by calculate_sigmoid_params, extended with a
    '__plot_data__' entry holding the long-format frame used for the fit
    (tagged with an 'Inventory' column of 'WS' / 'WG').
    """
    word = ws_row['item_definition']

    # WS data is always part of the fit, so tag it once up front
    ws_long = row_to_df_for_fit(ws_row)
    ws_long['Inventory'] = 'WS'

    if word not in wg_dict:
        pooled = ws_long
    else:
        wg_long = row_to_df_for_fit(wg_dict[word])
        wg_long['Inventory'] = 'WG'
        # WG rows first, then WS, with a fresh index
        pooled = pd.concat([wg_long, ws_long], ignore_index=True)

    fit = calculate_sigmoid_params(pooled)
    # Carry the plotted data along so .apply() hands it back with the parameters
    fit['__plot_data__'] = pooled
    return fit

# Run the fit and store results (results now includes all three keys)
results = df_WS_dropped.apply(combined_logistic_regression, axis=1, wg_dict=wg_dict)

# Initialize df_curve_fits_dropped and assign columns from the 'results' DataFrame.
# Each result row also carries a DataFrame, so the parameter columns may come
# back as object dtype; cast them to float for downstream numeric use.
df_curve_fits_dropped = df_WS_dropped[['item_definition']].copy()
df_curve_fits_dropped['Growth Rate'] = results['Growth Rate'].astype(float)
df_curve_fits_dropped['Median AoA'] = results['Median AoA'].astype(float)

# Assign the plot data directly from the collected 'results'
df_curve_fits_dropped['__plot_data__'] = results['__plot_data__']

print("--- Generated Plots (Displayed in a 6-Column Grid) ---")

# Step B: Setup Grid and Plot (only items whose fit produced parameters)
valid_fits = df_curve_fits_dropped[df_curve_fits_dropped['Growth Rate'].notna()]
num_plots = len(valid_fits)
COLS = 6  # Number of columns in the grid
# plt.subplots rejects a zero-row grid, so keep one row even if no fit succeeded
ROWS = max(1, math.ceil(num_plots / COLS))

# squeeze=False always yields a 2D array of Axes, so ravel() is safe for any grid shape
fig, axes = plt.subplots(ROWS, COLS, figsize=(COLS * 3.5, ROWS * 3), squeeze=False)
axes = axes.ravel()

# One panel per successfully fitted word, filled row by row
for ax, (_, row) in zip(axes, valid_fits.iterrows()):
    plot_acquisition_curve(
        ax,
        row['item_definition'],
        row['__plot_data__'],
        row['Growth Rate'],
        row['Median AoA'],
    )

# Hide any unused subplots
for ax in axes[num_plots:]:
    ax.axis('off')

# Add a title for the entire figure and adjust layout
fig.suptitle('Combined Acquisition Curve Fits', fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.98])  # Leave room at the top for the suptitle
plt.show()

if len(df_curve_fits_dropped) != len(valid_fits):
    print(f"\nSkipped {len(df_curve_fits_dropped) - len(valid_fits)} items due to failed curve fit (NaN parameters).")


In [None]:
# Preview the first five rows of the WG table
df_WG.head(n=5)

In [None]:
# mamá and papá are missing from the WG table;
# instead they are listed there as mamá/mami and papá/papi.
# TODO: check for other rows with '/' in the item_definition
for frame, label in ((df_WG, 'papá'), (df_WS, 'papá'), (df_WG, 'papá/papi')):
    display(frame[frame['item_definition'] == label])

In [None]:
# Rows whose item_definition contains a '/' in each table
has_slash_WG = df_WG['item_definition'].str.contains('/', na=False)
slashes_WG = df_WG[has_slash_WG]
display(slashes_WG.head())
print(slashes_WG.shape)

has_slash_WS = df_WS['item_definition'].str.contains('/', na=False)
slashes_WS = df_WS[has_slash_WS]
display(slashes_WS.head())
print(slashes_WS.shape)

In [None]:
# WS rows whose item_definition contains 'mam' or 'pap'
for stem in ('mam', 'pap'):
    display(df_WS[df_WS['item_definition'].str.contains(stem, na=False)])


In [None]:
# Load the uni_lemma tables for WS and WG
uni_lemmas_ws_path = '~/Desktop/LingPredict data/uni_lemmas_WS_es_MX.csv'
uni_lemmas_WS = pd.read_csv(uni_lemmas_ws_path)

uni_lemmas_wg_path = '~/Desktop/LingPredict data/uni_lemmas_WG_es_MX.csv'
uni_lemmas_WG = pd.read_csv(uni_lemmas_wg_path)

In [None]:
# Preview both uni_lemma tables, WS first
for lemma_table in (uni_lemmas_WS, uni_lemmas_WG):
    display(lemma_table.head())

In [None]:
# Show every row equal to 'no' in the given column of each table
for frame, column in (
    (uni_lemmas_WG, 'uni_lemma'),
    (uni_lemmas_WS, 'uni_lemma'),
    (uni_lemmas_WS, 'item_definition'),
    (df_WS, 'item_definition'),
):
    display(frame[frame[column] == 'no'])