In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
from matplotlib import cm
from scipy.stats import skew
from statsmodels.nonparametric.smoothers_lowess import lowess

In [5]:
df_1 = pd.read_spss('../final_data/caspian1 data.sav',convert_categoricals=True)
df_3 = pd.read_spss('../final_data/CASPIAN III.sav',convert_categoricals=True)
df_4 = pd.read_spss('../final_data/caspian4-ghorbani.sav',convert_categoricals=True)
df_5 = pd.read_spss('../final_data/caspian5-ghorbani.sav',convert_categoricals=True)


In [6]:
def rename_features(df1,caspian_number):
    df2 = df1.copy()
    if caspian_number==1:
        df2.rename(columns={'univer': 'university','district':'region', 'schoolty':'schoolType'}, inplace=True)    
    elif caspian_number==3:
        df2.rename(columns={'area':'region', 'heighte':'height','weighte':'weight'}, inplace=True)    
    elif caspian_number==4:
        df2.rename(columns={'weight_1': 'weight', 'height_2': 'height', 'universi': 'university','waist_3':'waist','hip_4':'hip','wrist_5':'wrist'}, inplace=True)        
    elif caspian_number==5:
        df2.rename(columns={'weight_1': 'weight', 'height_2': 'height', 'universi': 'university','ap_9':'schoolType','waist_3':'waist','wrist4':'wrist'}, inplace=True)


    return df2

In [7]:
df_1=rename_features(df_1,1)
df_3=rename_features(df_3,3)
df_4=rename_features(df_4,4)
df_5=rename_features(df_5,5)
#add hip column to caspain 3
df_3['hip'] = np.nan

In [8]:
df_1['sex'] = df_1['sex'].apply(lambda x: 'Girl' if x == 'Female' else 'Boy' if x == 'Male' else x)
df_3['sex'] = df_3['sex'].apply(lambda x: 'Girl' if x == 'male' else 'Boy' if x == 'female' else x)
df_4['sex'] = df_4['sex'].apply(lambda x: 'Girl' if x == 'girl' else 'Boy' if x == 'boy' else x)
df_5['sex'] = df_5['sex'].apply(lambda x: 'Girl' if x == 'girl' else 'Boy' if x == 'boy' else x)



In [9]:
def preprocess(dataframes_dict):
    processed_dfs = {}  # Dictionary to store processed DataFrames
    for name, df_org in dataframes_dict.items():
        df = df_org.copy()

        # Filter age
        df = df[(df["age"] >= 7) & (df["age"] <= 18)]
        # please change the type of heught_1 and weight_1 in caspian4 to numeric if you can't do it directly uncomment two line below
        df['height'] = pd.to_numeric(df['height'], errors='coerce')
        df['weight'] = pd.to_numeric(df['weight'], errors='coerce')
        df["bmi1"] = df["weight"] / ((df["height"] / 100) ** 2)
        
        # Remove null tuples
        records_with_nulls = df[
            df[["sex"]].isna().any(axis=1)
        ]
        df = df.dropna(subset=["sex"])
        print(
            f"Number of records with NaN value in sex in {name}: {len(records_with_nulls)}"
        )

        # Store the processed DataFrame in the new dictionary
        processed_dfs[name] = df

    return processed_dfs

df_dict = {'casp1' :df_1 ,'casp3': df_3, 'casp4': df_4,'casp5':df_5}
processed_dfs = preprocess(df_dict)


Number of records with NaN value in sex in casp1: 1
Number of records with NaN value in sex in casp3: 0
Number of records with NaN value in sex in casp4: 0
Number of records with NaN value in sex in casp5: 0


In [10]:
def calculate_z_score(value, L, M, S):
    """
    Calculate the z-score using the LMS method.

    Parameters:
    - value: Observed value (e.g., height, weight, BMI).
    - L: Lambda (skewness parameter).
    - M: Mu (median or central tendency).
    - S: Sigma (coefficient of variation).

    Returns:
    - z_score: Standardized z-score.
    """
    if L == 0:
        z_score = (value / M - 1) / S
    else:
        z_score = (np.power(value / M, L) - 1) / (L * S)
    return z_score

def apply_z_score(group, params,feature):
    # Merge params (Lambda, Median, Sigma) with the group
    group = group.merge(params, on='sex', how='left')
    
    # Apply the z-score calculation row-wise within each group
    group['Z-Score'] = group.apply(
        lambda row: calculate_z_score(row[f'{feature}'], L=row['Lambda'], M=row['Median'], S=row['Sigma']), axis=1
    )
    return group

def calculate_params(group, feature):
    # Calculate Median (M)
    M = group[f'{feature}'].median()

    # Calculate Lambda (L) - Skewness
    L = skew(group[f'{feature}'])

    # Calculate Sigma (S) - Coefficient of Variation (std / mean)
    mean = group[f'{feature}'].mean()
    std_dev = group[f'{feature}'].std()
    S = std_dev / mean if mean != 0 else None  # Avoid division by zero

    return pd.Series({'Lambda': L, 'Median': M, 'Sigma': S})

In [12]:
# Group by Age and Sex, then calculate summary statistics (mean, for example)
for name_df, df_org in processed_dfs.items():
    grouped = df_org.groupby(['age', 'sex'])

    percentile_data = []
    percentile = [3,10,15,25,50,75,85,90,97]
    percentiles_labels = ['P3','P10','P15','P25','P50','P75','P85','P90', 'P97']
    # Now, apply this to each group, passing the params DataFrame
    frac = 0.7
    features = ['height','weight','waist','hip','bmi1']

    #Loop through each feature to calculate the percentiles
    for feature in features:
        params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
        grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))
        ages = sorted(grouped_with_z_score['age_x'].unique())
        sexes = grouped_with_z_score['sex'].unique()
        print(name_df, feature)
        for sex in sexes:
            skip_sex = False  # Add a flag to track whether to skip further processing for this sex

            for age in ages:
                feature_data = grouped_with_z_score[
                    (grouped_with_z_score['age_x'] == age) & (grouped_with_z_score['sex'] == sex)
                ][f'{feature}']

                # Convert to numeric, coercing errors to NaN
                feature_data = pd.to_numeric(feature_data, errors='coerce')
                feature_data = feature_data.dropna()  # Remove NaN values

                # Check if there's enough data
                if feature_data.size == 0:
                    print(f"No valid data for sex: {sex}, age: {age}. Skipping further processing for this sex...")
                    skip_sex = True  # Set the flag to True to skip further processing for this sex
                    break  # Exit the age loop for the current sex

                # Calculate the desired percentiles
                percentiles_values = np.percentile(feature_data, percentile)

                # Collect data for this group
                percentile_data.append([sex, age] + percentiles_values.tolist())

            # If the flag is set, skip further processing for this sex
            if skip_sex:
                continue

            # Create a DataFrame from the collected percentile data
            percentile_columns = ['Gender', 'Age'] + percentiles_labels
            percentile_df = pd.DataFrame(percentile_data, columns=percentile_columns)

            # Create separate plots for boys and girls
            fig, ax = plt.subplots(figsize=(12, 6))

            sex_data = percentile_df[percentile_df['Gender'] == sex]
            for percentile_label in percentiles_labels:
                smoothed_percentile = lowess(sex_data[percentile_label], sex_data['Age'], frac=frac)
                ax.plot(smoothed_percentile[:, 0], smoothed_percentile[:, 1], linestyle='-', label=f'Smoothed {percentile_label}')

            # Title and labels
            name = f'{name_df}-{feature}-Percentiles by Age-{sex.capitalize()}-LMS Method'
            ax.set_title(f'{name}')
            ax.set_xlabel('Age')
            ax.set_ylabel(f'{feature}')
            ax.legend()
            plt.grid()
            plt.xticks(sex_data['Age'])
            plt.savefig(f'../final_chart/{name_df}-{feature}-{sex.capitalize()}.png')  # Save the plot with .png extension
            plt.close()

            # Clear data for the next iteration
            percentile_data.clear()
            print('sure',len(percentile_data))


  grouped = df_org.groupby(['age', 'sex'])
  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp1 height
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp1 weight
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp1 waist
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp1 hip
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp1 bmi1
sure 0
sure 0


  grouped = df_org.groupby(['age', 'sex'])
  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp3 height
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp3 weight
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp3 waist
sure 0
sure 0


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

casp3 hip
No valid data for sex: Boy, age: 10.0. Skipping further processing for this sex...
No valid data for sex: Girl, age: 10.0. Skipping further processing for this sex...


  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp3 bmi1
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp4 height
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp4 weight
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp4 waist
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp4 hip
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp4 bmi1
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp5 height
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp5 weight
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp5 waist
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp5 hip
sure 0
sure 0


  params = grouped.apply(lambda group: calculate_params(group, feature=feature)).reset_index()
  grouped_with_z_score = grouped.apply(lambda group: apply_z_score(group, params,feature))


casp5 bmi1
sure 0
sure 0
