In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

### Results analysis

In [None]:
df = pd.read_csv('../data/results/depressiontestresults.csv')

# Columns kept for the report: the predictor name, per-class F1, overall
# accuracy, and the fairness metrics.
metric_columns = ['predictor','depressionTotalF1','normalTotalF1','accuracyTotal','statisticalParity','equalOpportunity','equalisedOdds','equalAccuracy']
# Tokenizer variants that appear as columns of the LaTeX table.
tokenizers = ['blingfire','nltk','spacysm','spacylg','spacytrf']

# Table for SVM tokenizer Performance and Fairness Metrics
SVMTKMetrics = df[metric_columns]

# Written before the tokenizer filter below, so this CSV holds every predictor
# found in the results file.
SVMTKMetrics.to_csv('SVMTKMetricsSVM.csv')

# Keep only the tokenizer rows, then transpose so that each tokenizer becomes a
# column and each metric a row. Chained (no inplace=True) so re-running the
# cell never mutates a filtered slice of an earlier frame.
SVMTKMetrics = (
    SVMTKMetrics[SVMTKMetrics['predictor'].isin(tokenizers)]
    .set_index('predictor')
    .T
)

latex = SVMTKMetrics.to_latex(
    caption='Tokenizer Performance And Fairness Metrics',
    index=True,
    header=True,
    bold_rows=True,
    # One left-aligned column for the metric names plus one centred column per
    # tokenizer actually present; a fixed spec goes stale when the number of
    # tokenizers in the results file changes.
    column_format='l' + 'c' * SVMTKMetrics.shape[1],
    escape=False
)
print(latex)
display(SVMTKMetrics)

### Symptom result creation

In [5]:
# Available vector sets:
# options = ['blingfire','reweightedblingfire','nltk','reweightednltk','spacysm','reweightedspacysm','spacylg','reweightedspacylg','spacytrf','reweightedspacytrf']
options = ['Testspacysm']
symps = [
    "Anxious_Mood", "Autonomic_symptoms", "Cardiovascular_symptoms",
    "Catatonic_behavior", "Decreased_energy_tiredness_fatigue",
    "Depressed_Mood", "Gastrointestinal_symptoms", "Genitourinary_symptoms",
    "Hyperactivity_agitation", "Impulsivity", "Inattention", "Indecisiveness",
    "Respiratory_symptoms", "Suicidal_ideas", "Worthlessness_and_guilty",
    "avoidance_of_stimuli", "compensatory_behaviors_to_prevent_weight_gain",
    "compulsions", "diminished_emotional_expression",
    "do_things_easily_get_painful_consequences",
    "drastical_shift_in_mood_and_energy", "fear_about_social_situations",
    "fear_of_gaining_weight", "fears_of_being_negatively_evaluated",
    "flight_of_ideas", "intrusion_symptoms", "loss_of_interest_or_motivation",
    "more_talktive", "obsession", "panic_fear", "pessimism", "poor_memory",
    "sleep_disturbance", "somatic_muscle", "somatic_symptoms_others",
    "somatic_symptoms_sensory", "weight_and_appetite_change",
    "Anger_Irritability",
]
label = 'depression'


def _symptom_stats(vectors, suffix):
    """Return per-column descriptive statistics of ``vectors``.

    Parameters
    ----------
    vectors : pd.DataFrame
        One column per symptom, one row per sample.
    suffix : str
        Tag appended to every statistic name (``'M'`` or ``'F'`` below), so
        the male and female tables can later be merged without name clashes.

    Returns
    -------
    pd.DataFrame
        Indexed by the columns of ``vectors``, with one column per statistic
        (``Mean_<suffix>``, ``Std_<suffix>``, ...).
    """
    return pd.DataFrame({
        f'Mean_{suffix}': vectors.mean(),
        f'Std_{suffix}': vectors.std(),
        f'Median_{suffix}': vectors.median(),
        f'Maximum_{suffix}': vectors.max(),
        f'Minimum_{suffix}': vectors.min(),
        f'25th Percentile_{suffix}': vectors.quantile(0.25),
        f'75th Percentile_{suffix}': vectors.quantile(0.75),
        f'Variance_{suffix}': vectors.var(),
    })


for name in options:
    df = pd.read_json(f'../data/vectorData/{name}Vectors.json',orient='records', lines=True)
    # Split the rows carrying the selected label by gender.
    dfm = df.loc[(df['label'] == label) & (df['gender'] == 'm')]
    dff = df.loc[(df['label'] == label) & (df['gender'] == 'f')]
    vectorsm = dfm[symps]
    vectorsf = dff[symps]
    metricsm = _symptom_stats(vectorsm, 'M')
    metricsf = _symptom_stats(vectorsf, 'F')

# NOTE(review): the files are written once, after the loop, and their names do
# not contain `name`, so with several entries in `options` only the last
# entry's statistics are persisted.
metricsf.to_csv(f'{label}femaleSymptomMetrics.csv')
metricsm.to_csv(f'{label}maleSymptomMetrics.csv')


In [57]:
# Read back the per-gender statistics; the first CSV column is used as the index.
female_data = pd.read_csv(f'{label}femaleSymptomMetrics.csv', index_col=0)
male_data = pd.read_csv(f'{label}maleSymptomMetrics.csv', index_col=0)

# Align both tables on their index, keeping only entries present in both.
merged_data = pd.merge(
    female_data,
    male_data,
    left_index=True,
    right_index=True,
    how='inner',
)

# Despite the "_Diff" suffix these columns are female-to-male *ratios*
# (a value of 1.0 means both genders have the same statistic).
for stat in ('Mean', 'Std'):
    merged_data[f'{stat}_Diff'] = merged_data[f'{stat}_F'] / merged_data[f'{stat}_M']

# Column order of the summary: mean and std with their ratio first, then the
# remaining statistics as female/male pairs.
comparison_columns = ['Mean_F', 'Mean_M', 'Mean_Diff', 'Std_F', 'Std_M', 'Std_Diff']
for stat in ('Median', 'Maximum', 'Minimum', '25th Percentile', '75th Percentile', 'Variance'):
    comparison_columns += [f'{stat}_F', f'{stat}_M']

# Summary frame used for the side-by-side comparison, also saved to disk.
summary_comparison = merged_data.loc[:, comparison_columns]

summary_comparison.to_csv(f'{label}comparison.csv')

### Top 10 symptom metrics

In [None]:
# Creating ranked comparison: order the symptoms by descending mean score,
# once per gender. sort_values returns new frames, so adding the ranking
# columns below does not touch summary_comparison.
sorted_valuesF = summary_comparison.sort_values(by='Mean_F', ascending=False)
sorted_valuesM = summary_comparison.sort_values(by='Mean_M', ascending=False)

# Rank 1 = highest mean. The length comes from the data instead of a fixed
# 38, so the cell keeps working when the symptom list changes.
n_symptoms = len(summary_comparison)
sorted_valuesF['ranking_F'] = range(1, n_symptoms + 1)
sorted_valuesM['ranking_M'] = range(1, n_symptoms + 1)

# Attach the female rank to the male-ranked table. With how='right' the result
# should follow the key order of the female ranking, so head(10) below lists
# the ten symptoms with the highest female mean.
merged = sorted_valuesM.merge(sorted_valuesF[['ranking_F']], left_index=True, right_index=True, how='right')
merged.to_csv('../data/results/depressedSymptomComparison.csv')

print(merged[['Mean_F','Mean_M','Mean_Diff','Std_F', 'Std_M','Std_Diff']].head(10).to_latex(caption='Top 10 symptom metrics'))

### Top 10 depression symptom distribution graph

In [None]:
# Symptoms shown in the figure.
symps = [
    "Anxious_Mood",
    "Depressed_Mood",
    "Hyperactivity_agitation",
    "Worthlessness_and_guilty",
    "diminished_emotional_expression",
    "drastical_shift_in_mood_and_energy",
    "panic_fear",
    "pessimism",
    "sleep_disturbance",
    "Anger_Irritability",
]

# Load the symptom vectors and keep only the rows labelled 'depression'.
df = pd.read_json('../data/vectorData/TestspacysmVectors.json', orient='records', lines=True)
df = df[df['label'] == 'depression']

# Wide -> long: one row per (sample, symptom) pair, as px.box expects.
df = df.melt(
    id_vars=['video_id', 'label', 'key', 'gender', 'duration', 'channelId', 'text'],
    value_vars=symps,
    var_name='Symptom',
    value_name='Score',
)

# Fixed x-axis order, computed once and shared by the figure and its layout.
symptom_order = sorted(df['Symptom'].unique())

fig = px.box(
    df,
    x='Symptom',
    y='Score',
    color='gender',
    title='Symptom Distributions by Gender',
    category_orders={"Symptom": symptom_order},
    color_discrete_map={'m': 'blue', 'f': 'red'},
)

fig.update_layout(
    xaxis=dict(categoryorder='array', categoryarray=symptom_order),
    xaxis_title='Symptom',
    yaxis_title='Score',
    boxmode='group',  # male/female boxes side by side for each symptom
    height=1000,
    width=1800,
)
fig.update_traces(quartilemethod="inclusive")

# Render the figure
fig.show()

### Adjusted symptom differences

In [None]:
df = pd.read_csv('../data/results/depressedSymptomComparison.csv')

# Express every Mean_Diff value as a percentage deviation from the average
# Mean_Diff over all rows (0 = exactly average, +10 = 10 % above average).
mean_diff = df['Mean_Diff']
df['Adjusted_Mean_Diff'] = (mean_diff / mean_diff.mean() * 100) - 100

# 'Unnamed: 0' is the unlabeled first CSV column as named by read_csv.
shown_columns = ['Unnamed: 0', 'Mean_F', 'Mean_M', 'Mean_Diff', 'Adjusted_Mean_Diff']
display(df.loc[:, shown_columns])

### D-vlog dataset distribution

In [11]:
df = pd.read_json('../data/splitData/postdataLinesSplit.json', orient='records', lines=True)

# Only the two grouping columns are needed for the count table.
df = df.loc[:, ['gender', 'label']]

# Count rows per (label, gender) combination; combinations that never occur
# are shown as 0.
overview = df.pivot_table(index='label', columns='gender', aggfunc=len, fill_value=0)

# Add the margins: first a 'Total' column (sum per label), then a 'Total' row
# (sum per gender, which also yields the grand total in the corner cell).
row_totals = overview.sum(axis=1)
overview['Total'] = row_totals
column_totals = overview.sum()
overview.loc['Total'] = column_totals

latex_table = overview.to_latex(caption='D-vlog dataset distribution')
print(latex_table)


\begin{table}
\caption{D-vlog dataset distribution}
\begin{tabular}{lrrr}
\toprule
gender & f & m & Total \\
label &  &  &  \\
\midrule
depression & 295 & 139 & 434 \\
normal & 237 & 126 & 363 \\
Total & 532 & 265 & 797 \\
\bottomrule
\end{tabular}
\end{table}

