In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

### Results analysis

In [66]:
# Load the evaluation results; the `predictor` column identifies each row.
df = pd.read_csv('../data/results/depressiontestresults.csv')

# Table for SVM tokenizer Performance and Fairness Metrics
metric_columns = [
    'depressionTotalF1', 'normalTotalF1', 'accuracyTotal',
    'statisticalParity', 'equalOpportunity', 'equalisedOdds', 'equalAccuracy',
]
SVMTKMetrics = df[['predictor', *metric_columns]]

# Written before the tokenizer filter below, so this CSV keeps every predictor row.
SVMTKMetrics.to_csv('SVMTKMetricsSVM.csv')

# Keep only the tokenizer rows, then transpose: metrics as rows, tokenizers as columns.
# A non-mutating chain replaces `set_index(..., inplace=True)` on a filtered slice.
tokenizers = ['blingfire', 'nltk', 'spacysm', 'spacylg', 'spacytrf']
SVMTKMetrics = (
    SVMTKMetrics[SVMTKMetrics['predictor'].isin(tokenizers)]
    .set_index('predictor')
    .T
)

latex = SVMTKMetrics.to_latex(
    caption='Tokenizer Performance And Fairness Metrics',
    index=True,
    header=True,
    bold_rows=True,
    # One 'l' for the index column plus one 'c' per tokenizer actually present,
    # rather than a hard-coded spec that can drift from the real column count.
    column_format='l' + 'c' * SVMTKMetrics.shape[1],
    escape=False,
)
print(latex)
display(SVMTKMetrics)

\begin{table}
\caption{Tokenizer Performance And Fairness Metrics}
\begin{tabular}{lcccccc}
\toprule
predictor & blingfire & nltk & spacysm & spacylg & spacytrf \\
\midrule
\textbf{depressionTotalF1} & 0.663900 & 0.663900 & 0.886076 & 0.860759 & 0.784314 \\
\textbf{normalTotalF1} & 0.000000 & 0.000000 & 0.890244 & 0.865854 & 0.804734 \\
\textbf{accuracyTotal} & 0.496894 & 0.496894 & 0.888199 & 0.863354 & 0.795031 \\
\textbf{statisticalParity} & NaN & NaN & 0.760321 & 0.760321 & 0.834862 \\
\textbf{equalOpportunity} & NaN & NaN & 0.940690 & 0.897931 & 0.833793 \\
\textbf{equalisedOdds} & 1.000000 & 1.000000 & 0.982556 & 0.941332 & 0.832223 \\
\textbf{equalAccuracy} & 1.340323 & 1.340323 & 0.974429 & 0.933828 & 0.822175 \\
\bottomrule
\end{tabular}
\end{table}



predictor,blingfire,nltk,spacysm,spacylg,spacytrf
depressionTotalF1,0.6639,0.6639,0.886076,0.860759,0.784314
normalTotalF1,0.0,0.0,0.890244,0.865854,0.804734
accuracyTotal,0.496894,0.496894,0.888199,0.863354,0.795031
statisticalParity,,,0.760321,0.760321,0.834862
equalOpportunity,,,0.94069,0.897931,0.833793
equalisedOdds,1.0,1.0,0.982556,0.941332,0.832223
equalAccuracy,1.340323,1.340323,0.974429,0.933828,0.822175


### Symptom result creation

In [5]:
# options = ['blingfire','reweightedblingfire','nltk','reweightednltk','spacysm','reweightedspacysm','spacylg','reweightedspacylg','spacytrf','reweightedspacytrf']
options = ['Testspacysm']
symps = ["Anxious_Mood","Autonomic_symptoms","Cardiovascular_symptoms","Catatonic_behavior","Decreased_energy_tiredness_fatigue","Depressed_Mood","Gastrointestinal_symptoms","Genitourinary_symptoms","Hyperactivity_agitation","Impulsivity","Inattention","Indecisiveness","Respiratory_symptoms","Suicidal_ideas","Worthlessness_and_guilty","avoidance_of_stimuli","compensatory_behaviors_to_prevent_weight_gain","compulsions","diminished_emotional_expression","do_things_easily_get_painful_consequences","drastical_shift_in_mood_and_energy","fear_about_social_situations","fear_of_gaining_weight","fears_of_being_negatively_evaluated","flight_of_ideas","intrusion_symptoms","loss_of_interest_or_motivation","more_talktive","obsession","panic_fear","pessimism","poor_memory","sleep_disturbance","somatic_muscle","somatic_symptoms_others","somatic_symptoms_sensory","weight_and_appetite_change","Anger_Irritability"]
label = 'depression'


def summarise_symptom_vectors(vectors, suffix):
    """Return per-column descriptive statistics of ``vectors`` as a DataFrame.

    Each column of ``vectors`` becomes one row of the result. The result's
    columns are the statistics, each named ``<Statistic>_<suffix>``
    (for example ``Mean_M`` when ``suffix`` is ``'M'``).
    """
    return pd.DataFrame({
        f'Mean_{suffix}': vectors.mean(),
        f'Std_{suffix}': vectors.std(),
        f'Median_{suffix}': vectors.median(),
        f'Maximum_{suffix}': vectors.max(),
        f'Minimum_{suffix}': vectors.min(),
        f'25th Percentile_{suffix}': vectors.quantile(0.25),
        f'75th Percentile_{suffix}': vectors.quantile(0.75),
        f'Variance_{suffix}': vectors.var(),
    })


for name in options:
    df = pd.read_json(f'../data/vectorData/{name}Vectors.json',orient='records', lines=True)
    # Restrict to rows carrying the target label, then split by gender.
    has_label = df['label'] == label
    dfm = df.loc[has_label & (df['gender'] == 'm')]
    dff = df.loc[has_label & (df['gender'] == 'f')]
    vectorsm = dfm[symps]
    vectorsf = dff[symps]
    metricsm = summarise_symptom_vectors(vectorsm, 'M')
    metricsf = summarise_symptom_vectors(vectorsf, 'F')

# NOTE: the output file names do not include `name`, and the writes happen after
# the loop, so only the metrics of the last entry in `options` are persisted.
metricsf.to_csv(f'{label}femaleSymptomMetrics.csv')
metricsm.to_csv(f'{label}maleSymptomMetrics.csv')


In [57]:
# Read back the per-symptom statistics, using the first CSV column as the index.
female_data = pd.read_csv(f'{label}femaleSymptomMetrics.csv', index_col=0)
male_data = pd.read_csv(f'{label}maleSymptomMetrics.csv', index_col=0)

# Align female and male statistics on the shared index, then add the ratio columns.
# Despite the `_Diff` suffix, these columns hold female / male ratios, not differences.
merged_data = pd.merge(
    female_data,
    male_data,
    left_index=True,
    right_index=True,
    how='inner',
).assign(
    Mean_Diff=lambda stats: stats['Mean_F'] / stats['Mean_M'],
    Std_Diff=lambda stats: stats['Std_F'] / stats['Std_M'],
)

# Columns kept for the side-by-side comparison (percentiles and variance are left out).
comparison_columns = [
    'Mean_F', 'Mean_M', 'Mean_Diff',
    'Std_F', 'Std_M', 'Std_Diff',
    'Median_F', 'Median_M',
    'Maximum_F', 'Maximum_M',
    'Minimum_F', 'Minimum_M',
]

# Summary table restricted to the selected columns, saved for later use.
summary_comparison = merged_data[comparison_columns]

summary_comparison.to_csv(f'{label}comparison.csv')

In [64]:
# Creating ranked comparison
# Rank every symptom twice: by female mean and by male mean (rank 1 = highest mean).
# The rank range follows the table length, so it stays valid if the number of
# symptoms changes, instead of assuming exactly 38 rows.
symptom_ranks = range(1, len(summary_comparison) + 1)

sorted_valuesF = (
    summary_comparison
    .sort_values(by='Mean_F', ascending=False)
    .assign(ranking_F=symptom_ranks)
)
sorted_valuesM = (
    summary_comparison
    .sort_values(by='Mean_M', ascending=False)
    .assign(ranking_M=symptom_ranks)
)

# Attach the female ranking to the male-ranked table, joining on the index.
# With how='right' the printed rows come out in the female-ranked order,
# so head(10) below shows the ten symptoms with the highest female mean.
merged = sorted_valuesM.merge(sorted_valuesF[['ranking_F']], left_index=True, right_index=True, how='right')
merged.to_csv('../data/results/symptomResults.csv')

print(merged[['Mean_F','Mean_M','Mean_Diff','Std_F', 'Std_M','Std_Diff']].head(10).to_latex(caption='Top 10 symptom metrics'))

\begin{table}
\caption{Top 10 symptom metrics}
\begin{tabular}{lrrrrrr}
\toprule
 & Mean_F & Mean_M & Mean_Diff & Std_F & Std_M & Std_Diff \\
\midrule
Depressed_Mood & 0.867480 & 0.880432 & 0.985289 & 0.275714 & 0.254977 & 1.081330 \\
Anxious_Mood & 0.661535 & 0.562628 & 1.175795 & 0.356803 & 0.406744 & 0.877217 \\
diminished_emotional_expression & 0.539533 & 0.530451 & 1.017122 & 0.278682 & 0.267200 & 1.042969 \\
Worthlessness_and_guilty & 0.467131 & 0.420450 & 1.111025 & 0.281541 & 0.283774 & 0.992133 \\
pessimism & 0.346634 & 0.320280 & 1.082284 & 0.288199 & 0.275450 & 1.046282 \\
sleep_disturbance & 0.346096 & 0.293989 & 1.177238 & 0.366967 & 0.353470 & 1.038185 \\
weight_and_appetite_change & 0.340676 & 0.259231 & 1.314177 & 0.392320 & 0.367322 & 1.068056 \\
Anger_Irritability & 0.332596 & 0.258840 & 1.284945 & 0.301390 & 0.253815 & 1.187441 \\
Hyperactivity_agitation & 0.323148 & 0.283876 & 1.138344 & 0.269449 & 0.290578 & 0.927286 \\
drastical_shift_in_mood_and_energy & 0.310640

In [53]:
# Load the symptom vectors in wide form (one column per symptom).
vectors_wide = pd.read_json(f'../data/vectorData/TestspacysmVectors.json',orient='records', lines=True)
dfm = vectors_wide.loc[(vectors_wide['gender'] == 'm')]
dff = vectors_wide.loc[(vectors_wide['gender'] == 'f')]

# Reshape to long form: one row per (record, symptom) pair with its score.
record_columns = ['video_id', 'label', 'key', 'gender', 'duration', 'channelId', 'text']
df = vectors_wide.melt(
    id_vars=record_columns,
    value_vars=symps,
    var_name='Symptom',
    value_name='Score',
)

# Alphabetical symptom order, computed once and used for both the trace order and the axis.
symptom_order = sorted(df['Symptom'].unique())

fig = px.box(
    df,
    x='Symptom',
    y='Score',
    color='gender',
    title='Symptom Distributions by Gender',
    category_orders={"Symptom": symptom_order},
)

fig.update_layout(
    xaxis=dict(categoryorder='array', categoryarray=symptom_order),
    xaxis_title='Symptom',
    yaxis_title='Score',
    boxmode='group',
    height=1000,
    width=1800,
)
fig.update_traces(quartilemethod="inclusive")

# Render the grouped box plot.
fig.show()