In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

### Results analysis

In [7]:
# Load the evaluation results; the 'predictor' column identifies each run.
df = pd.read_csv('../data/results/depressionResults.csv')

# Table for SVM tokenizer Performance and Fairness Metrics
metric_columns = ['depressionTotalF1','normalTotalF1','MCCTotal','roc-aucTotal','statisticalParity','equalOpportunity','equalisedOdds','equalAccuracy']
predictors = ['reweightedspacysm','spacysm']

# Select rows and columns in one step and chain the reshaping, so nothing is
# mutated in place on a filtered slice. After the transpose the metrics are
# the rows and each predictor is a column.
SVMTKMetrics = (
    df.loc[df['predictor'].isin(predictors), ['predictor'] + metric_columns]
    .set_index('predictor')
    .T
)

latex = SVMTKMetrics.to_latex(
    caption='Tokenizer Performance And Fairness Metrics',
    index=True,
    header=True,
    bold_rows=True,
    # One left-aligned column for the metric name plus one centred column per
    # predictor, so the spec always matches the real number of columns.
    column_format='l' + 'c' * SVMTKMetrics.shape[1],
    escape=False
)
print(latex)
display(SVMTKMetrics)

\begin{table}
\caption{Tokenizer Performance And Fairness Metrics}
\begin{tabular}{lcccccc}
\toprule
predictor & spacysm & reweightedspacysm \\
\midrule
\textbf{depressionTotalF1} & 0.886076 & 0.851613 \\
\textbf{normalTotalF1} & 0.890244 & 0.862275 \\
\textbf{MCCTotal} & 0.776594 & 0.715547 \\
\textbf{roc-aucTotal} & 0.947994 & 0.926852 \\
\textbf{statisticalParity} & 0.760321 & 0.729628 \\
\textbf{equalOpportunity} & 0.940690 & 0.919310 \\
\textbf{equalisedOdds} & 0.982556 & 1.005932 \\
\textbf{equalAccuracy} & 0.974429 & 0.985933 \\
\bottomrule
\end{tabular}
\end{table}



predictor,spacysm,reweightedspacysm
depressionTotalF1,0.886076,0.851613
normalTotalF1,0.890244,0.862275
MCCTotal,0.776594,0.715547
roc-aucTotal,0.947994,0.926852
statisticalParity,0.760321,0.729628
equalOpportunity,0.94069,0.91931
equalisedOdds,0.982556,1.005932
equalAccuracy,0.974429,0.985933


### Symptom result creation

In [4]:
# options = ['blingfire','reweightedblingfire','nltk','reweightednltk','spacysm','reweightedspacysm','spacylg','reweightedspacylg','spacytrf','reweightedspacytrf']
options = ['spacysm']
symps = ["Anxious_Mood","Autonomic_symptoms","Cardiovascular_symptoms","Catatonic_behavior","Decreased_energy_tiredness_fatigue","Depressed_Mood","Gastrointestinal_symptoms","Genitourinary_symptoms","Hyperactivity_agitation","Impulsivity","Inattention","Indecisiveness","Respiratory_symptoms","Suicidal_ideas","Worthlessness_and_guilty","avoidance_of_stimuli","compensatory_behaviors_to_prevent_weight_gain","compulsions","diminished_emotional_expression","do_things_easily_get_painful_consequences","drastical_shift_in_mood_and_energy","fear_about_social_situations","fear_of_gaining_weight","fears_of_being_negatively_evaluated","flight_of_ideas","intrusion_symptoms","loss_of_interest_or_motivation","more_talktive","obsession","panic_fear","pessimism","poor_memory","sleep_disturbance","somatic_muscle","somatic_symptoms_others","somatic_symptoms_sensory","weight_and_appetite_change","Anger_Irritability"]
label = 'depression'


def summarise_symptoms(vectors, suffix):
    """Return per-column descriptive statistics of ``vectors`` as a DataFrame.

    Parameters
    ----------
    vectors : pandas.DataFrame
        Frame whose columns are summarised independently.
    suffix : str
        Tag appended to every statistic name (e.g. ``'M'`` -> ``'Mean_M'``).

    Returns
    -------
    pandas.DataFrame
        One row per column of ``vectors`` and one column per statistic, in
        the order: Mean, Std, Median, Maximum, Minimum, 25th Percentile,
        75th Percentile, Variance.
    """
    return pd.DataFrame({
        f'Mean_{suffix}': vectors.mean(),
        f'Std_{suffix}': vectors.std(),
        f'Median_{suffix}': vectors.median(),
        f'Maximum_{suffix}': vectors.max(),
        f'Minimum_{suffix}': vectors.min(),
        f'25th Percentile_{suffix}': vectors.quantile(0.25),
        f'75th Percentile_{suffix}': vectors.quantile(0.75),
        f'Variance_{suffix}': vectors.var(),
    })


for name in options:
    df = pd.read_json(f'../data/vectorData/{name}Vectors.json',orient='records', lines=True)
    print(len(df))

    # Restrict to the target label, then split the rows by gender.
    has_label = df['label'] == label
    dfm = df.loc[has_label & (df['gender'] == 'm')]
    dff = df.loc[has_label & (df['gender'] == 'f')]
    vectorsm = dfm[symps]
    vectorsf = dff[symps]

    metricsm = summarise_symptoms(vectorsm, 'M')
    metricsf = summarise_symptoms(vectorsf, 'F')

# NOTE: the files are written once, after the loop, and their names do not
# include the option name, so only the statistics of the LAST entry in
# `options` are saved. The next cell reads exactly these file names.
metricsf.to_csv(f'{label}femaleSymptomMetrics.csv')
metricsm.to_csv(f'{label}maleSymptomMetrics.csv')


797


In [6]:
# Read the per-gender statistics back in; the first CSV column becomes the index.
female_data = pd.read_csv(f'{label}femaleSymptomMetrics.csv', index_col=0)
male_data = pd.read_csv(f'{label}maleSymptomMetrics.csv', index_col=0)

# Align both tables on their index, keeping only rows present in both.
merged_data = pd.merge(
    female_data,
    male_data,
    how='inner',
    left_index=True,
    right_index=True,
)

# Despite the "_Diff" suffix, these columns hold female-to-male RATIOS
# (a value above 1 means the female figure is larger), not subtractions.
merged_data['Mean_Diff'] = merged_data['Mean_F'].div(merged_data['Mean_M'])
merged_data['Std_Diff'] = merged_data['Std_F'].div(merged_data['Std_M'])

# Column order of the exported comparison table.
comparison_columns = [
    'Mean_F', 'Mean_M', 'Mean_Diff',
    'Std_F', 'Std_M', 'Std_Diff',
    'Median_F', 'Median_M',
    'Maximum_F', 'Maximum_M',
    'Minimum_F', 'Minimum_M',
    '25th Percentile_F', '25th Percentile_M',
    '75th Percentile_F', '75th Percentile_M',
    'Variance_F', 'Variance_M',
]

# Summary frame used by the ranking cell, also persisted to disk.
summary_comparison = merged_data.loc[:, comparison_columns]

summary_comparison.to_csv(f'{label}comparison.csv')

### Top 10 symptom metrics

In [8]:
# Creating ranked comparison: order the symptoms once by female mean and once
# by male mean, highest first.
sorted_valuesF = summary_comparison.sort_values(by='Mean_F', ascending=False)
sorted_valuesM = summary_comparison.sort_values(by='Mean_M', ascending=False)

# Ranks run from 1 to the number of rows. The range is derived from the table
# instead of a hard-coded 38-symptom range(1, 39), which would raise a
# length-mismatch error as soon as the symptom list changes.
ranks = range(1, len(summary_comparison) + 1)
sorted_valuesF['ranking_F'] = ranks
sorted_valuesM['ranking_M'] = ranks

# Attach the female rank to the male-ranked table. how='right' uses the keys
# of the right-hand (female-ranked) frame, so the result is ordered by Mean_F.
merged = sorted_valuesM.merge(sorted_valuesF[['ranking_F']], left_index=True, right_index=True, how='right')
merged.to_csv('../data/results/depressedSymptomComparison.csv')

print(merged[['Mean_F','Mean_M','Mean_Diff','Std_F', 'Std_M','Std_Diff']].head(10).to_latex(caption='Top 10 symptom metrics by mean'))

\begin{table}
\caption{Top 10 symptom metrics by mean}
\begin{tabular}{lrrrrrr}
\toprule
 & Mean_F & Mean_M & Mean_Diff & Std_F & Std_M & Std_Diff \\
\midrule
Depressed_Mood & 0.867480 & 0.880432 & 0.985289 & 0.275714 & 0.254977 & 1.081330 \\
Anxious_Mood & 0.661535 & 0.562628 & 1.175795 & 0.356803 & 0.406744 & 0.877217 \\
diminished_emotional_expression & 0.539533 & 0.530451 & 1.017122 & 0.278682 & 0.267200 & 1.042969 \\
Worthlessness_and_guilty & 0.467131 & 0.420450 & 1.111025 & 0.281541 & 0.283774 & 0.992133 \\
pessimism & 0.346634 & 0.320280 & 1.082284 & 0.288199 & 0.275450 & 1.046282 \\
sleep_disturbance & 0.346096 & 0.293989 & 1.177238 & 0.366967 & 0.353470 & 1.038185 \\
weight_and_appetite_change & 0.340676 & 0.259231 & 1.314177 & 0.392320 & 0.367322 & 1.068056 \\
Anger_Irritability & 0.332596 & 0.258840 & 1.284945 & 0.301390 & 0.253815 & 1.187441 \\
Hyperactivity_agitation & 0.323148 & 0.283876 & 1.138344 & 0.269449 & 0.290578 & 0.927286 \\
drastical_shift_in_mood_and_energy & 

### Top 10 depression symptom distribution graph

In [16]:
# Load the spacysm vectors and keep only the rows labelled 'depression'.
df = pd.read_json('../data/vectorData/spacysmVectors.json', orient='records', lines=True)
df = df[df['label'] == 'depression']

# Symptoms shown in the plot.
symps = ["Anxious_Mood","Depressed_Mood","Hyperactivity_agitation","Worthlessness_and_guilty","diminished_emotional_expression","drastical_shift_in_mood_and_energy","panic_fear","pessimism","sleep_disturbance","Anger_Irritability"]

# Reshape wide -> long: one row per (record, symptom) with its score, so the
# symptom can be used as the x category.
id_columns = ['video_id', 'label', 'key', 'gender', 'duration', 'channelId', 'text']
df = df.melt(
    id_vars=id_columns,
    value_vars=symps,
    var_name='Symptom',
    value_name='Score',
)

# Alphabetical category order, shared by the trace ordering and the x axis.
symptom_order = sorted(df['Symptom'].unique())

fig = px.box(
    df,
    x='Symptom',
    y='Score',
    color='gender',
    title='Symptom Distributions by Gender',
    category_orders={"Symptom": symptom_order},
    color_discrete_map={'m': 'blue', 'f': 'red'},
)

fig.update_layout(
    xaxis={'categoryorder':'array', 'categoryarray': symptom_order},
    xaxis_title='Symptom',
    yaxis_title='Score',
    boxmode='group',  # boxes of the two genders side by side per symptom
    height=1000,
    width=1800,
)
fig.update_traces(quartilemethod="inclusive")

# Display the figure
fig.show()

### Adjusted symptom differences

In [36]:
# Load the ranked comparison table written by the ranking cell. The symptom
# names come back as the unnamed first column, 'Unnamed: 0'.
df = pd.read_csv('../data/results/depressedSymptomComparison.csv')

# Mean_Diff is stored as the female/male ratio of means; convert it to a
# percentage difference relative to the male mean (1.18 -> +18, 0.98 -> -2).
df['Mean_Diff'] = (df['Mean_Diff'] * 100 ) - 100
# Average of the two gender means, used only to colour the bars.
df['Mean_Average'] = df[['Mean_F','Mean_M']].mean(axis=1)
# Keep the first ten rows of the file.
df = df.head(10)

# The stray `fig.show()` that used to sit here re-rendered the box plot from
# the previous cell and raised NameError on a fresh kernel; only this cell's
# own figure is shown now.
fig1 = px.bar(df, x='Unnamed: 0',
              y='Mean_Diff',
              color='Mean_Average',
              color_continuous_scale='orrd',
              title='Difference in cross gender symptom prevalence'
              )

fig1.update_layout(
    xaxis_title="Symptoms",
    yaxis_title="Mean Differences",
    xaxis={'categoryorder':'total descending'},  # Sorts the bars if needed
    # Clamp the y axis to the observed range of percentage differences.
    yaxis=dict(range=[min(df['Mean_Diff']), max(df['Mean_Diff'])])
)

fig1.show()

# df['Adjusted_Mean_Diff'] = (df['Mean_Diff'] / df['Mean_Diff'].mean() * 100) - 100
# df['Adjusted_Mean_Diffabs'] = abs((df['Mean_Diff'] / df['Mean_Diff'].mean() * 100) - 100)
# display(df[['Unnamed: 0','Mean_F','Mean_M','Mean_Diff','Adjusted_Mean_Diff']])
# df_sorted = df.sort_values(by='Adjusted_Mean_Diffabs',ascending=False).head(10)
# display(df_sorted[['Unnamed: 0','Mean_F','Mean_M','Mean_Diff','Adjusted_Mean_Diff']])

# fig = px.bar(df, x='Unnamed: 0', 
#              y='Adjusted_Mean_Diff', 
#              color='Mean_Average', 
#              color_continuous_scale='orrd',
#              title='Adjusted difference in cross gender symptom prevalence'
#              )

# fig.update_layout(
#     xaxis_title="Symptoms",
#     yaxis_title="Adjusted Mean Differences",
#     xaxis={'categoryorder':'total descending'},  # Sorts the bars if needed
#     yaxis=dict(range=[min(df['Adjusted_Mean_Diff']), max(df['Adjusted_Mean_Diff'])])
# )



Unnamed: 0.1,Unnamed: 0,Mean_F,Mean_M,Mean_Diff,Adjusted_Mean_Diff
0,Depressed_Mood,0.86748,0.880432,-1.47107,-16.91324
1,Anxious_Mood,0.661535,0.562628,17.579546,-0.848375
2,diminished_emotional_expression,0.539533,0.530451,1.712223,-14.228856
3,Worthlessness_and_guilty,0.467131,0.42045,11.102525,-6.31027
4,pessimism,0.346634,0.32028,8.228385,-8.733954
5,sleep_disturbance,0.346096,0.293989,17.723777,-0.726749
6,weight_and_appetite_change,0.340676,0.259231,31.417686,10.820951
7,Anger_Irritability,0.332596,0.25884,28.494528,8.355932
8,Hyperactivity_agitation,0.323148,0.283876,13.834434,-4.006526
9,drastical_shift_in_mood_and_energy,0.31064,0.233263,33.171247,12.299681


Unnamed: 0.1,Unnamed: 0,Mean_F,Mean_M,Mean_Diff,Adjusted_Mean_Diff
0,Depressed_Mood,0.86748,0.880432,-1.47107,-16.91324
2,diminished_emotional_expression,0.539533,0.530451,1.712223,-14.228856
9,drastical_shift_in_mood_and_energy,0.31064,0.233263,33.171247,12.299681
6,weight_and_appetite_change,0.340676,0.259231,31.417686,10.820951
4,pessimism,0.346634,0.32028,8.228385,-8.733954
7,Anger_Irritability,0.332596,0.25884,28.494528,8.355932
3,Worthlessness_and_guilty,0.467131,0.42045,11.102525,-6.31027
8,Hyperactivity_agitation,0.323148,0.283876,13.834434,-4.006526
1,Anxious_Mood,0.661535,0.562628,17.579546,-0.848375
5,sleep_disturbance,0.346096,0.293989,17.723777,-0.726749


### D-vlog dataset distribution

In [11]:
# Only the gender and label columns are needed for the distribution table.
df = pd.read_json('../data/splitData/postdataLinesSplit.json', orient='records', lines=True)
df = df.loc[:, ['gender', 'label']]

# Count the rows for every (label, gender) combination; missing pairs become 0.
overview = df.pivot_table(index='label', columns='gender', aggfunc=len, fill_value=0)

# Add the margins: first a 'Total' column (sum per label), then a 'Total' row
# (sum per gender, which also yields the grand total under the new column).
overview['Total'] = overview.sum(axis='columns')
overview.loc['Total'] = overview.sum(axis='index')

print(overview.to_latex(caption='D-vlog dataset distribution'))


\begin{table}
\caption{D-vlog dataset distribution}
\begin{tabular}{lrrr}
\toprule
gender & f & m & Total \\
label &  &  &  \\
\midrule
depression & 295 & 139 & 434 \\
normal & 237 & 126 & 363 \\
Total & 532 & 265 & 797 \\
\bottomrule
\end{tabular}
\end{table}



### Symptom appendix

In [9]:
# NOTE(review): no cell visible here writes depressedSymptomPrevalence.csv;
# confirm where it is produced (presumably a copy of the comparison table).
df = pd.read_csv('../data/results/depressedSymptomPrevalence.csv')

# Emit the appendix table: symptom name plus the per-gender mean and standard
# deviation, without the numeric row index.
appendix_columns = ['Unnamed: 0', 'Mean_F', 'Mean_M', 'Std_F', 'Std_M']
appendix_table = df.loc[:, appendix_columns]
print(appendix_table.to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
Unnamed: 0 & Mean_F & Mean_M & Std_F & Std_M \\
\midrule
Depressed_Mood & 0.867480 & 0.880432 & 0.275714 & 0.254977 \\
Anxious_Mood & 0.661535 & 0.562628 & 0.356803 & 0.406744 \\
diminished_emotional_expression & 0.539533 & 0.530451 & 0.278682 & 0.267200 \\
Worthlessness_and_guilty & 0.467131 & 0.420450 & 0.281541 & 0.283774 \\
pessimism & 0.346634 & 0.320280 & 0.288199 & 0.275450 \\
sleep_disturbance & 0.346096 & 0.293989 & 0.366967 & 0.353470 \\
weight_and_appetite_change & 0.340676 & 0.259231 & 0.392320 & 0.367322 \\
Anger_Irritability & 0.332596 & 0.258840 & 0.301390 & 0.253815 \\
Hyperactivity_agitation & 0.323148 & 0.283876 & 0.269449 & 0.290578 \\
drastical_shift_in_mood_and_energy & 0.310640 & 0.233263 & 0.304146 & 0.237590 \\
flight_of_ideas & 0.275055 & 0.252806 & 0.266356 & 0.261093 \\
Decreased_energy_tiredness_fatigue & 0.269593 & 0.236931 & 0.311732 & 0.300963 \\
Catatonic_behavior & 0.254318 & 0.201564 & 0.280538 & 0.256587 \\
fear_about_s