In [2]:
import pandas as pd

### Results analysis

In [23]:
df = pd.read_csv('../data/results/testresults.csv')

# Table of SVM tokenizer performance and fairness metrics.
metric_columns = ['predictor','depressionTotalF1','normalTotalF1','accuracyTotal','statisticalParity','equalOpportunity','equalisedOdds','equalAccuracy']
tokenizers = ['blingfire','nltk','spacysm','spacylg','spacytrf']

all_metrics = df[metric_columns]

# Written before filtering, so this CSV holds every predictor present in the results file.
all_metrics.to_csv('SVMTKMetricsSVM.csv')

# Keep only the tokenizer rows, then transpose: one column per predictor, one row per metric.
SVMTKMetrics = (
    all_metrics[all_metrics['predictor'].isin(tokenizers)]
    .set_index('predictor')
    .T
)

latex = SVMTKMetrics.to_latex(
    caption='Tokenizer Performance And Fairness Metrics',
    index=True,
    header=True,
    bold_rows=True,
    # One left-aligned column for the metric name plus one centred column per
    # predictor actually present, so the spec always matches the table width.
    column_format='l' + 'c' * SVMTKMetrics.shape[1],
    escape=False
)
# print() rather than rich display: the raw LaTeX source is what gets copied out.
print(latex)

\begin{table}
\caption{Tokenizer Performance And Fairness Metrics}
\begin{tabular}{lcccccc}
\toprule
predictor & blingfire & nltk & spacysm & spacylg & spacytrf \\
\midrule
\textbf{depressionTotalF1} & 0.663900 & 0.663900 & 0.886076 & 0.860759 & 0.784314 \\
\textbf{normalTotalF1} & 0.000000 & 0.000000 & 0.890244 & 0.865854 & 0.804734 \\
\textbf{accuracyTotal} & 0.496894 & 0.496894 & 0.888199 & 0.863354 & 0.795031 \\
\textbf{statisticalParity} & NaN & NaN & 0.760321 & 0.760321 & 0.834862 \\
\textbf{equalOpportunity} & NaN & NaN & 0.940690 & 0.897931 & 0.833793 \\
\textbf{equalisedOdds} & 1.000000 & 1.000000 & 0.982556 & 0.941332 & 0.832223 \\
\textbf{equalAccuracy} & 1.340323 & 1.340323 & 0.974429 & 0.933828 & 0.822175 \\
\bottomrule
\end{tabular}
\end{table}



### Symptom result creation

In [7]:
# Full set of vector files this cell can be pointed at:
# options = ['blingfire','reweightedblingfire','nltk','reweightednltk','spacysm','reweightedspacysm','spacylg','reweightedspacylg','spacytrf','reweightedspacytrf']
options = ['Testspacysm']
symps = [
    "Anxious_Mood",
    "Autonomic_symptoms",
    "Cardiovascular_symptoms",
    "Catatonic_behavior",
    "Decreased_energy_tiredness_fatigue",
    "Depressed_Mood",
    "Gastrointestinal_symptoms",
    "Genitourinary_symptoms",
    "Hyperactivity_agitation",
    "Impulsivity",
    "Inattention",
    "Indecisiveness",
    "Respiratory_symptoms",
    "Suicidal_ideas",
    "Worthlessness_and_guilty",
    "avoidance_of_stimuli",
    "compensatory_behaviors_to_prevent_weight_gain",
    "compulsions",
    "diminished_emotional_expression",
    "do_things_easily_get_painful_consequences",
    "drastical_shift_in_mood_and_energy",
    "fear_about_social_situations",
    "fear_of_gaining_weight",
    "fears_of_being_negatively_evaluated",
    "flight_of_ideas",
    "intrusion_symptoms",
    "loss_of_interest_or_motivation",
    "more_talktive",
    "obsession",
    "panic_fear",
    "pessimism",
    "poor_memory",
    "sleep_disturbance",
    "somatic_muscle",
    "somatic_symptoms_others",
    "somatic_symptoms_sensory",
    "weight_and_appetite_change",
    "Anger_Irritability",
]
label = 'depression'


def _symptom_stats(vectors, suffix):
    """Summarise each column of `vectors` into one row of descriptive statistics.

    Parameters
    ----------
    vectors : pd.DataFrame
        One column per symptom.
    suffix : str
        Tag appended to every statistic's column name (e.g. 'M' -> 'Mean_M').

    Returns
    -------
    pd.DataFrame
        Indexed by the columns of `vectors`; columns are Mean, Std, Median,
        Maximum, Minimum, 25th/75th Percentile and Variance, each suffixed
        with `_<suffix>`.
    """
    return pd.DataFrame({
        f'Mean_{suffix}': vectors.mean(),
        f'Std_{suffix}': vectors.std(),
        f'Median_{suffix}': vectors.median(),
        f'Maximum_{suffix}': vectors.max(),
        f'Minimum_{suffix}': vectors.min(),
        f'25th Percentile_{suffix}': vectors.quantile(0.25),
        f'75th Percentile_{suffix}': vectors.quantile(0.75),
        f'Variance_{suffix}': vectors.var(),
    })


for name in options:
    df = pd.read_json(f'../data/vectorData/{name}Vectors.json',orient='records', lines=True)

    # Restrict to rows with the target label, then split by gender.
    has_label = df['label'] == label
    dfm = df.loc[has_label & (df['gender'] == 'm')]
    dff = df.loc[has_label & (df['gender'] == 'f')]
    vectorsm = dfm[symps]
    vectorsf = dff[symps]

    metricsm = _symptom_stats(vectorsm, 'M')
    metricsf = _symptom_stats(vectorsf, 'F')

# NOTE: metricsm / metricsf are rebound on every iteration and the output file
# names do not include `name`, so with several entries in `options` only the
# last one is written here.
metricsf.to_csv(f'{label}femaleSymptomMetrics.csv')
metricsm.to_csv(f'{label}maleSymptomMetrics.csv')


In [8]:
import pandas as pd


# Read the per-gender statistics back in; the first CSV column becomes the index.
female_data = pd.read_csv(f'{label}femaleSymptomMetrics.csv', index_col=0)
male_data = pd.read_csv(f'{label}maleSymptomMetrics.csv', index_col=0)

# Align the two tables on their index, keeping only rows present in both.
merged_data = pd.merge(female_data, male_data, how='inner', left_index=True, right_index=True)

# Female-vs-male gaps: absolute difference and ratio for the mean,
# absolute difference for the standard deviation and the median.
merged_data = merged_data.assign(**{
    'Mean_Diff-': merged_data['Mean_F'] - merged_data['Mean_M'],
    'Mean_Diff/': merged_data['Mean_F'] / merged_data['Mean_M'],
    'Std_Diff': merged_data['Std_F'] - merged_data['Std_M'],
    'Median_Diff': merged_data['Median_F'] - merged_data['Median_M'],
})

# Columns carried into the comparison table. The 25th/75th percentile and
# variance columns stay in merged_data but are left out of the summary.
comparison_columns = [
    'Mean_F', 'Mean_M', 'Mean_Diff-', 'Mean_Diff/',
    'Std_F', 'Std_M', 'Std_Diff',
    'Median_F', 'Median_M', 'Median_Diff',
    'Maximum_F', 'Maximum_M',
    'Minimum_F', 'Minimum_M',
]

summary_comparison = merged_data[comparison_columns]

# Persist the full comparison table (all rows) next to the notebook.
summary_comparison.to_csv(f'{label}comparison.csv')

Unnamed: 0,Mean_F,Mean_M,Mean_Diff-,Mean_Diff/,Std_F,Std_M,Std_Diff,Median_F,Median_M,Median_Diff,Maximum_F,Maximum_M,Minimum_F,Minimum_M,ranking_F
Depressed_Mood,0.86748,0.880432,-0.012952,0.985289,0.275714,0.254977,0.020737,0.986859,0.985987,0.000871,0.995538,0.995156,0.000487,0.004446,1
Anxious_Mood,0.661535,0.562628,0.098907,1.175795,0.356803,0.406744,-0.049941,0.85287,0.744013,0.108857,0.999141,0.998937,0.001242,0.004724,2
diminished_emotional_expression,0.539533,0.530451,0.009082,1.017122,0.278682,0.2672,0.011481,0.628525,0.618126,0.010398,0.956294,0.961681,0.000113,0.001212,3
Worthlessness_and_guilty,0.467131,0.42045,0.046681,1.111025,0.281541,0.283774,-0.002233,0.446305,0.373636,0.072669,0.934176,0.947219,0.000193,0.001502,4
pessimism,0.346634,0.32028,0.026354,1.082284,0.288199,0.27545,0.012748,0.261497,0.198553,0.062944,0.956995,0.972086,0.000157,0.001349,5
sleep_disturbance,0.346096,0.293989,0.052106,1.177238,0.366967,0.35347,0.013497,0.15453,0.103751,0.050779,0.993432,0.985365,0.000724,0.001378,6
weight_and_appetite_change,0.340676,0.259231,0.081444,1.314177,0.39232,0.367322,0.024998,0.052269,0.018034,0.034236,0.985517,0.967721,0.000447,0.002441,7
Anger_Irritability,0.332596,0.25884,0.073755,1.284945,0.30139,0.253815,0.047575,0.219877,0.162907,0.05697,0.976096,0.962199,0.000492,0.006947,8
Hyperactivity_agitation,0.323148,0.283876,0.039273,1.138344,0.269449,0.290578,-0.021129,0.238507,0.17044,0.068067,0.983143,0.935516,0.000533,0.003247,9
drastical_shift_in_mood_and_energy,0.31064,0.233263,0.077376,1.331712,0.304146,0.23759,0.066556,0.197153,0.184424,0.012729,0.995984,0.984812,0.000211,0.00139,10


In [9]:
# Order the symptoms by mean score, once per gender (highest mean first).
sorted_valuesF = summary_comparison.sort_values(by='Mean_F', ascending=False)
sorted_valuesM = summary_comparison.sort_values(by='Mean_M', ascending=False)

# Rank 1 = highest mean. The range is sized from the frame itself, so the cell
# keeps working when the number of symptoms changes.
sorted_valuesF['ranking_F'] = range(1, len(sorted_valuesF) + 1)
sorted_valuesM['ranking_M'] = range(1, len(sorted_valuesM) + 1)

# Attach the female ranking to the male-ranked table, matching rows on the index.
# how='right' takes its keys from the female-ranked frame.
merged = sorted_valuesM.merge(sorted_valuesF[['ranking_F']], left_index=True, right_index=True, how='right')
merged.to_csv('../data/results/symptomResults.csv')