# Finding the variables which are significantly associated with each other

Importing the libraries

In [35]:
import warnings

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

Importing the Data

In [36]:
# Load the Excel workbook into `df`; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_excel(io="Data/Visualization_Data.xlsx")

Encoding the Data

In [37]:
# Numeric code for every category label, one entry per column to recode.
# Several source categories are collapsed on purpose: Academic_Stress 1-2 -> 1 and
# 4-5 -> 3, the two lowest and the two highest income brackets, and the two
# "neither / no GPA yet" satisfaction answers.
category_codes = {
    'Sex': {'Female': 0, 'Male': 1},
    'depression_status': {'No MDD': 0, 'MDD': 1},
    'Alcohol_Frequency': {'Never': 1, 'Rarely': 2, 'Occasionally': 3},
    'Uni_Year': {'Year 1': 1, 'Year 2': 2, 'Year 3': 3, 'Year 4': 4},
    'Academic_Stress': {1: 1, 2: 1, 3: 2, 4: 3, 5: 3},
    'Academic_Achievements_Satisfaction': {
        'No': 1,
        'Neither satisfied nor dissatisfied': 2,
        'No GPA as of yet': 2,
        'Yes': 3,
    },
    'Family_Income': {
        'Less than 30,000': 1,
        '30,000-100,000': 1,
        # NOTE(review): '100,00-250,000' is kept exactly as originally written in
        # case the workbook really uses that label; the presumably intended
        # spelling is accepted too so the bracket cannot be dropped silently.
        '100,00-250,000': 2,
        '100,000-250,000': 2,
        '250,000-500,000': 3,
        'Greater than 500,000': 3,
    },
    'Ragging_Experience': {'No': 0, 'Yes': 1},
    'Separated': {'No': 0, 'Yes': 1},
    'Love_Affair_Satisfied': {'No': 0, 'Yes': 1},
    'Love_Affair_Not_Satisfied': {'No': 0, 'Yes': 1},
}

for column, codes in category_codes.items():
    encoded = df[column].map(codes)

    # Series.map replaces every value that is missing from the dict with NaN and
    # raises nothing, so a mismatched label (or running this cell a second time on
    # already-encoded columns) would quietly remove observations from the tests.
    unmapped = df.loc[df[column].notna() & encoded.isna(), column].unique()
    if len(unmapped) > 0:
        warnings.warn(
            f"{column}: values {unmapped.tolist()} have no code and were set to NaN"
        )

    df[column] = encoded

Selecting the variables used in the association tests

In [38]:
# Keep only the variables that take part in the association tests, together with
# the weight column passed to the weighted test.
new_df = df.loc[:, [
    'Sex',
    'depression_status',
    'Alcohol_Frequency',
    'Uni_Year',
    'Academic_Stress',
    'Academic_Achievements_Satisfaction',
    'Family_Income',
    'Ragging_Experience',
    'Separated',
    'Love_Affair_Satisfied',
    'Love_Affair_Not_Satisfied',
    'Post_Stratification_Weight',
    'Appearance_Satisfaction',
]]

Defining a custom function that runs a weighted chi-squared test of association

In [39]:

def weighted_chi_squared(df: pd.DataFrame, col1: str, col2: str, weight_col: str):
    """
    Run a chi-squared test of independence on a weighted contingency table.

    The table holds, for every observed (col1, col2) combination, the sum of
    ``weight_col``; combinations that never occur are filled with 0. These weight
    sums are handed to ``scipy.stats.chi2_contingency`` as if they were observed
    counts, so the statistic scales with the total weight. SciPy's defaults are
    used, which means Yates' continuity correction is applied to tables with one
    degree of freedom.

    Rows with a missing value in ``col1`` or ``col2`` are excluded by the groupby.
    Categories whose total weight is zero are removed before testing, because an
    all-zero row or column makes an expected frequency zero and the test fail.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the data.
    col1 (str): The name of the first categorical column.
    col2 (str): The name of the second categorical column.
    weight_col (str): The name of the column containing the weights for each observation.

    Returns:
    float: The chi-squared statistic.
    float: The p-value of the test.

    Raises:
    ValueError: If no cell with non-zero weight remains, or if SciPy rejects the
        table (for example because it contains negative totals).

    Warns:
    UserWarning: If fewer than two categories remain for either variable. SciPy
        then returns a statistic of 0 and a p-value of 1, which only means the
        association could not be tested.
    """
    # Compute the weighted contingency table: rows are col1 levels, columns are
    # col2 levels, cells are summed weights.
    contingency_table = (
        df.groupby([col1, col2])
        .agg(total_weight=(weight_col, 'sum'))
        .unstack(fill_value=0)
    )
    contingency_matrix = contingency_table.to_numpy(dtype=float)

    # Drop categories that carry no weight at all; keeping them would produce a
    # zero expected frequency and make chi2_contingency raise.
    has_weight = contingency_matrix != 0
    contingency_matrix = contingency_matrix[has_weight.any(axis=1)][:, has_weight.any(axis=0)]

    if contingency_matrix.size == 0:
        raise ValueError(
            f"No weighted observations available to test '{col1}' against '{col2}'"
        )

    if min(contingency_matrix.shape) < 2:
        # A single-row or single-column table has zero degrees of freedom.
        warnings.warn(
            f"'{col1}' x '{col2}' has fewer than two categories with non-zero weight; "
            "the reported statistic 0 / p-value 1 does not indicate independence"
        )

    # Perform the chi-squared test; only the statistic and p-value are reported.
    chi2_stat, p_value, _dof, _expected = chi2_contingency(contingency_matrix)

    return chi2_stat, p_value


Getting the measures of association

In [40]:
# From here on `df` refers to the reduced frame holding only the selected variables.
df = new_df

# Pairs tested on the full data set.
variable_pairs = [
    ('Sex', 'depression_status'),
    ('Alcohol_Frequency', 'depression_status'),
    ('Uni_Year', 'depression_status'),
    ('Uni_Year', 'Academic_Stress'),
    ('Academic_Achievements_Satisfaction', 'depression_status'),
    ('Family_Income', 'depression_status'),
    ('Ragging_Experience', 'depression_status'),
    ('Separated', 'depression_status'),
    ('Love_Affair_Satisfied', 'depression_status'),
    ('Love_Affair_Not_Satisfied', 'depression_status'),
    ('Appearance_Satisfaction', 'depression_status'),
]

# Pairs tested on a subset only: (first variable, second variable,
# filter column, value the filter column must equal).
special_cases = [
    ('Alcohol_Frequency', 'depression_status', 'Sex', 0),
    ('Sex', 'depression_status', 'Separated', 0),
    ('Sex', 'depression_status', 'Separated', 1),
    ('Sex', 'depression_status', 'Love_Affair_Not_Satisfied', 0),
    ('Sex', 'depression_status', 'Love_Affair_Not_Satisfied', 1),
]


def _association_row(frame, first, second, label):
    """Run the weighted test for one pair on `frame` and return its result row."""
    statistic, probability = weighted_chi_squared(
        frame, first, second, 'Post_Stratification_Weight'
    )
    return {
        'Variables': label,
        'Chi-Squared Stat': statistic,
        'P-Value': probability,
    }


# Unconditional tests come first, in the order listed above.
results = [
    _association_row(df, first, second, f'{first} and {second}')
    for first, second in variable_pairs
]

# Conditional tests follow, each restricted to the rows matching its filter.
results.extend(
    _association_row(
        df[df[by] == level],
        first,
        second,
        f'{first} and {second} (Where {by} is {level})',
    )
    for first, second, by, level in special_cases
)

# One row per test.
results_df = pd.DataFrame(results)

# Show the table as the cell output.
results_df


Unnamed: 0,Variables,Chi-Squared Stat,P-Value
0,Sex and depression_status,2.02981,0.15424
1,Alcohol_Frequency and depression_status,4.059982,0.131337
2,Uni_Year and depression_status,8.988034,0.02945
3,Uni_Year and Academic_Stress,12.272562,0.056157
4,Academic_Achievements_Satisfaction and depress...,22.430882,1.3e-05
5,Family_Income and depression_status,0.194243,0.907446
6,Ragging_Experience and depression_status,7.69586,0.005535
7,Separated and depression_status,5.672327,0.017235
8,Love_Affair_Satisfied and depression_status,2.517319,0.112602
9,Love_Affair_Not_Satisfied and depression_status,0.0,1.0
