In [None]:
import pandas as pd
import statsmodels.api as sm

# Read the survey workbook into a DataFrame
DATA_PATH = 'data.xlsx'
df = pd.read_excel(DATA_PATH)

# Report the (rows, columns) dimensions of the loaded table
print("Dataset shape: {}".format(df.shape))

# Preview the top rows; the notebook renders this last expression
df.head()

In [None]:
# Clean column names first (trim whitespace, remove embedded Windows line breaks)
# so that every lookup by name below works against the cleaned headers.
df.columns = [col.strip().replace('\r\n', '') for col in df.columns]

# Remove timestamp column. errors='ignore' keeps the cell re-runnable: on a
# second execution the column is already gone and nothing is dropped.
df = df.drop(columns='Timestamp', errors='ignore')

# Check age groups
print("Unique age groups:", df['Age Group'].unique())

# Clean age groups by removing the word 'years' if present.
# regex=False makes this a literal replacement on every pandas version.
df['Age Group'] = df['Age Group'].str.replace(' years', '', regex=False)

# Representative age used for respondents who gave an age group but no age.
# NOTE(review): 17 is not the midpoint of 10-19 (that would be 14.5), unlike the
# other entries which sit near the middle of their range - confirm it is intended.
age_group_mapping = {
    '10-19': 17,
    '20-29': 25,
    '30-39': 35,
    '40-49': 45,
    '50+': 55
}

# Fill only the missing ages. Age groups that are absent from the mapping map
# to NaN, so those rows stay missing exactly as before.
representative_age = df['Age Group'].map(age_group_mapping)
df['Age'] = df['Age'].fillna(representative_age)

# Count any remaining missing ages
missing_ages = df['Age'].isna().sum()
print(f"Remaining missing ages: {missing_ages}")

# View the updated dataframe
print(f"Updated dataset shape: {df.shape}")
df.head()

In [None]:
# Per-column count of missing cells; computed once and reused below
null_counts = df.isnull().sum()

# Keep only the columns that actually contain missing cells and report them
columns_with_nulls = null_counts[null_counts > 0]
if columns_with_nulls.empty:
    print("No null values found in the dataset.")
else:
    print("Columns with null values:")
    print(columns_with_nulls)

# Share of rows (in percent) that are missing in each column
null_percentage = (null_counts / len(df)) * 100

# Same report, expressed as percentages
columns_with_nulls_percentage = null_percentage[null_percentage > 0]
if not columns_with_nulls_percentage.empty:
    print("\nPercentage of null values in columns:")
    print(columns_with_nulls_percentage)

# Grand total of missing cells across the whole frame
total_nulls = null_counts.sum()
print(f"\nTotal null values in the dataset: {total_nulls}")

In [None]:
# Handle missing values

FAMILY_HISTORY_COL = 'Family history of diagnosed mental illness in family'

# A blank family-history answer becomes its own category.
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)`
# modifies a temporary object, which pandas warns about and which stops
# updating `df` once Copy-on-Write is enabled.
df[FAMILY_HISTORY_COL] = df[FAMILY_HISTORY_COL].fillna('None reported')

# Every other column that still has gaps is filled with its mode (most common
# answer). The list is computed here from the current frame so the cell does
# not depend on variables left behind by another cell.
# NOTE(review): this covers any column with nulls, not only Likert items
# (e.g. Age if some ages are still missing) - confirm that is intended.
likert_columns_with_nulls = [col for col in df.columns[df.isnull().any()]
                             if col != FAMILY_HISTORY_COL]

# Fill each column with its mode
for col in likert_columns_with_nulls:
    modes = df[col].mode()
    if modes.empty:
        # A column with no observed values has no mode; leave it untouched
        print(f"Skipping '{col}': no non-null values to take a mode from")
        continue
    df[col] = df[col].fillna(modes[0])

# Verify all nulls are handled
remaining_nulls = df.isnull().sum().sum()
print(f"Remaining null values: {remaining_nulls}")

# If you want to see which columns still have nulls (should be none)
if remaining_nulls > 0:
    print(df.isnull().sum()[df.isnull().sum() > 0])

In [None]:
# Show the current state of `df`; as the last expression it is rendered as the cell output
df

In [None]:
# Questionnaire item texts, grouped by subscale.
# A later cell resolves each text to a DataFrame column with a substring test
# (`item.strip() in col.strip()`), so the wording and inner spacing of these
# strings - including the doubled spaces - must match the column headers.

# ATSPPHS Openness Subscale (5 items)
atspphs_openness = [
    'If I believed I was having a mental breakdown, my first inclination would be to get professional help',
    'If I were experiencing a serious emotional crisis at this point in my life, I would be confident that I could find relief in psychotherapy',
    'I would want to get psychiatric attention if I was worried or upset for a long period of time',
    'At some future time, I might want to have psychological counselling',
    'A person with an emotional problem is not likely to solve it alone; he is likely to solve with professional help '
]

# ATSPPHS Value Subscale (5 items)
# These items are converted with reverse=True in the scoring cell.
atspphs_value = [
    'The idea of talking about problems with a psychologist strikes me as a poor way to get rid of emotional conflicts',
    'There is something admirable in the attitude of a person who is willing to deal with own  conflicts and fears without resorting to professional help',
    'Considering the time and expense involved in psychotherapy, it would have doubtful value for a person like me',
    'A person should work out one\'s  own problems; getting psychological counselling should be the  last resort',
    'Emotional difficulties, like many things, tend to work out by themselves'
]

# DSS Personal Subscale (9 items)
dss_personal = [
    'People with depression could snap out of it if they wanted.',
    'Depression is a sign of personal weakness',
    'Depression is not a real medical illness',
    'People with depression are dangerous',
    'It is best to avoid people with depression, so you do not become depressed yourself',
    'People with depression are unpredictable',
    'If I had depression, I would not tell anyone',
    'I would not study/mingle with someone if I knew they were  depressed',
    'I would not vote for a student for a leadership role if I knew they were  depressed'
]

# DSS Perceived Subscale (9 items)
# The last item is worded positively ("would vote"); the scoring cell finds it
# via the text 'vote for a student' and applies a reversed mapping to it.
dss_perceived = [
    'Most people believe that people with depression could snap out of it if they wanted',
    'Most people believe that depression is a sign of personal weakness',
    'Most people believe that depression is not a real medical illness',
    'Most people believe that people with depression are dangerous',
    'Most people believe it is best to avoid people with depression, so you do not become depressed yourself',
    'Most people believe that people with depression are unpredictable',
    'If they had depression, most people would not tell anyone',
    'Most people would not study/mingle with someone they knew were  depressed',
    'Most people would vote for a student for a leadership role who they knew was  depressed'
]

In [None]:
# Check exact column names
column_names = df.columns.tolist()


def find_matching_columns(items, columns):
    """Resolve questionnaire item texts to actual column names.

    For each item, the first column whose stripped name contains the stripped
    item text is kept. Items without any matching column are reported with a
    printed message and left out of the result.

    Parameters:
        items: iterable of item texts to look for.
        columns: iterable of column names to search.

    Returns:
        List of matched column names, in item order.
    """
    found = []
    for item in items:
        matches = [col for col in columns if item.strip() in col.strip()]
        if matches:
            found.append(matches[0])
        else:
            print(f"No match found for: {item}")
    return found


# Find the exact column names that match our needed items
exact_atspphs_openness = find_matching_columns(atspphs_openness, column_names)
exact_atspphs_value = find_matching_columns(atspphs_value, column_names)
exact_dss_personal = find_matching_columns(dss_personal, column_names)
exact_dss_perceived = find_matching_columns(dss_perceived, column_names)

# Print the exact column names found
print("ATSPPHS Openness items found:", len(exact_atspphs_openness))
print("ATSPPHS Value items found:", len(exact_atspphs_value))
print("DSS Personal items found:", len(exact_dss_personal))
print("DSS Perceived items found:", len(exact_dss_perceived))

# Now use these exact column names for conversion
df_numeric = df.copy()

# NOTE(review): convert_likert_to_numeric is not defined in any cell visible
# above this one; it must be defined (and run) earlier for a clean
# Restart & Run All. It is presumed to replace the listed columns with numeric
# codes and to flip the scale when reverse=True - confirm against its definition.

# The positively worded "vote for a student" item of the perceived subscale is
# scored in reverse, so it is kept out of the generic forward conversion.
reversed_item = [col for col in exact_dss_perceived if 'vote for a student' in col]
forward_dss_perceived = [col for col in exact_dss_perceived if col not in reversed_item[:1]]

# Convert DSS items (higher score = more stigma)
df_numeric = convert_likert_to_numeric(df_numeric, exact_dss_personal)
df_numeric = convert_likert_to_numeric(df_numeric, forward_dss_perceived)

if reversed_item:
    likert_mapping = {'Strongly Disagree': 4, 'Disagree': 3, 'Agree': 2, 'Strongly agree': 1, 'Strongly Agree': 1}
    # Map from the original text answers in `df`. The mapping keys are text
    # labels, so applying it to a column that was already converted to numbers
    # would turn every value into NaN and the item would silently drop out of
    # the DSS_Perceived sum.
    df_numeric[reversed_item[0]] = df[reversed_item[0]].map(likert_mapping)

# Convert ATSPPHS items
df_numeric = convert_likert_to_numeric(df_numeric, exact_atspphs_openness)
df_numeric = convert_likert_to_numeric(df_numeric, exact_atspphs_value, reverse=True)

# Calculate scale scores
# DSS Personal Scale
df_numeric['DSS_Personal'] = df_numeric[exact_dss_personal].sum(axis=1)

# DSS Perceived Scale
df_numeric['DSS_Perceived'] = df_numeric[exact_dss_perceived].sum(axis=1)

# DSS Total Score
df_numeric['DSS_Total'] = df_numeric['DSS_Personal'] + df_numeric['DSS_Perceived']

# ATSPPHS Openness Scale
df_numeric['ATSPPHS_Openness'] = df_numeric[exact_atspphs_openness].sum(axis=1)

# ATSPPHS Value Scale
df_numeric['ATSPPHS_Value'] = df_numeric[exact_atspphs_value].sum(axis=1)

# ATSPPHS Total Score
df_numeric['ATSPPHS_Total'] = df_numeric['ATSPPHS_Openness'] + df_numeric['ATSPPHS_Value']

In [None]:
# Print the full DataFrame columns to examine them
print("All column names in the DataFrame:")
for i, col in enumerate(df.columns):
    print(f"{i}: {col}")

# Now let's find the closest match for the missing item
missing_item = "A person should work out one's own problems; getting psychological counselling should be the last resort"
potential_matches = [col for col in df.columns if "work out" in col and "psychological" in col]
print("\nPotential matches for the missing item:")
for match in potential_matches:
    print(match)

# Add the matched column to the ATSPPHS Value list, but only if it is not
# already there: re-running this cell must not append a duplicate, and a
# column that was already converted must not be converted a second time.
newly_added = []
if potential_matches:
    candidate = potential_matches[0]
    if candidate in exact_atspphs_value:
        print(f"\nItem already present, nothing to add: {candidate}")
    else:
        exact_atspphs_value.append(candidate)
        newly_added.append(candidate)
        print(f"\nAdded missing item: {candidate}")
else:
    print("\nNo match found. Check your column names carefully.")

print("\nATSPPHS Value items found now:", len(exact_atspphs_value))

# Recalculate the ATSPPHS Value and Total scores only when this run actually
# added the fifth item. This also avoids indexing an empty `potential_matches`.
if newly_added and len(exact_atspphs_value) == 5:
    df_numeric = convert_likert_to_numeric(df_numeric, newly_added, reverse=True)
    df_numeric['ATSPPHS_Value'] = df_numeric[exact_atspphs_value].sum(axis=1)
    df_numeric['ATSPPHS_Total'] = df_numeric['ATSPPHS_Openness'] + df_numeric['ATSPPHS_Value']
    print("Recalculated ATSPPHS scores successfully.")

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Continuous variables that are summarised as mean ± standard deviation
continuous_vars = ['Age', 'DSS_Personal', 'DSS_Perceived', 'DSS_Total', 'ATSPPHS_Openness', 'ATSPPHS_Value', 'ATSPPHS_Total']


def mean_std(data):
    """Format the mean and sample standard deviation of `data` as 'mean ± sd'.

    Values are coerced to numbers first, so an object-dtype column holding
    numeric strings or stray text does not raise; entries that cannot be
    parsed count as missing and are ignored by both statistics.

    Parameters:
        data: pandas Series (or other one-dimensional sequence) of values.

    Returns:
        String with both statistics rounded to two decimals.
    """
    values = pd.to_numeric(pd.Series(data), errors='coerce')
    return f"{values.mean():.2f} ± {values.std():.2f}"

# Calculate descriptive statistics; a variable that is missing from the frame
# is reported as "Not calculated" instead of stopping the cell with a KeyError
desc_stats = pd.DataFrame({
    'Mean ± SD': [
        mean_std(df_numeric[var]) if var in df_numeric.columns else "Not calculated"
        for var in continuous_vars
    ]
}, index=continuous_vars)

print("Descriptive Statistics for Continuous Variables:")
print(desc_stats)

# Now let's calculate frequencies and percentages for categorical variables
categorical_vars = ['Gender', 'Age Group', 'Field of Study', 'Living arrangement', 'Marital status', 
                   'Family system', 'History of suicide in family', 'Depression', 'GAD', 'Panic', 
                   'Schiz', 'Bipolar', 'None']

# Calculate frequencies and percentages
print("\nFrequency and Percentage for Categorical Variables:")
for var in categorical_vars:
    # Some of the listed variables may not exist as columns; skip them with a note
    if var not in df.columns:
        print(f"\n{var}: column not found, skipped")
        continue
    freq = df[var].value_counts()
    perc = df[var].value_counts(normalize=True) * 100
    print(f"\n{var}:")
    result = pd.DataFrame({
        'Frequency': freq,
        'Percentage (%)': perc.round(1)
    })
    print(result)

In [None]:
# First, let's check the data types of our columns
print("Data types of continuous variables:")
for var in continuous_vars:
    if var in df_numeric.columns:
        print(f"{var}: {df_numeric[var].dtype}")
    else:
        print(f"{var}: Not found in dataframe")

# Let's focus on 'Age' first which should be numeric
print("\nFirst few values of Age:")
print(df_numeric['Age'].head())

# Look at the dtype and first values of the first item in each scale.
# One loop replaces four copies of the same three lines.
scale_items = [
    ("DSS Personal", exact_dss_personal),
    ("DSS Perceived", exact_dss_perceived),
    ("ATSPPHS Openness", exact_atspphs_openness),
    ("ATSPPHS Value", exact_atspphs_value),
]
for label, item_cols in scale_items:
    print(f"\nFirst item in {label}:")
    if item_cols:
        first_col = item_cols[0]
        print(f"{first_col}: {df_numeric[first_col].dtype}")
        print(df_numeric[first_col].head())

# First column of every scale that has at least one matched item. Building the
# list this way avoids an IndexError when a scale has no matched columns.
test_cols = [item_cols[0] for _, item_cols in scale_items if item_cols]

# First, make sure our Likert responses are properly converted to numbers
print("\nChecking if Likert conversion worked properly:")
for col in test_cols:
    unique_values = df_numeric[col].unique()
    print(f"{col}: {unique_values}")

# Define the mapping again
likert_mapping = {
    'Strongly Disagree': 1, 
    'Disagree': 2, 
    'Agree': 3, 
    'Strongly agree': 4,
    'Strongly Agree': 4
}

# Apply mapping to test columns. The keys are text labels, so a column that
# already holds numbers maps to NaN throughout - that outcome itself shows the
# column had been converted.
for col in test_cols:
    df_numeric[f"{col}_test"] = df_numeric[col].map(likert_mapping)
    print(f"\nConverted {col}:")
    print(f"Original: {df_numeric[col].head()}")
    print(f"Converted: {df_numeric[f'{col}_test'].head()}")

In [None]:
# Inspect the raw Age entries before any cleaning
print("Examining Age column:")
age_value_counts = df_numeric['Age'].value_counts()
print(age_value_counts.head(10))  # ten most frequent entries
print("\nNon-numeric values in Age column:")

# Collect every distinct entry that is neither of a numeric dtype nor a Python int/float
non_numeric = []
for val in df_numeric['Age'].unique():
    if pd.api.types.is_numeric_dtype(type(val)) or isinstance(val, (int, float)):
        continue
    non_numeric.append(val)

for val in non_numeric:
    print(val)

# Reduce a free-text age entry to the first whole number it contains
def extract_first_number(age_str):
    """Return the first run of digits in an age entry as an int.

    Missing values come back as NaN, Python int/float values are returned
    unchanged, and anything else is converted to text and searched for its
    first digit sequence. NaN is returned when no digits are present.
    """
    import re

    # Missing entries stay missing
    if pd.isna(age_str):
        return np.nan

    # Values that are already numbers pass through untouched
    if isinstance(age_str, (int, float)):
        return age_str

    # Everything else is treated as text; take the leftmost digit run
    digits = re.search(r'\d+', str(age_str))
    return int(digits.group()) if digits else np.nan

# Clean every Age entry, then report what the column looks like
age_extracted = df_numeric['Age'].map(extract_first_number)
df_numeric['Age'] = age_extracted
print("\nAge after extraction:", df_numeric['Age'].dtype)
print(df_numeric['Age'].value_counts().head(10))

# Force a numeric dtype; anything still unparseable becomes NaN
age_as_number = pd.to_numeric(df_numeric['Age'], errors='coerce')
df_numeric['Age'] = age_as_number
print("\nAge dtype after conversion:", df_numeric['Age'].dtype)

# Continuous variables that are summarised as mean ± standard deviation
continuous_vars = ['Age', 'DSS_Personal', 'DSS_Perceived', 'DSS_Total', 'ATSPPHS_Openness', 'ATSPPHS_Value', 'ATSPPHS_Total']


# This definition replaces the earlier `mean_std` in the notebook namespace
def mean_std(data):
    """Format the mean and sample standard deviation of `data` as 'mean ± sd'.

    Values are coerced to numbers first, so an object-dtype column holding
    numeric strings or stray text does not raise; entries that cannot be
    parsed count as missing and are ignored by both statistics.

    Parameters:
        data: pandas Series (or other one-dimensional sequence) of values.

    Returns:
        String with both statistics rounded to two decimals.
    """
    values = pd.to_numeric(pd.Series(data), errors='coerce')
    return f"{values.mean():.2f} ± {values.std():.2f}"

# Summarise each continuous variable; variables absent from the frame are flagged
desc_stats = {
    var: (mean_std(df_numeric[var]) if var in df_numeric.columns else "Not calculated")
    for var in continuous_vars
}

# One-column table, indexed by variable name, for display
desc_stats_df = pd.Series(desc_stats, name='Mean ± SD').to_frame()

print("\nDescriptive Statistics for Continuous Variables:")
print(desc_stats_df)

In [None]:
# Frequency tables for the categorical variables (columns that do not exist are skipped)
categorical_vars = ['Gender', 'Age Group', 'Field of Study', 'Living arrangement', 'Marital status', 
                   'Family system', 'History of suicide in family', 'Depression', 'GAD', 'Panic', 
                   'Schiz', 'Bipolar', 'None']

print("\nFrequency and Percentage for Categorical Variables:")
for var in categorical_vars:
    if var not in df.columns:
        continue

    # Absolute counts and the corresponding share in percent
    counts = df[var].value_counts()
    shares = df[var].value_counts(normalize=True) * 100
    result = pd.DataFrame({'Frequency': counts, 'Percentage (%)': shares.round(1)})

    print(f"\n{var}:")
    print(result)

In [None]:
# Perform Kolmogorov-Smirnov test for normality
from scipy import stats
import numpy as np

scale_vars = ['DSS_Personal', 'DSS_Perceived', 'DSS_Total', 'ATSPPHS_Openness', 'ATSPPHS_Value', 'ATSPPHS_Total']

# NOTE(review): the normal distribution is parameterised with the mean and SD
# estimated from the same sample. With estimated parameters the plain K-S
# p-value is conservative; the Lilliefors variant corrects for this. Confirm
# which test the analysis plan calls for.
print("\nKolmogorov-Smirnov Normality Test Results:")
for var in scale_vars:
    if var not in df_numeric.columns:
        print(f"{var}: Not found in dataframe")
        continue

    # Remove any NaN values
    data = df_numeric[var].dropna()
    mean = data.mean()
    std = data.std()

    # The test needs at least two observations and a positive spread.
    # Otherwise the SD is NaN or zero and the statistic would come out as NaN,
    # which the p > 0.05 check would report as "not normal".
    if len(data) < 2 or not std > 0:
        print(f"{var}: Not enough data for normality test")
        continue

    # Perform the test
    stat, p = stats.kstest(data, 'norm', args=(mean, std))

    print(f"{var}: Statistic={stat:.4f}, p-value={p:.4f}, Normal distribution: {p > 0.05}")

In [None]:
# Dichotomise every scale score at its median: 1 = strictly above the median, 0 = otherwise.
# This is an alternative to dichotomizing each individual item.
# For DSS scales, higher scores indicate more stigma;
# for ATSPPHS scales, higher scores indicate more positive attitude.
scales_to_split = ['DSS_Personal', 'DSS_Perceived', 'DSS_Total',
                   'ATSPPHS_Openness', 'ATSPPHS_Value', 'ATSPPHS_Total']
for scale in scales_to_split:
    above_median = df_numeric[scale] > df_numeric[scale].median()
    df_numeric[f'{scale}_High'] = above_median.astype(int)

# Display frequencies of high/low categories
print("\nFrequencies of Dichotomized Scale Scores:")
binary_vars = ['DSS_Personal_High', 'DSS_Perceived_High', 'DSS_Total_High', 
               'ATSPPHS_Openness_High', 'ATSPPHS_Value_High', 'ATSPPHS_Total_High']

for var in binary_vars:
    counts = df_numeric[var].value_counts()
    shares = df_numeric[var].value_counts(normalize=True) * 100
    result = pd.DataFrame({'Frequency': counts, 'Percentage (%)': shares.round(1)})
    print(f"\n{var}:")
    print(result)

In [None]:
# Chi-Square test for categorical variables and dichotomized scales


def report_association(row_data, row_var, col_data, col_var):
    """Cross-tabulate two variables, test their association and print the result.

    A Chi-Square test is always computed. When any expected frequency is below
    5 and the table is 2x2, its p-value is replaced by Fisher's exact test.
    For larger tables Fisher's exact test is not applied, so the Chi-Square
    p-value is reported under its own name together with a warning.

    Parameters:
        row_data: DataFrame holding `row_var`.
        row_var: column name used for the table rows.
        col_data: DataFrame holding `col_var`.
        col_var: column name used for the table columns.

    Returns:
        Dict with 'test' (name of the test reported) and 'p' (its p-value),
        or None when either column is missing.
    """
    if row_var not in row_data.columns or col_var not in col_data.columns:
        print(f"\nCannot perform {row_var} vs {col_var} test - columns not found")
        return None

    # Create a contingency table
    contingency_table = pd.crosstab(row_data[row_var], col_data[col_var])
    print(f"\nContingency Table ({row_var} vs {col_var}):")
    print(contingency_table)

    # Perform Chi-Square test
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
    test_name = "Chi-Square test"

    # Check if Fisher's exact test is needed (if any expected frequency < 5)
    if (expected < 5).any():
        if contingency_table.shape == (2, 2):
            print("\nUsing Fisher's exact test (expected frequency < 5)")
            odds_ratio, p = stats.fisher_exact(contingency_table)
            test_name = "Fisher's exact test"
        else:
            # Fisher's exact test is only run on 2x2 tables here; keep the
            # Chi-Square label so the reported p-value is not misattributed
            print("\nExpected frequency < 5, but the table is larger than 2x2; "
                  "p-value from Chi-Square may not be accurate")
    else:
        print("\nUsing Chi-Square test")

    print(f"\n{test_name} results:")
    print(f"p-value: {p:.4f}")
    print(f"Significant (p ≤ 0.05): {p <= 0.05}")
    return {'test': test_name, 'p': p}


# Example: Test association between Gender and DSS_Personal_High
gender_stigma_result = report_association(df, 'Gender', df_numeric, 'DSS_Personal_High')

# Example: Test association between Living arrangement and ATSPPHS_Total_High
living_attitude_result = report_association(df, 'Living arrangement', df_numeric, 'ATSPPHS_Total_High')

In [None]:
# Variables that will enter the regression: continuous predictors and categorical factors
X_vars = ['Age', 'DSS_Personal', 'DSS_Perceived']
cat_vars = ['Gender', 'Living arrangement', 'Depression']

# Report the dtype of each one, or note that the column does not exist
print("Data types of regression variables:")
for var in X_vars + cat_vars:
    dtype_text = df_numeric[var].dtype if var in df_numeric.columns else "Not in dataframe"
    print(f"{var}: {dtype_text}")

# List the entries of each continuous predictor that cannot be read as a number
for var in X_vars:
    if var not in df_numeric.columns:
        continue
    unparseable = pd.to_numeric(df_numeric[var], errors='coerce').isna()
    non_numeric = df_numeric[var][unparseable]
    if len(non_numeric) > 0:
        print(f"\nNon-numeric values in {var}:")
        print(non_numeric)

# Prepare data properly for regression: every column of the design matrix is float
def prepare_regression_data(df, dep_var, indep_vars, cat_vars):
    """Build the design matrix and response for an OLS regression.

    Parameters:
        df: source DataFrame; it is copied, not modified.
        dep_var: name of the dependent variable column.
        indep_vars: list of continuous predictor column names.
        cat_vars: list of categorical predictor column names. 'Gender' becomes
            a single 'Gender_Male' indicator, 'Depression' a single
            'Depression_yes' indicator; any other variable is expanded into
            dummy columns with its first category dropped. Names that are not
            columns of `df` are skipped.

    Returns:
        Tuple (X, y, df_reg): X is the float design matrix including the
        constant added by `sm.add_constant`, y is the dependent variable, and
        df_reg is the cleaned copy of `df` that both were taken from.

    Raises:
        KeyError: if `dep_var` or any of `indep_vars` is not a column of `df`.
    """
    model_vars = list(indep_vars) + [dep_var]

    # Fail early with a readable message instead of an opaque KeyError from dropna
    missing = [var for var in model_vars if var not in df.columns]
    if missing:
        raise KeyError(f"Columns not found for regression: {missing}")

    # Create a clean dataframe for regression
    df_reg = df.copy()

    # Ensure all numeric variables are properly converted
    for var in model_vars:
        df_reg[var] = pd.to_numeric(df_reg[var], errors='coerce')

    # Drop rows with NaN in dependent or independent variables
    df_reg = df_reg.dropna(subset=model_vars)

    # Select continuous variables for the model
    X = df_reg[list(indep_vars)].copy()

    # Add dummy variables for categorical predictors
    for cat_var in cat_vars:
        if cat_var not in df_reg.columns:
            continue
        if cat_var == 'Gender':
            # Gender (reference: every value other than 'Male')
            X[f'{cat_var}_Male'] = (df_reg[cat_var] == 'Male').astype(int)
        elif cat_var == 'Depression':
            # Depression (reference: every value other than 'yes')
            X[f'{cat_var}_yes'] = (df_reg[cat_var] == 'yes').astype(int)
        else:
            # Other categorical variables (reference: first category).
            # dtype=float matters: by default recent pandas returns boolean
            # dummies, which make statsmodels reject the design matrix.
            dummies = pd.get_dummies(df_reg[cat_var], prefix=cat_var, drop_first=True, dtype=float)
            X = pd.concat([X, dummies], axis=1)

    # Add constant; casting first guarantees a purely float design matrix
    X = sm.add_constant(X.astype(float))

    # Define dependent variable
    y = df_reg[dep_var]

    return X, y, df_reg

# Run the regression for ATSPPHS_Total
dep_var = 'ATSPPHS_Total'
indep_vars = ['Age', 'DSS_Personal', 'DSS_Perceived']
cat_vars = ['Gender', 'Living arrangement', 'Depression']

# Bound before the try block so the error handler can tell whether the design
# matrix was ever built. Without this, a failure during data preparation makes
# the handler itself raise NameError and hides the real error.
X = None

try:
    X, y, df_reg = prepare_regression_data(df_numeric, dep_var, indep_vars, cat_vars)
    
    # Print information about the data
    print(f"\nRegression data shapes: X: {X.shape}, y: {y.shape}")
    print(f"X data types:\n{X.dtypes}")
    
    # Fit the model
    model = sm.OLS(y, X).fit()
    
    # Display regression results
    print("\nMultiple Linear Regression: Predictors of ATSPPHS_Total")
    print(model.summary())
    
    # Extract coefficients and confidence intervals
    coefs = model.params
    conf_int = model.conf_int(alpha=0.05)
    std_err = model.bse
    p_values = model.pvalues
    
    # Create a dataframe for regression results
    results_df = pd.DataFrame({
        'Coefficient': coefs,
        'Std. Error': std_err,
        'p-value': p_values,
        '95% CI Lower': conf_int[0],
        '95% CI Upper': conf_int[1],
        'Significant': p_values <= 0.05
    })
    
    print("\nRegression Coefficients with 95% Confidence Intervals:")
    print(results_df)
except Exception as e:
    # Deliberately broad: this is a diagnostic cell that reports the failure
    # and then inspects the design matrix instead of stopping the notebook
    print(f"Error in regression: {e}")
    print("\nChecking for problematic columns:")
    if X is None:
        print("Design matrix was not built; the error occurred while preparing the data.")
    else:
        # Find problematic columns with object dtype
        object_cols = X.select_dtypes(include=['object']).columns
        if len(object_cols) > 0:
            print(f"Columns with object dtype: {object_cols.tolist()}")
            for col in object_cols:
                print(f"\nUnique values in {col}:")
                print(X[col].unique())

In [None]:
# Fix the regression model specifically addressing the boolean column
dep_var = 'ATSPPHS_Total'
indep_vars = ['Age', 'DSS_Personal', 'DSS_Perceived']
cat_vars = ['Gender', 'Living arrangement', 'Depression']

# Create a clean dataframe for regression
df_reg = df_numeric.copy()

# Ensure all numeric variables are properly converted
for var in indep_vars + [dep_var]:
    if var in df_reg.columns:
        df_reg[var] = pd.to_numeric(df_reg[var], errors='coerce')

# Drop rows with NaN in dependent or independent variables
df_reg = df_reg.dropna(subset=indep_vars + [dep_var])

# Create X matrix manually to ensure all columns are numeric.
# The frame is created on df_reg's index first: assigning the scalar 1.0 to an
# index-less empty frame produces a column with no rows, which turns into
# all-NaN once the next column brings its index along.
X = pd.DataFrame(index=df_reg.index)
X['const'] = 1.0
X['Age'] = df_reg['Age'].astype(float)
X['DSS_Personal'] = df_reg['DSS_Personal'].astype(float)
X['DSS_Perceived'] = df_reg['DSS_Perceived'].astype(float)

# Add dummy variables manually
X['Gender_Male'] = (df_reg['Gender'] == 'Male').astype(float)

# Create living arrangement dummies
# First, get the unique values (in order of first appearance)
living_categories = df_reg['Living arrangement'].unique()
if len(living_categories) > 1:
    # Use the first category as reference
    reference_category = living_categories[0]
    for category in living_categories[1:]:
        # Create dummy variable for each non-reference category
        X[f'Living_{category}'] = (df_reg['Living arrangement'] == category).astype(float)

# Add Depression dummy
X['Depression_yes'] = (df_reg['Depression'] == 'yes').astype(float)

# Define dependent variable
y = df_reg[dep_var].astype(float)

# Check data types
print("X data types after manual conversion:")
print(X.dtypes)

# Fit the model
try:
    model = sm.OLS(y, X).fit()
    
    # Display regression results
    print("\nMultiple Linear Regression: Predictors of ATSPPHS_Total")
    print(model.summary())
    
    # Extract coefficients and confidence intervals
    coefs = model.params
    conf_int = model.conf_int(alpha=0.05)
    std_err = model.bse
    p_values = model.pvalues
    
    # Create a dataframe for regression results
    results_df = pd.DataFrame({
        'Coefficient': coefs,
        'Std. Error': std_err,
        'p-value': p_values,
        '95% CI Lower': conf_int[0],
        '95% CI Upper': conf_int[1],
        'Significant': p_values <= 0.05
    })
    
    print("\nRegression Coefficients with 95% Confidence Intervals:")
    print(results_df)
except Exception as e:
    print(f"Error in regression: {e}")

In [None]:
# Fix the regression model by removing inf/nan values
dep_var = 'ATSPPHS_Total'
indep_vars = ['Age', 'DSS_Personal', 'DSS_Perceived']
cat_vars = ['Gender', 'Living arrangement', 'Depression']

# Create a clean dataframe for regression
df_reg = df_numeric.copy()

# Ensure all numeric variables are properly converted
for var in indep_vars + [dep_var]:
    if var in df_reg.columns:
        df_reg[var] = pd.to_numeric(df_reg[var], errors='coerce')

# Create X matrix manually to ensure all columns are numeric.
# The frame is created on df_reg's index first: assigning the scalar 1.0 to an
# index-less empty frame yields an all-NaN constant, and the finite-value
# filter below would then discard every row.
X = pd.DataFrame(index=df_reg.index)
X['const'] = 1.0
X['Age'] = df_reg['Age'].astype(float)
X['DSS_Personal'] = df_reg['DSS_Personal'].astype(float)
X['DSS_Perceived'] = df_reg['DSS_Perceived'].astype(float)

# Add dummy variables manually
X['Gender_Male'] = (df_reg['Gender'] == 'Male').astype(float)

# Create living arrangement dummies
# First, get the unique values (in order of first appearance)
living_categories = df_reg['Living arrangement'].unique()
if len(living_categories) > 1:
    # Use the first category as reference
    reference_category = living_categories[0]
    for category in living_categories[1:]:
        # Create dummy variable for each non-reference category
        X[f'Living_{category}'] = (df_reg['Living arrangement'] == category).astype(float)

# Add Depression dummy
X['Depression_yes'] = (df_reg['Depression'] == 'yes').astype(float)

# Define dependent variable
y = df_reg[dep_var].astype(float)

# Check for NaN or infinite values
print("Checking for NaN or inf values in predictor variables:")
for col in X.columns:
    nan_count = X[col].isna().sum()
    inf_count = np.isinf(X[col]).sum()
    if nan_count > 0 or inf_count > 0:
        print(f"{col}: {nan_count} NaNs, {inf_count} infs")

# Drop rows with NaN or inf values in any predictor or in the response
mask_finite = np.isfinite(X).all(axis=1) & np.isfinite(y)
X_clean = X[mask_finite]
y_clean = y[mask_finite]

print(f"\nRemoved {len(X) - len(X_clean)} rows with NaN/inf values")
print(f"Final regression data shapes: X: {X_clean.shape}, y: {y_clean.shape}")

# Fit the model
try:
    model = sm.OLS(y_clean, X_clean).fit()
    
    # Display regression results
    print("\nMultiple Linear Regression: Predictors of ATSPPHS_Total")
    print(model.summary())
    
    # Extract coefficients and confidence intervals
    coefs = model.params
    conf_int = model.conf_int(alpha=0.05)
    std_err = model.bse
    p_values = model.pvalues
    
    # Create a dataframe for regression results
    results_df = pd.DataFrame({
        'Coefficient': coefs,
        'Std. Error': std_err,
        'p-value': p_values,
        '95% CI Lower': conf_int[0],
        '95% CI Upper': conf_int[1],
        'Significant': p_values <= 0.05
    })
    
    print("\nRegression Coefficients with 95% Confidence Intervals:")
    print(results_df)
except Exception as e:
    print(f"Error in regression: {e}")

In [None]:
# Fix the regression model by properly creating the const column
dep_var = 'ATSPPHS_Total'
indep_vars = ['Age', 'DSS_Personal', 'DSS_Perceived']
cat_vars = ['Gender', 'Living arrangement', 'Depression']

# Create a clean dataframe for regression
df_reg = df_numeric.copy()

# Ensure all numeric variables are properly converted
for var in indep_vars + [dep_var]:
    if var in df_reg.columns:
        df_reg[var] = pd.to_numeric(df_reg[var], errors='coerce')

# Drop rows with NaN in dependent or independent variables
mask = df_reg[indep_vars + [dep_var]].notna().all(axis=1)
df_reg = df_reg[mask]

print(f"Kept {len(df_reg)} rows after removing NaN values in key variables")

# Create X matrix manually to ensure all columns are numeric.
# X is built on df_reg's own index. A constant created from a bare array would
# give X a fresh 0..n-1 index; after rows have been filtered out that index no
# longer matches df_reg, so the columns assigned below (and y) would misalign.
X = pd.DataFrame(index=df_reg.index)
X['const'] = 1.0
X['Age'] = df_reg['Age'].astype(float)
X['DSS_Personal'] = df_reg['DSS_Personal'].astype(float)
X['DSS_Perceived'] = df_reg['DSS_Perceived'].astype(float)

# Add dummy variables manually
X['Gender_Male'] = (df_reg['Gender'] == 'Male').astype(float)

# Create living arrangement dummies
# First, get the unique values (in order of first appearance)
living_categories = df_reg['Living arrangement'].unique()
if len(living_categories) > 1:
    # Use the first category as reference
    reference_category = living_categories[0]
    for category in living_categories[1:]:
        # Create dummy variable for each non-reference category
        X[f'Living_{category}'] = (df_reg['Living arrangement'] == category).astype(float)

# Add Depression dummy
X['Depression_yes'] = (df_reg['Depression'] == 'yes').astype(float)

# Define dependent variable
y = df_reg[dep_var].astype(float)

print(f"Final regression data shapes: X: {X.shape}, y: {y.shape}")

# Check if we have enough data
if len(X) > 0:
    # Fit the model
    try:
        model = sm.OLS(y, X).fit()
        
        # Display regression results
        print("\nMultiple Linear Regression: Predictors of ATSPPHS_Total")
        print(model.summary())
        
        # Extract coefficients and confidence intervals
        coefs = model.params
        conf_int = model.conf_int(alpha=0.05)
        std_err = model.bse
        p_values = model.pvalues
        
        # Create a dataframe for regression results
        results_df = pd.DataFrame({
            'Coefficient': coefs,
            'Std. Error': std_err,
            'p-value': p_values,
            '95% CI Lower': conf_int[0],
            '95% CI Upper': conf_int[1],
            'Significant': p_values <= 0.05
        })
        
        print("\nRegression Coefficients with 95% Confidence Intervals:")
        print(results_df)
    except Exception as e:
        print(f"Error in regression: {e}")
else:
    print("No data available for regression after filtering out NaN values")

In [None]:
# Table 1: frequency and percentage of each demographic category, plus mean age.
demographic_columns = [
    'Gender',
    'Living arrangement',
    'Marital status',
    'Family system',
    'History of suicide in family',
    'Depression',
    'Field of Study',
]
demographic_vars = {var: df[var].value_counts() for var in demographic_columns}

# Percentages are taken over all participants (value_counts skips missing
# answers, so a variable's percentages need not sum to 100)
n_participants = len(df)

# Collect the rows first and build the frame once, instead of re-concatenating
# the whole table for every row
table_rows = [{
    'Characteristics': 'Age in years, mean (SD)',
    'n (%)': f"{df_numeric['Age'].mean():.2f} ± {df_numeric['Age'].std():.2f}",
}]
for var, counts in demographic_vars.items():
    for category, count in counts.items():
        percentage = count / n_participants * 100
        table_rows.append({
            'Characteristics': f"{var}: {category}",
            'n (%)': f"{count} ({percentage:.1f}%)",
        })

demographics_table = pd.DataFrame(table_rows, columns=['Characteristics', 'n (%)'])

# The sample size in the title follows the data rather than a hard-coded number
print(f"Table 1: Demographic characteristics of participants (n={n_participants})")
print(demographics_table)

In [None]:
# Table 3: scale scores for Medicine/Nursing versus all other fields of study.
# Every field other than 'Medicine/ Nursing' (including a missing field) is
# grouped under 'Other Fields'.
df_numeric['Field_Group'] = (
    df['Field of Study']
    .eq('Medicine/ Nursing')
    .map({True: 'Medicine/Nursing', False: 'Other Fields'})
)

scales = ['ATSPPHS_Openness', 'ATSPPHS_Value', 'ATSPPHS_Total',
          'DSS_Personal', 'DSS_Perceived', 'DSS_Total']

is_medicine = df_numeric['Field_Group'] == 'Medicine/Nursing'
is_other = df_numeric['Field_Group'] == 'Other Fields'

# Collect one record per scale and build the table in a single step
scale_rows = []
for scale in scales:
    total_mean = df_numeric[scale].mean()
    total_sd = df_numeric[scale].std()

    # Drop missing scores up front: mean()/std() skip NaN on their own, but
    # ttest_ind propagates NaN by default and would report a NaN p-value
    med_data = df_numeric.loc[is_medicine, scale].dropna()
    other_data = df_numeric.loc[is_other, scale].dropna()

    # Welch's t-test (unequal variances) between the two field groups
    _, p_val = stats.ttest_ind(med_data, other_data, equal_var=False)

    scale_rows.append({
        'Characteristics': scale,
        'Total': f"{total_mean:.2f} ± {total_sd:.2f}",
        'Medicine/Nursing': f"{med_data.mean():.2f} ± {med_data.std():.2f}",
        'Other Fields': f"{other_data.mean():.2f} ± {other_data.std():.2f}",
        'p-value': f"{p_val:.3f}",
    })

scale_table = pd.DataFrame(
    scale_rows,
    columns=['Characteristics', 'Total', 'Medicine/Nursing', 'Other Fields', 'p-value'],
)

print("Table 3: Comparison of ATSPPHS and DSS scale scores by field of study")
print(scale_table)

In [None]:
# Table 4: agreement vs. disagreement on each ATSPPHS item.
# Both capitalisations of the "Strongly ..." labels are listed on each side so
# differently-cased responses are counted; a spelling that never occurs in the
# data simply matches nothing.
agree_values = ['Strongly agree', 'Strongly Agree', 'Agree']
disagree_values = ['Strongly disagree', 'Strongly Disagree', 'Disagree']

# Separate openness and value items
openness_items = exact_atspphs_openness
value_items = exact_atspphs_value


def _atspphs_section_table(section_title, items):
    """Build one section of the ATSPPHS table: a header row plus one row per item.

    Each item row reports the number (and percentage of all respondents) who
    agreed and who disagreed, and the chi-square p-value comparing
    Medicine/Nursing students with everyone else on agree vs. disagree.

    Reads the module-level ``df``, ``agree_values`` and ``disagree_values``.
    """
    columns = ['S.No', 'Items', 'Agree n (%)', 'Disagree n (%)', 'p-value']
    is_med = df['Field of Study'] == 'Medicine/ Nursing'
    n_total = len(df)

    # Section header row with blank cells, followed by the item rows
    records = [{'S.No': "", 'Items': section_title,
                'Agree n (%)': "", 'Disagree n (%)': "", 'p-value': ""}]

    for i, item in enumerate(items, 1):
        agrees = df[item].isin(agree_values)
        disagrees = df[item].isin(disagree_values)
        agree_count = agrees.sum()
        disagree_count = disagrees.sum()
        agree_pct = agree_count / n_total * 100
        disagree_pct = disagree_count / n_total * 100

        # 2x2 table: field group (rows) by agree/disagree (columns)
        med_agree = (agrees & is_med).sum()
        med_disagree = (disagrees & is_med).sum()
        contingency = np.array([
            [med_agree, med_disagree],
            [agree_count - med_agree, disagree_count - med_disagree],
        ])

        # chi2_contingency raises when a row or column total is zero (an
        # expected frequency would be zero); report NaN instead of aborting
        if (contingency.sum(axis=0) == 0).any() or (contingency.sum(axis=1) == 0).any():
            p_val = float('nan')
        else:
            _, p_val, _, _ = stats.chi2_contingency(contingency)

        # Truncate long item names so the printed table stays readable
        item_name = item if len(item) <= 50 else item[:47] + "..."

        records.append({
            'S.No': i,
            'Items': item_name,
            'Agree n (%)': f"{agree_count} ({agree_pct:.1f}%)",
            'Disagree n (%)': f"{disagree_count} ({disagree_pct:.1f}%)",
            'p-value': f"{p_val:.3f}",
        })

    return pd.DataFrame(records, columns=columns)


# Openness section first, then the Value section
value_table = _atspphs_section_table("Value Scale", value_items)
atspphs_table = pd.concat(
    [_atspphs_section_table("Openness Scale", openness_items), value_table],
    ignore_index=True,
)

print("Table 4: Distribution of ATSPPHS items (agreement vs. disagreement)")
print(atspphs_table)

In [None]:
# Table 5: agreement vs. disagreement on each DSS item.
# Both capitalisations of the "Strongly ..." labels are listed on each side so
# differently-cased responses are counted; a spelling that never occurs in the
# data simply matches nothing.
agree_values = ['Strongly agree', 'Strongly Agree', 'Agree']
disagree_values = ['Strongly disagree', 'Strongly Disagree', 'Disagree']

# Separate personal and perceived stigma items
personal_items = exact_dss_personal
perceived_items = exact_dss_perceived


def _dss_section_table(section_title, items):
    """Build one section of the DSS table: a header row plus one row per item.

    Each item row reports the number (and percentage of all respondents) who
    agreed and who disagreed, and the chi-square p-value comparing
    Medicine/Nursing students with everyone else on agree vs. disagree.

    Reads the module-level ``df``, ``agree_values`` and ``disagree_values``.
    """
    columns = ['S.No', 'Items', 'Agree n (%)', 'Disagree n (%)', 'p-value']
    is_med = df['Field of Study'] == 'Medicine/ Nursing'
    n_total = len(df)

    # Section header row with blank cells, followed by the item rows
    records = [{'S.No': "", 'Items': section_title,
                'Agree n (%)': "", 'Disagree n (%)': "", 'p-value': ""}]

    for i, item in enumerate(items, 1):
        agrees = df[item].isin(agree_values)
        disagrees = df[item].isin(disagree_values)
        agree_count = agrees.sum()
        disagree_count = disagrees.sum()
        agree_pct = agree_count / n_total * 100
        disagree_pct = disagree_count / n_total * 100

        # 2x2 table: field group (rows) by agree/disagree (columns)
        med_agree = (agrees & is_med).sum()
        med_disagree = (disagrees & is_med).sum()
        contingency = np.array([
            [med_agree, med_disagree],
            [agree_count - med_agree, disagree_count - med_disagree],
        ])

        # chi2_contingency raises when a row or column total is zero (an
        # expected frequency would be zero); report NaN instead of aborting
        if (contingency.sum(axis=0) == 0).any() or (contingency.sum(axis=1) == 0).any():
            p_val = float('nan')
        else:
            _, p_val, _, _ = stats.chi2_contingency(contingency)

        # Truncate long item names so the printed table stays readable
        item_name = item if len(item) <= 50 else item[:47] + "..."

        records.append({
            'S.No': i,
            'Items': item_name,
            'Agree n (%)': f"{agree_count} ({agree_pct:.1f}%)",
            'Disagree n (%)': f"{disagree_count} ({disagree_pct:.1f}%)",
            'p-value': f"{p_val:.3f}",
        })

    return pd.DataFrame(records, columns=columns)


# Personal stigma section first, then the perceived stigma section
perceived_table = _dss_section_table("Perceived Stigma Scale", perceived_items)
dss_table = pd.concat(
    [_dss_section_table("Personal Stigma Scale", personal_items), perceived_table],
    ignore_index=True,
)

print("Table 5: Frequency (percentage) of DSS items (agreement vs. disagreement)")
print(dss_table)

In [None]:
# Code to generate predictors table based on regression analysis
# We'll use the regression results we already have for ATSPPHS_Total
# and run new regressions for DSS_Personal and DSS_Perceived

# Empty results frame for Table 6; the rows are added further down in this cell
predictor_columns = [
    'Characteristics',
    'Crude β (95% CI)',
    'Adjusted β (95% CI)',
]
predictors_table = pd.DataFrame(columns=predictor_columns)

# Define a function to run regression and extract results
def run_regression(dep_var, indep_vars, cat_vars):
    """Fit an OLS model of ``dep_var`` on numeric and dummy-coded predictors.

    All data is read from the module-level ``df_numeric`` frame.

    Parameters
    ----------
    dep_var : str
        Column of ``df_numeric`` used as the outcome.
    indep_vars : list of str
        Numeric predictor columns; names missing from ``df_numeric`` are skipped.
    cat_vars : list of str
        Categorical predictors. Only 'Gender', 'Living arrangement',
        'Depression' and 'Field of Study' are recognised; any other name is
        ignored.

    Returns
    -------
    The fitted result of ``sm.OLS(y, X).fit()`` on the rows that have no NaN
    in the outcome or in any design-matrix column.
    """
    # Build the design matrix on df_numeric's own index. Column assignment from
    # a Series aligns on index labels, so starting from an empty frame (which
    # would adopt a fresh 0..n-1 RangeIndex from the constant column) silently
    # misaligns the predictors whenever df_numeric's index is not 0..n-1.
    X = pd.DataFrame(index=df_numeric.index)
    X['const'] = 1.0

    for var in indep_vars:
        if var in df_numeric.columns:
            X[var] = df_numeric[var].astype(float)

    # Dummy-code the categorical predictors
    for var in cat_vars:
        if var == 'Gender':
            X['Gender_Male'] = (df_numeric['Gender'] == 'Male').astype(float)
        elif var == 'Depression':
            X['Depression_yes'] = (df_numeric['Depression'] == 'yes').astype(float)
        elif var in ('Living arrangement', 'Field of Study'):
            prefix = 'Living' if var == 'Living arrangement' else 'Field'
            # The first category encountered is the reference level; every
            # other category gets its own 0/1 column
            levels = df_numeric[var].unique()
            for level in levels[1:]:
                X[f'{prefix}_{level}'] = (df_numeric[var] == level).astype(float)

    # Define dependent variable (shares df_numeric's index with X)
    y = df_numeric[dep_var].astype(float)

    # Keep only complete rows
    mask = X.notna().all(axis=1) & y.notna()
    X_clean = X[mask]
    y_clean = y[mask]

    # Fit model
    model = sm.OLS(y_clean, X_clean).fit()

    return model

# Collect the significant predictors of each outcome into Table 6.
def _significant_predictor_rows(fitted, outcome_label, exact_labels, prefix_labels, alpha=0.05):
    """Return one table record per predictor of ``fitted`` with p-value <= ``alpha``.

    ``exact_labels`` maps a parameter name to its display name and is checked
    first. ``prefix_labels`` is an ordered sequence of ``(marker, replacement)``
    pairs: the first marker found in the parameter name is replaced by its
    replacement. Names matching neither are shown unchanged.

    NOTE(review): the 'Crude β' column repeats the adjusted estimate from the
    multivariable model; no separate single-predictor (crude) model is fitted.
    Confirm whether crude estimates should be computed before publishing.
    """
    intervals = fitted.conf_int()
    records = []
    for var in fitted.params.index:
        # Written as "not (p <= alpha)" so a NaN p-value is skipped as well
        if not (fitted.pvalues[var] <= alpha):
            continue

        var_name = exact_labels.get(var)
        if var_name is None:
            var_name = var
            for marker, replacement in prefix_labels:
                if marker in var:
                    var_name = var.replace(marker, replacement)
                    break

        coef = fitted.params[var]
        estimate = f"{coef:.2f} ({intervals.loc[var, 0]:.2f}, {intervals.loc[var, 1]:.2f})"
        records.append({
            'Characteristics': f"{outcome_label}: {var_name}",
            'Crude β (95% CI)': estimate,
            'Adjusted β (95% CI)': estimate,
        })
    return records


# ATSPPHS_Total: reuse the model fitted in the earlier regression cell when it
# exists, otherwise fit it here
if 'model' in locals() or 'model' in globals():
    atspphs_fit = model
else:
    atspphs_model = run_regression('ATSPPHS_Total',
                                   ['Age', 'DSS_Personal', 'DSS_Perceived'],
                                   ['Gender', 'Living arrangement', 'Depression'])
    atspphs_fit = atspphs_model

predictor_rows = _significant_predictor_rows(
    atspphs_fit,
    "Attitude towards seeking help",
    {'DSS_Personal': 'Personal Depression Stigma',
     'Depression_yes': 'Depression Diagnosis'},
    [('Living_', 'Living arrangement: ')],
)

# The two stigma outcomes share the same predictors and display labels
stigma_exact_labels = {
    'ATSPPHS_Total': 'Attitude Towards Seeking Help',
    'Depression_yes': 'Depression Diagnosis',
}
stigma_prefix_labels = [
    ('Living_', 'Living arrangement: '),
    ('Field_', 'Field of Study: '),
]

# Run regression for DSS_Personal
dss_personal_model = run_regression('DSS_Personal',
                                    ['Age', 'ATSPPHS_Total'],
                                    ['Gender', 'Living arrangement', 'Depression', 'Field of Study'])
predictor_rows += _significant_predictor_rows(
    dss_personal_model, "Personal depression stigma",
    stigma_exact_labels, stigma_prefix_labels,
)

# Run regression for DSS_Perceived
dss_perceived_model = run_regression('DSS_Perceived',
                                     ['Age', 'ATSPPHS_Total'],
                                     ['Gender', 'Living arrangement', 'Depression', 'Field of Study'])
predictor_rows += _significant_predictor_rows(
    dss_perceived_model, "Perceived depression stigma",
    stigma_exact_labels, stigma_prefix_labels,
)

# predictors_table is still the empty frame created at the top of this cell;
# fill it in one step instead of re-concatenating the table for every row
predictors_table = pd.DataFrame(predictor_rows, columns=list(predictors_table.columns))

print("Table 6: Predictors of attitude towards seeking help, and personal and perceived depression stigma")
print(predictors_table)

In [None]:
#table 2

In [None]:
# Table 2: participant characteristics compared across the three fields of study.
categorical_vars = ['Gender', 'Living arrangement', 'Marital status', 'Family system',
                    'History of suicide in family', 'Depression']
continuous_vars = ['Age']

# Table column label -> value stored in df['Field of Study']
field_groups = [
    ('Medicine/Nursing', 'Medicine/ Nursing'),
    ('Engineering/IT', 'Engineering/ IT'),
    ('Arts/Humanities', 'Arts/ Humanities'),
]
# Row masks and group sizes are the same for every variable, so compute them once
field_masks = {label: df['Field of Study'] == raw for label, raw in field_groups}
field_totals = {label: member.sum() for label, member in field_masks.items()}

table_rows = []

# Continuous variables: mean ± SD per field, one-way ANOVA across the fields
for var in continuous_vars:
    samples = {label: df_numeric[member][var] for label, member in field_masks.items()}
    f_stat, p_val = stats.f_oneway(*(sample.dropna() for sample in samples.values()))

    row = {'Characteristics': f"{var} in years, mean (SD)"}
    for label, sample in samples.items():
        row[label] = f"{sample.mean():.1f} ± {sample.std():.1f}"
    row['p-value'] = f"{p_val:.3f}"
    table_rows.append(row)

# Categorical variables: n (%) per field for every category, chi-square per variable
for var in categorical_vars:
    var_rows = []
    for category in df[var].unique():
        is_category = df[var] == category
        row = {'Characteristics': f"{var}: {category}"}
        for label, member in field_masks.items():
            count = (member & is_category).sum()
            total = field_totals[label]
            # An empty field group has no meaningful percentage
            pct = count / total * 100 if total else float('nan')
            row[label] = f"{count} ({pct:.1f}%)"
        row['p-value'] = ""  # filled in on the variable's last row below
        var_rows.append(row)

    # One test per variable. The crosstab uses every value present in
    # 'Field of Study', not only the three fields shown as columns.
    contingency_table = pd.crosstab(df[var], df['Field of Study'])
    chi2, p_val, dof, expected = stats.chi2_contingency(contingency_table)

    # The chi-square p-value is always reported; it is starred when any
    # expected frequency is below 5, where an exact test would be preferable
    low_expected = (expected < 5).any()

    # Attach the p-value to this variable's own last row directly, instead of
    # searching the table for rows whose text happens to contain the name
    if var_rows:
        var_rows[-1]['p-value'] = f"{p_val:.3f}{' *' if low_expected else ''}"
    table_rows.extend(var_rows)

comparison_table = pd.DataFrame(
    table_rows,
    columns=['Characteristics', 'Medicine/Nursing', 'Engineering/IT', 'Arts/Humanities', 'p-value'],
)

print("Table 2: Comparison of characteristics by field of study")
print(comparison_table)

# Add footnote for Fisher's exact test (literal match: '*' is not a regex here)
if comparison_table['p-value'].str.contains('*', regex=False).any():
    print("\n* Fisher's exact test would be more appropriate due to low expected frequencies")

In [None]:
# Table 1 (extended): demographics plus the count of each reported mental health condition.
demographic_columns = [
    'Gender',
    'Living arrangement',
    'Marital status',
    'Family system',
    'History of suicide in family',
    'Family history of diagnosed mental illness in family',
]
demographic_vars = {var: df[var].value_counts() for var in demographic_columns}

# Display name -> response counts of the matching column in df
mental_health_conditions = {
    'Depression': df['Depression'].value_counts(),
    'Generalized Anxiety Disorder (GAD)': df['GAD'].value_counts(),
    'Panic Disorder': df['Panic'].value_counts(),
    'Schizophrenia': df['Schiz'].value_counts(),
    'Bipolar Disorder': df['Bipolar'].value_counts(),
    'No diagnosed mental illness': df['None'].value_counts()
}

# Percentages are taken over all participants
n_participants = len(df)

# Collect the rows first and build the frame once, instead of re-concatenating
# the whole table for every row
table_rows = [{
    'Characteristics': 'Age in years, mean (SD)',
    'n (%)': f"{df_numeric['Age'].mean():.2f} ± {df_numeric['Age'].std():.2f}",
}]

for var, counts in demographic_vars.items():
    for category, count in counts.items():
        percentage = count / n_participants * 100
        table_rows.append({
            'Characteristics': f"{var}: {category}",
            'n (%)': f"{count} ({percentage:.1f}%)",
        })

# Section header row for the mental health conditions
table_rows.append({'Characteristics': "Mental Health Conditions", 'n (%)': ""})

# Only the 'yes' count is reported for each condition; a condition with no
# 'yes' response produces no row at all
for condition, counts in mental_health_conditions.items():
    yes_count = counts.get('yes')
    if yes_count is not None:
        percentage = yes_count / n_participants * 100
        table_rows.append({
            'Characteristics': f"{condition}",
            'n (%)': f"{yes_count} ({percentage:.1f}%)",
        })

demographics_table = pd.DataFrame(table_rows, columns=['Characteristics', 'n (%)'])

# The sample size in the title follows the data rather than a hard-coded number
print(f"Table 1: Demographic characteristics of participants (n={n_participants})")
print(demographics_table)

In [None]:
from scipy.stats import spearmanr
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Function to calculate Cronbach's alpha
def cronbachs_alpha(items):
    """Compute Cronbach's alpha for a set of scale items.

    Parameters
    ----------
    items : pandas.DataFrame
        One column per item, one row per respondent.

    Returns
    -------
    float
        ``k / (k - 1) * (1 - sum(item variances) / variance(row totals))``
        where ``k`` is the number of columns.

    Raises
    ------
    ValueError
        If fewer than two items are supplied (the formula divides by k - 1).

    Notes
    -----
    When the first column has object dtype, every column is label-encoded
    before the variances are taken. LabelEncoder numbers the labels in sorted
    order, which for Likert text is alphabetical rather than ordinal, so pass
    numerically coded items whenever they are available.
    """
    n_items = items.shape[1]
    if n_items < 2:
        raise ValueError("Cronbach's alpha requires at least two items")

    # .iloc makes the positional lookup explicit: dtypes is indexed by column
    # label, and integer-position fallback via [] is deprecated in pandas
    if items.dtypes.iloc[0] == 'object':
        le = LabelEncoder()
        items_numeric = items.apply(lambda x: le.fit_transform(x))
    else:
        items_numeric = items

    # Per-item variances and the variance of each respondent's total score
    item_variances = items_numeric.var(axis=0)
    total_variance = items_numeric.sum(axis=1).var()

    return (n_items / (n_items - 1)) * (1 - item_variances.sum() / total_variance)

# Calculate alpha for each subscale from the numerically coded items in
# df_numeric (the same item columns the factor-analysis cell reads). Passing
# the raw text responses in df would make cronbachs_alpha label-encode them in
# alphabetical order, which does not follow the order of the response scale.
alpha_dss_personal = cronbachs_alpha(df_numeric[exact_dss_personal])
alpha_dss_perceived = cronbachs_alpha(df_numeric[exact_dss_perceived])
alpha_atspphs_openness = cronbachs_alpha(df_numeric[exact_atspphs_openness])
alpha_atspphs_value = cronbachs_alpha(df_numeric[exact_atspphs_value])

print(f"Cronbach's alpha for DSS-Personal: {alpha_dss_personal:.3f}")
print(f"Cronbach's alpha for DSS-Perceived: {alpha_dss_perceived:.3f}")
print(f"Cronbach's alpha for ATSPPHS-Openness: {alpha_atspphs_openness:.3f}")
print(f"Cronbach's alpha for ATSPPHS-Value: {alpha_atspphs_value:.3f}")

In [None]:
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
import matplotlib.pyplot as plt

# Harman's single-factor check over every DSS and ATSPPHS item
all_items = [
    *exact_dss_personal,
    *exact_dss_perceived,
    *exact_atspphs_openness,
    *exact_atspphs_value,
]

# Item responses as stored in df_numeric
df_items = df_numeric.loc[:, all_items]

# Bartlett's test: is the correlation matrix suitable for factoring?
bartlett_result = calculate_bartlett_sphericity(df_items)
chi_square_value, p_value = bartlett_result
print(f"Bartlett's test of sphericity: chi² = {chi_square_value:.3f}, p = {p_value:.10f}")

# Fit an unrotated one-factor solution
fa = FactorAnalyzer(n_factors=1, rotation=None)
fa.fit(df_items)

# First element of the returned pair holds the eigenvalues printed below
ev = fa.get_eigenvalues()[0]
print(f"Eigenvalues: {ev}")

# Share of variance carried by the single factor: sum of squared loadings
# divided by the number of items
loadings = fa.loadings_
explained_variance = (loadings * loadings).sum() / len(loadings)
print(f"Variance explained by a single factor: {explained_variance:.2%}")

# More than half of the variance on one factor is treated as a warning sign
# for common method bias
bias_message = (
    "WARNING: Common method bias may be present (>50% variance explained by a single factor)"
    if explained_variance > 0.5
    else "Common method bias is likely not a major concern (<50% variance explained)"
)
print(bias_message)