# Data transformation

In [None]:
import os
import pyreadstat
import pandas as pd
from datetime import datetime

# Every input and output location is resolved relative to the directory the
# notebook is launched from.
current_directory = os.getcwd()
data_directory = os.path.join(current_directory, '01-data')

# Inputs: the SPSS export and the workbook read for the rename mapping.
sav_file_path = os.path.join(data_directory, 'yaleoutput.sav')
rename_excel_path = os.path.join(data_directory, 'column_rename_mapping.xlsx')

# Output: the transformed data set written at the end of this cell.
output_excel_path = os.path.join(current_directory, '03-output', 'dfyale.xlsx')

# Load the SPSS file. `read_sav` returns the data frame plus a second value
# bound to `meta`, which the visible code does not use afterwards.
df, meta = pyreadstat.read_sav(sav_file_path)

# Remove the two timing columns (raises KeyError if either one is absent).
columns_to_drop = ['starttime', 'endtime']
df.drop(columns=columns_to_drop, inplace=True)

# Build an {OldName: NewName} lookup from the "Mapping" sheet.
rename_df = pd.read_excel(rename_excel_path, sheet_name="Mapping")
old_names = rename_df['OldName']
new_names = rename_df['NewName']
column_rename_mapping = dict(zip(old_names, new_names))

# Apply the lookup; columns without an entry keep their current name.
df = df.rename(columns=column_rename_mapping)

# Discard every column whose (renamed) name ends in '_select'.
select_columns = [col for col in df.columns if col.endswith('_select')]
df = df.drop(columns=select_columns)

# Derive each respondent's age from their birth year.
# NOTE(review): the age is relative to the year in which this script runs, not
# to a fixed survey year, so re-running it in a later year shifts respondents
# between groups -- confirm this is intended.
current_year = datetime.now().year
df['age'] = current_year - df['birthyr']

# Age-group bin edges. `right=False` below makes every interval left-closed,
# so each edge must be the FIRST age of the next group:
#   [0, 18) -> '0-17', [18, 25) -> '18-24', ..., [65, inf) -> '65+'.
# The top bin is open-ended so that ages of 100 and over are still labelled
# '65+' instead of falling outside the bins and becoming NaN.
bins = [0, 18, 25, 35, 45, 55, 65, float('inf')]
labels = ['0-17', '18-24', '25-34', '35-44', '45-54', '55-64', '65+']

# Categorize ages into the groups above; missing ages stay missing.
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# Cast every numeric column to float and keep one decimal place.
# NOTE(review): this applies to ALL numeric columns, so a numeric 'weight'
# column is rounded to one decimal as well -- confirm that loss of precision
# is acceptable for the weighting step further down.
numeric_columns = list(df.select_dtypes(include=['number']).columns)
for name in numeric_columns:
    as_float = df[name].astype(float)
    df[name] = as_float.round(1)

# Load the recode table from the "Grouping" sheet. The code below reads three
# of its columns: 'Suffix', 'Code' and 'Recode'.
grouping_df = pd.read_excel(rename_excel_path, sheet_name="Grouping")

# Values of the 'Suffix' column for which a recode lookup is built. The order
# is the order in which the lookups are applied later on.
recode_keys = [
    '_impact',
    '_confidence',
    '_important',
    '_likelihood',
    '_agreement',
    'gender',
    'hhi',
    'marstat',
    'educ',
    'race',
    'state',
    'employ',
    'Xvar_politics',
    'Xvar_Q04_R01_ed_courses_CS',
    'Xvar_Q04_R02_ed_undergrad_CS',
    'Xvar_Q04_R03_ed_Grad_CS',
    'Xvar_Q04_R04_ed_program_exp',
    'Xvar_Q04_R05_ed_none',
    'Y_Q05_SupportAI',
    'Y_Q17_SupportDevHighlevelAI',
]


def _recode_lookup(table, key):
    """Return {Code: Recode} for the rows of *table* whose 'Suffix' equals *key*.

    The table is filtered once per key; a key with no matching rows yields an
    empty dict.
    """
    rows = table[table['Suffix'] == key]
    return dict(zip(rows['Code'], rows['Recode']))


# One {Code: Recode} dictionary per key, keyed by the column-name suffix
# (or full column name) it is matched against with `str.endswith`.
suffix_mappings = {key: _recode_lookup(grouping_df, key) for key in recode_keys}

# Recode, in place, every column whose name ends with a given suffix
def apply_mapping(df, mapping_dict, suffix):
    """Replace values in all columns of *df* whose name ends with *suffix*.

    Args:
        df: DataFrame to modify in place.
        mapping_dict: {old value: new value} lookup handed to ``Series.replace``.
        suffix: column-name ending that selects the columns to recode.

    Returns:
        None. Matching columns of *df* are overwritten; others are untouched.
    """
    targets = [name for name in df.columns if name.endswith(suffix)]
    for name in targets:
        recoded = df[name].replace(mapping_dict)
        df[name] = recoded

# Run every recode lookup against the columns whose name ends with its key.
# Lookups are applied in the dictionary's insertion order.
for suffix, recode in suffix_mappings.items():
    apply_mapping(df, recode, suffix)

# Columns expanded into one indicator column per distinct value.
columns_to_encode = [
    'Xvar_politics',
    'Xvar_Q04_R01_ed_courses_CS',
    'Xvar_Q04_R02_ed_undergrad_CS',
    'Xvar_Q04_R03_ed_Grad_CS',
    'Xvar_Q04_R04_ed_program_exp',
    'Xvar_Q04_R05_ed_none',
]

# One-hot encode; every level is kept (drop_first=False).
df_dummies = pd.get_dummies(df, columns=columns_to_encode, drop_first=False)

# Store every boolean column as 0/1 integers.
bool_columns = df_dummies.select_dtypes(include=['bool']).columns
df_dummies[bool_columns] = df_dummies[bool_columns].astype(int)

# Columns removed from the output. Names that are not present are skipped
# silently because of errors='ignore'.
columns_to_drop = [
    'birthyr',
    'pid7',
    'votereg',
    'ideo5',
    'newsint',
    'religpew',
    'pew_churatd',
    'pew_bornagain',
    'pew_religimp',
    'pew_prayer',
    'Q03new_treat',
    'q05b_treat',
    'q12a_treat',
    'q12_treat',
    'q15_treat',
    'Xvar_Q05b',
    'Xvar_Q04_R05_ed_none_No',
    'Xvar_Q04_R04_ed_program_exp_No',
    'Xvar_Q04_R03_ed_Grad_CS_No',
    'Xvar_Q04_R02_ed_undergrad_CS_No',
    'Xvar_Q04_R01_ed_courses_CS_No',
]
df_dummies = df_dummies.drop(columns=columns_to_drop, errors='ignore')

# Categorical columns only accept values from their category list, so 'NA' is
# registered as a category before it is used as the fill value.
categorical_columns = df_dummies.select_dtypes(include='category').columns
for name in categorical_columns:
    df_dummies[name] = df_dummies[name].cat.add_categories(['NA'])

# Replace every missing value in the whole frame with the string 'NA'.
# NOTE(review): this is applied to numeric columns too, so a numeric column
# that had missing values now also holds the string 'NA'.
df_dummies = df_dummies.fillna('NA')

# Name of the column holding the per-row weight.
weight_column = 'weight'

# Columns to be weighted: every column whose name starts with "Xvar" or "Y_".
columns_to_weight = [col for col in df_dummies.columns if col.startswith(('Xvar', 'Y_'))]

# Make each of those columns numeric before weighting.
for col in columns_to_weight:
    if isinstance(df_dummies[col].dtype, pd.CategoricalDtype):
        # Categorical columns are replaced by their integer category codes.
        df_dummies[col] = df_dummies[col].cat.codes
    elif pd.api.types.is_bool_dtype(df_dummies[col]):
        # Boolean columns become 0/1.
        df_dummies[col] = df_dummies[col].astype(int)
    # Anything that still is not a number (including the 'NA' fill value)
    # becomes NaN.
    df_dummies[col] = pd.to_numeric(df_dummies[col], errors='coerce')

# The whole frame was filled with the string 'NA' earlier, so a missing weight
# is a string at this point. Coerce the weights to numbers first; otherwise the
# multiplication would either fail or produce non-numeric cells. The 'weight'
# column itself is left unchanged in the output.
weights = pd.to_numeric(df_dummies[weight_column], errors='coerce')

# Multiply every selected column by the row weight; rows with a missing weight
# end up as NaN in the weighted columns.
for col in columns_to_weight:
    df_dummies[col] = df_dummies[col] * weights

# Create the destination folder if needed so that a fresh checkout does not
# fail on the write below (the model section does the same for its own output).
os.makedirs(os.path.dirname(output_excel_path), exist_ok=True)

# Write the transformed frame to Excel without the index column.
df_dummies.to_excel(output_excel_path, index=False)

print(f"DataFrame with renamed columns is saved to {output_excel_path}")


# Model section

In [33]:
import pandas as pd
from semopy import Model
from graphviz import Digraph
import os

# Read back the workbook written by the data-transformation cell.
sheet_name = 'Sheet1'
current_directory = os.getcwd()
survey = os.path.join(current_directory, '03-output', 'dfyale.xlsx')
df = pd.read_excel(survey, sheet_name=sheet_name)

# Column-name endings; all columns sharing an ending are summed into one
# "Grouped_<ending>" variable.
endings = [
    "Govt_War_challenges_important",
    "AI_Challenges_likelihood",
    "AI_employee_Challenge_likelihood",
    "AI_ethics_important",
    "AI_innovation_important",
    "AI_innovation_likelihood",
    "AI_Law_likelihood",
    "AI_risk_impact",
    "AIHarm_likelihood",
    "General_harm_likelihood",
    "General_risk_impact",
    "Govt_War_challenges",
    "Govt_War_likelihood",
    "Mgmt_Govt_confidence",
    "Mgmt_NGO_confidence",
    "Mgmt_Tech_confidence",
    "Natural_Disaster_likelihood",
    "Natural_risk_impact",
    "SocialEco_likelihood",
    "PI_Govt_confidence",
    "PI_NGO_confidence",
    "PI_Tech_confidence",
    "SocialEco_risk_impact",
]

# For each ending, the columns (in frame order) whose name ends with it.
# Endings that match nothing keep an empty list.
grouped_columns = {
    ending: [col for col in df.columns if col.endswith(ending)]
    for ending in endings
}

# Row-wise sum of each non-empty group, stored as a new column.
for ending, members in grouped_columns.items():
    if not members:
        continue
    df[f'Grouped_{ending}'] = df[members].sum(axis=1)

# Outcome column and the grouped predictors that were actually created
# (endings that matched no column are left out).
dependent_columns = ['Y_Q05_SupportAI']
grouped_vars = [f'Grouped_{ending}' for ending in endings if grouped_columns[ending]]

# Without at least one predictor the regression line below would read
# "Y_Q05_SupportAI ~ " with an empty right-hand side, so stop early with an
# explicit message instead of passing a malformed description to the model.
if not grouped_vars:
    raise ValueError(
        "No column matched any of the configured endings; "
        "the SEM model would have no predictors."
    )

# Modelling frame: predictors followed by the outcome, as an independent copy.
final_df = df[grouped_vars + dependent_columns].copy()

# Model description: the outcome regressed on all grouped predictors.
model_desc = f"""
# Structural Model
Y_Q05_SupportAI ~ {' + '.join(grouped_vars)}
"""

# Create the SEM model and load the dataset
model = Model(model_desc)
model.load_dataset(final_df)

# Estimate the model parameters
model.fit()

# Diagram of the model, laid out with graphviz's fdp engine and rendered to PDF.
graph = Digraph(comment='SEM Model', engine='fdp', format='pdf')
graph.attr(overlap='false', splines='true', dpi='300')

# One box per variable: all grouped predictors plus the outcome.
node_style = {'shape': 'box', 'fontsize': '20'}
nodes = grouped_vars + ['Y_Q05_SupportAI']
for node in nodes:
    graph.node(node, node, **node_style)

# One arrow from every grouped predictor to the outcome.
for source in grouped_vars:
    graph.edge(source, 'Y_Q05_SupportAI', fontsize='16')

# Render into the summary folder, creating it if necessary. The path is given
# without an extension; the printed message appends ".pdf" to it.
output_directory = os.path.join(current_directory, '04-summary')
os.makedirs(output_directory, exist_ok=True)
graph_output_path = os.path.join(output_directory, 'sem_model_graph_grouped')
graph.render(graph_output_path)

print(f"Model graph saved to: {graph_output_path}.pdf")


Model graph saved to: /Users/danramirez/mbs-structural-equation-modeling/04-summary/sem_model_graph_grouped.pdf
