# Data transformation

In [28]:
import os
import pyreadstat
import pandas as pd
from datetime import datetime

# Paths to input files and output directories, resolved against the
# directory the script is launched from
current_directory = os.getcwd()
sav_file_path = os.path.join(current_directory, '01-data', 'yaleoutput.sav')
rename_excel_path = os.path.join(current_directory, '01-data', 'column_rename_mapping.xlsx')
output_excel_path = os.path.join(current_directory, '03-output', 'dfyale.xlsx')

# Read the .sav file; read_sav returns the data frame together with a
# metadata object (meta is not used in the visible part of this script)
df, meta = pyreadstat.read_sav(sav_file_path)

# Drop the specified columns
columns_to_drop = ['starttime', 'endtime']
df = df.drop(columns=columns_to_drop)

# Load the rename mapping from the Excel file ("Mapping" sheet with the
# columns 'OldName' and 'NewName')
rename_df = pd.read_excel(rename_excel_path, sheet_name="Mapping")
column_rename_mapping = dict(zip(rename_df['OldName'], rename_df['NewName']))

# Rename the columns
df.rename(columns=column_rename_mapping, inplace=True)

# Drop columns ending in '_select'
select_columns = [col for col in df.columns if col.endswith('_select')]
df.drop(columns=select_columns, inplace=True)

# Get the current year
# NOTE: ages are relative to the year the script is run, so the derived age
# groups can shift when the same survey file is re-processed in a later year.
current_year = datetime.now().year

# Compute ages based on the current year
df['age'] = current_year - df['birthyr']

# Define age group bins with "65+" as the last group.
# The bins are left-closed (right=False below), so each edge is the FIRST age
# of the next group: [0, 18) -> '0-17', [18, 25) -> '18-24', ...,
# [65, inf) -> '65+'. The previous edges (17, 24, ..., 64, 100) pushed the
# boundary ages 17, 24, 34, 44, 54 and 64 into the next group and left ages
# of 100 or more without a label.
bins = [0, 18, 25, 35, 45, 55, 65, float('inf')]
labels = ['0-17', '18-24', '25-34', '35-44', '45-54', '55-64', '65+']

# Categorize ages into the specified groups
df['Xvar_age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# Convert all numerical columns to floats and round to one decimal place
for col in df.select_dtypes(include=['number']).columns:
    df[col] = df[col].astype(float).round(1)

# Load the recode table ("Grouping" sheet); the code below reads its
# 'Suffix', 'Code' and 'Recode' columns
grouping_df = pd.read_excel(rename_excel_path, sheet_name="Grouping")

# Column-name endings (or full column names) that have recode rows in the
# "Grouping" sheet. Each name appears once; 'hhi' used to be listed twice in
# the dict literal, where the second entry silently replaced the first.
recode_suffixes = [
    '_impact',
    '_confidence',
    '_important',
    '_likelihood',
    '_agreement',
    'gender',
    'hhi',
    'marstat',
    'educ',
    'race',
    'state',
    'employ',
    'Xvar_politics',
    'Xvar_Q04_R01_ed_courses_CS',
    'Xvar_Q04_R02_ed_undergrad_CS',
    'Xvar_Q04_R03_ed_Grad_CS',
    'Xvar_Q04_R04_ed_program_exp',
    'Xvar_Q04_R05_ed_none',
    'Y_Q05_SupportAI',
    'Y_Q17_SupportDevHighlevelAI',
]

# Mapping functions by suffix: {suffix: {Code: Recode}}.
# A suffix with no rows in the sheet gets an empty mapping.
suffix_mappings = {}
for suffix in recode_suffixes:
    # Filter the sheet once per suffix instead of once per column lookup
    suffix_rows = grouping_df[grouping_df['Suffix'] == suffix]
    suffix_mappings[suffix] = dict(zip(suffix_rows['Code'], suffix_rows['Recode']))

# Recode, in place, every column whose name ends with the given suffix
def apply_mapping(df, mapping_dict, suffix):
    """Swap coded values for their recoded values in matching columns.

    Each column of ``df`` whose name ends with ``suffix`` is passed through
    ``Series.replace(mapping_dict)`` and written back to ``df``. Columns that
    do not match are not touched. The frame is modified in place and nothing
    is returned.
    """
    for name in df.columns:
        if not name.endswith(suffix):
            continue
        recoded = df[name].replace(mapping_dict)
        df[name] = recoded

# Apply mappings for each suffix (apply_mapping recodes df in place)
for suffix, mapping_dict in suffix_mappings.items():
    apply_mapping(df, mapping_dict, suffix)

# List of columns to be one-hot encoded
columns_to_encode = ['Xvar_age_group','Xvar_politics', 'Xvar_Q04_R01_ed_courses_CS', 'Xvar_Q04_R02_ed_undergrad_CS', 'Xvar_Q04_R03_ed_Grad_CS', 'Xvar_Q04_R04_ed_program_exp', 'Xvar_Q04_R05_ed_none']

# Apply one-hot encoding to categorical columns; each listed column is
# replaced by one indicator column per category (no category is dropped)
df_dummies = pd.get_dummies(df, columns=columns_to_encode, drop_first=False)

# Convert all Boolean columns to 0/1 (get_dummies may return boolean
# indicator columns, depending on the pandas version)
bool_columns = df_dummies.select_dtypes(include=['bool']).columns
df_dummies[bool_columns] = df_dummies[bool_columns].astype(int)

# Columns left out of the export; errors='ignore' skips any that are absent
columns_to_drop = ['birthyr','pid7','votereg','ideo5','newsint','religpew','pew_churatd','pew_bornagain','pew_religimp','pew_prayer','Q03new_treat','q05b_treat','q12a_treat','q12_treat','q15_treat','Xvar_Q05b']

df_dummies = df_dummies.drop(columns=columns_to_drop, errors='ignore')

# Missing values are written to the workbook as the text 'NA'
df_dummies = df_dummies.fillna('NA')

# Make sure the output directory exists; to_excel does not create it and
# would otherwise fail on a fresh checkout
os.makedirs(os.path.dirname(output_excel_path), exist_ok=True)

# Save the modified DataFrame to an Excel file
df_dummies.to_excel(output_excel_path, index=False)

print(f"DataFrame with renamed columns is saved to {output_excel_path}")


DataFrame with renamed columns is saved to /Users/danramirez/mbs-structural-equation-modeling/03-output/dfyale.xlsx


# Model section

In [13]:
import pandas as pd
from sklearn.decomposition import PCA
from semopy import Model
import pydot
import os

# Load the data exported by the transformation section
current_directory = os.getcwd()
survey = os.path.join(current_directory, '03-output', 'dfyale.xlsx')
sheet_name = 'Sheet1'
df = pd.read_excel(survey, sheet_name=sheet_name)

# Identify independent columns (features) for PCA: every numeric column whose
# name starts with 'Xvar'. PCA only accepts numeric input, so non-numeric
# 'Xvar' columns are reported and left out instead of failing in fit_transform.
xvar_columns = [col for col in df.columns if col.startswith('Xvar')]
feature_columns = df[xvar_columns].select_dtypes(include=['number']).columns.tolist()
skipped_columns = [col for col in xvar_columns if col not in feature_columns]
if skipped_columns:
    print(f"Skipping non-numeric feature columns: {skipped_columns}")

# read_excel parses the 'NA' placeholder cells of the workbook back into NaN,
# and PCA rejects NaN input, so only rows with a complete set of feature
# values are used. When nothing is missing this keeps every row.
feature_frame = df[feature_columns].dropna()
dropped_rows = len(df) - len(feature_frame)
if dropped_rows:
    print(f"Dropped {dropped_rows} rows with missing feature values before PCA")
if feature_frame.empty:
    raise ValueError("No rows with complete 'Xvar' feature values are available for PCA")

# Apply PCA to reduce features
num_components = 3  # Adjust based on how much variance you want to capture
pca = PCA(n_components=num_components)
pca_result = pca.fit_transform(feature_frame)

# Create a DataFrame for the PCA-transformed features; it keeps the row index
# of the rows that went into the PCA so it can be aligned with df below
pca_columns = [f'PC{i+1}' for i in range(num_components)]
pca_df = pd.DataFrame(pca_result, columns=pca_columns, index=feature_frame.index)

# Combine with dependent variables (same rows as the PCA scores)
dependent_columns = ['Y_Q05_SupportAI']
final_df = pd.concat(
    [pca_df, df.loc[feature_frame.index, dependent_columns]],
    axis=1,
).reset_index(drop=True)

# Example SEM model with reduced PCA features
model_desc = f"""
# Structural Model
Y_Q05_SupportAI ~ {' + '.join(pca_columns)}
"""

# Create the SEM model and load the dataset
model = Model(model_desc)
model.load_dataset(final_df)

# Optimize the model
# NOTE(review): the fitted parameter estimates are not reported or saved
# anywhere below; the graph is built from the PCA loadings only.
model.fit()

# Create a graph using pydot
graph = pydot.Dot(graph_type='digraph', ratio="0.7", dpi=200)

# Add nodes (features) and edges (relationships) manually
nodes = pca_columns + ['Y_Q05_SupportAI']
for node in nodes:
    graph.add_node(pydot.Node(node))

# Add relationships (edges) between PCA components and dependent variables
dependent_edges = [
    ('Y_Q05_SupportAI', pca_columns)

]

# Add edges to the graph
for target, sources in dependent_edges:
    for source in sources:
        graph.add_edge(pydot.Edge(source, target))

# Extract PCA loadings for each principal component
# (rows = components, columns = original features)
loadings = pd.DataFrame(pca.components_, columns=feature_columns, index=pca_columns)

# Add edges representing the loadings of the original features to each principal component.
# Every feature gets an edge to every component, so the graph grows with
# len(feature_columns) * num_components edges.
for pc, features in loadings.iterrows():
    for feature, score in features.items():
        graph.add_edge(pydot.Edge(feature, pc, label=f"{score:.2f}"))

# Save the graph as a PNG image
output_directory = os.path.join(current_directory, '04-summary')
os.makedirs(output_directory, exist_ok=True)
graph_output_path = os.path.join(output_directory, 'sem_model_graph_unfiltered.png')
graph.write_png(graph_output_path)

# Export PCA scores and loadings to Excel for detailed analysis
with pd.ExcelWriter(os.path.join(output_directory, 'pca_scores_and_loadings.xlsx')) as writer:
    pca_df.to_excel(writer, sheet_name='PCA Scores', index=False)
    loadings.to_excel(writer, sheet_name='PCA Loadings')

# Print the PCA explained variance ratio
explained_variance = pd.DataFrame({
    "Principal Component": pca_columns,
    "Explained Variance (%)": pca.explained_variance_ratio_ * 100
})
print("\nPCA Explained Variance:")
print(explained_variance)

# Save explained variance to the Excel file
explained_variance.to_excel(os.path.join(output_directory, 'pca_explained_variance.xlsx'), index=False)

print(f"Model graph saved to: {graph_output_path}")


KeyboardInterrupt: 