# Data transformation

In [2]:
import os
import pyreadstat
import pandas as pd
from datetime import datetime

# Paths to input files and output directories, resolved against the working directory
current_directory = os.getcwd()
sav_file_path = os.path.join(current_directory, '01-data', 'yaleoutput.sav')
rename_excel_path = os.path.join(current_directory, '01-data', 'column_rename_mapping.xlsx')  # Update this path to your rename mapping Excel file
output_excel_path = os.path.join(current_directory, '03-output', 'dfyale.xlsx')

# Read the .sav file; `meta` is the metadata object returned alongside the data
df, meta = pyreadstat.read_sav(sav_file_path)

# Drop the specified columns
columns_to_drop = ['starttime', 'endtime']
df = df.drop(columns=columns_to_drop)

# Load the rename mapping from the "Mapping" sheet (OldName -> NewName)
rename_df = pd.read_excel(rename_excel_path, sheet_name="Mapping")
column_rename_mapping = dict(zip(rename_df['OldName'], rename_df['NewName']))

# Rename the columns
df = df.rename(columns=column_rename_mapping)

# Drop columns ending in '_select'
select_columns = [name for name in df.columns if name.endswith('_select')]
df = df.drop(columns=select_columns)

# Quick dtype / value inspection of the Q06 block and of all float columns
xvar_q06_cols = df.filter(regex='^Xvar_Q06_')
print(xvar_q06_cols.dtypes)
print(df.select_dtypes(include=['float']).head())

# Get the current year
# NOTE(review): age is computed relative to the year the notebook is run, not a
# fixed survey year, so age groups can shift between runs - confirm this is intended.
current_year = datetime.now().year

# Compute ages based on the current year
df['age'] = current_year - df['birthyr']

# Age group edges. With right=False each bin is [lower, upper), so every edge is
# the first age of the next label: [0, 18) -> '0-17', [18, 25) -> '18-24', ...,
# [65, inf) -> '65+'. The open-ended top bin keeps ages of 100 or more in '65+'
# instead of turning them into NaN.
bins = [0, 18, 25, 35, 45, 55, 65, float('inf')]
labels = ['0-17', '18-24', '25-34', '35-44', '45-54', '55-64', '65+']

# Categorize ages into the specified groups
df['Xvar_age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# Convert all numerical columns to floats and round to one decimal place
for col in df.select_dtypes(include=['int', 'float','int64']).columns:
    df[col] = df[col].astype(float).round(1)
# Load the Grouping sheet; the Suffix, Code and Recode columns are read below
grouping_df = pd.read_excel(rename_excel_path, sheet_name="Grouping")

# Keys to build a recode table for, in the order they are later applied.
# Each key is matched against the sheet's 'Suffix' column.
recode_keys = [
    '_impact',
    '_confidence',
    '_important',
    '_likelihood',
    '_agreement',
    'Xvar_gender',
    'Xvar_politics',
    'Xvar_Q04_01_ed_courses_CS',
    'Xvar_Q04_02_ed_undergrad_CS',
    'Xvar_Q04_03_ed_Grad_CS',
    'Xvar_Q04_04_ed_program_exp',
    'Xvar_Q04_05_ed_none',
]

# Mapping functions by suffix: {key: {Code: Recode}} built from the matching rows
suffix_mappings = {}
for recode_key in recode_keys:
    key_rows = grouping_df[grouping_df['Suffix'] == recode_key]
    suffix_mappings[recode_key] = dict(zip(key_rows['Code'], key_rows['Recode']))

# Function to recode columns based on suffix
def apply_mapping(df, mapping_dict, suffix):
    """Recode, in place, every column of ``df`` whose name ends with ``suffix``.

    Args:
        df: DataFrame to modify; matching columns are overwritten.
        mapping_dict: ``{old_value: new_value}`` table passed to ``Series.replace``.
            Values without an entry are left as they are.
        suffix: String the column name must end with to be recoded.

    Returns:
        None. The frame is mutated directly.
    """
    for name in df.columns:
        if not name.endswith(suffix):
            continue
        recoded = df[name].replace(mapping_dict)
        df[name] = recoded

# Apply mappings for each suffix
for suffix, mapping_dict in suffix_mappings.items():
    apply_mapping(df, mapping_dict, suffix)

# List of columns to be one-hot encoded
columns_to_encode = ['Xvar_gender','Xvar_age_group','Xvar_politics', 'Xvar_Q04_01_ed_courses_CS', 'Xvar_Q04_02_ed_undergrad_CS', 'Xvar_Q04_03_ed_Grad_CS', 'Xvar_Q04_04_ed_program_exp', 'Xvar_Q04_05_ed_none']

# Apply one-hot encoding (dummy variables); drop_first=True omits the first
# level of each encoded column, and the whole frame is cast to float.
df_dummies = pd.get_dummies(df, columns=columns_to_encode, drop_first=True).astype(float)

# Numeric columns were already rounded to one decimal place before recoding, so
# no further rounding is applied here and recoded values are written unchanged.

# Helper columns used only to derive the age group are not exported
columns_to_drop = ['birthyr', 'age']
df_dummies = df_dummies.drop(columns=columns_to_drop, errors='ignore')

# Make sure the output folder exists so the export does not fail on a fresh checkout
os.makedirs(os.path.dirname(output_excel_path), exist_ok=True)

# Save the modified DataFrame to an Excel file
df_dummies.to_excel(output_excel_path, index=False)


print(f"DataFrame with renamed columns is saved to {output_excel_path}")


Xvar_Q06_01_PublicInterest_military_confidence                 float64
Xvar_Q06_02_PublicInterest_CilivianGovt_confidence             float64
Xvar_Q06_03_PublicInterest_NSA_confidence                      float64
Xvar_Q06_04_PublicInterest_FBI_confidence                      float64
Xvar_Q06_05_PublicInterest_CIA_confidence                      float64
Xvar_Q06_06_PublicInterest_NATO_confidence                     float64
Xvar_Q06_07_PublicInterest_InternationalResearch_confidence    float64
Xvar_Q06_08_PublicInterest_TechCos_confidence                  float64
Xvar_Q06_09_PublicInterest_Google_confidence                   float64
Xvar_Q06_10_PublicInterest_Facebook_confidence                 float64
Xvar_Q06_11_PublicInterest_Apple_confidence                    float64
Xvar_Q06_12_PublicInterest_MSFT_confidence                     float64
Xvar_Q06_13_PublicInterest_Amazon_confidence                   float64
Xvar_Q06_14_PublicInterest_nonproffit_confidence               float64
Xvar_Q

# Model section

In [None]:
# Example code only; it will be replaced with the real model equation.
# Fits a placeholder model with semopy and prints the fit result and the
# output of model.inspect().
# NOTE(review): `data` is not assigned in any visible cell (the read_csv line
# below is commented out), so this cell raises NameError on a fresh kernel
# unless `data` is defined first.
from semopy import Model
import pandas as pd

# Example model specification, passed to semopy as a multi-line string.
# The text inside the string is the model description, not Python comments.
model_desc = """
    # Measurement model
    eta1 =~ y1 + y2 + y3
    eta2 =~ y4 + y5 + y6
    
    # Structural model
    eta2 ~ eta1
"""

# Assuming you have a DataFrame 'data' with your observed variables y1, y2, y3, y4, y5, y6
# data = pd.read_csv('your_data.csv')

# Initialize and fit the model
model = Model(model_desc)
result = model.fit(data)

# Check out the results
print(result)
# For more detailed output
print(model.inspect())
