In [3]:
import pandas as pd
# Load the merged state graduation data
grad_rates_path = r"C:\Users\user\Desktop\StateEdFundingImpact\data\merged_state_grads_df.csv"
merged_state_grads_df = pd.read_csv(grad_rates_path, index_col='STATE')

# Load the state funding data
funding_data_path = r"C:\Users\user\Desktop\StateEdFundingImpact\data\state_funding_df.csv"
state_funding_df = pd.read_csv(funding_data_path, index_col=['STATE', 'YEAR'])

# Merge the two datasets
state_funding_impact_df = merged_state_grads_df.merge(state_funding_df, left_index=True, right_index=True)

# Check the first few rows of the merged DataFrame
print(state_funding_impact_df.head())


                 Cohort_2015_2016  Rate_2015_2016  Cohort_2016_2017  \
STATE      YEAR                                                       
ALABAMA    2016          145983.0       76.378635          110458.0   
ALASKA     2016           25039.0       62.145089           25788.0   
ARIZONA    2016          202193.0       65.798584          204878.0   
ARKANSAS   2016           94906.0       76.651163          100439.0   
CALIFORNIA 2016         1305135.0       76.375773         1256965.0   

                 Rate_2016_2017  Cohort_2017_2018  Rate_2017_2018  \
STATE      YEAR                                                     
ALABAMA    2016       81.771401          141949.0       77.201247   
ALASKA     2016       63.026316           26634.0       63.012245   
ARIZONA    2016       64.633005          217413.0       65.936614   
ARKANSAS   2016       77.265913          102892.0       75.852765   
CALIFORNIA 2016       76.089526         1312181.0       74.947489   

                 C

In [6]:
# Reset the index so 'STATE' and 'YEAR' become columns
state_funding_impact_df_reset = state_funding_impact_df.reset_index()

# Display the columns to confirm available value_vars
print(state_funding_impact_df_reset.columns)


Index(['STATE', 'YEAR', 'Cohort_2015_2016', 'Rate_2015_2016',
       'Cohort_2016_2017', 'Rate_2016_2017', 'Cohort_2017_2018',
       'Rate_2017_2018', 'Cohort_2018_2019', 'Rate_2018_2019',
       'Cohort_2019_2020', 'Rate_2019_2020', 'Cohort_2020_2021',
       'Rate_2020_2021', 'Unnamed: 0', 'StateFIP', 'CensusRegion',
       'CensusDivision', 'FiscalEffortPercentage',
       'FiscalEffortIncomePercentage', 'PredictedCostPerPupilState',
       'ActualSpendingPerPupilState', 'StateEnrollment', 'TeacherSalary25_30',
       'NonTeacherSalary25_30', 'TeacherSalary31_40', 'NonTeacherSalary31_40',
       'TeacherSalary41_50', 'NonTeacherSalary41_50', 'TeacherSalary51_60',
       'NonTeacherSalary51_60', 'SalaryParityAge25', 'SalaryParityAge35',
       'SalaryParityAge45', 'SalaryParityAge55', 'TotalTeacherSalary'],
      dtype='object')


In [8]:
# Reset the index so 'STATE' and 'YEAR' become columns
state_funding_impact_df_reset = state_funding_impact_df.reset_index()

# Melt the graduation rate columns
melted_grad_rate_df = state_funding_impact_df_reset.melt(
    id_vars=['STATE', 'YEAR'], 
    value_vars=['Rate_2015_2016', 'Rate_2016_2017', 'Rate_2017_2018', 'Rate_2018_2019', 'Rate_2019_2020', 'Rate_2020_2021'],
    var_name='Year_GradRate', 
    value_name='Grad_Rate'
)

# Display the first few rows to confirm
print(melted_grad_rate_df.head())


        STATE  YEAR   Year_GradRate  Grad_Rate
0     ALABAMA  2016  Rate_2015_2016  76.378635
1      ALASKA  2016  Rate_2015_2016  62.145089
2     ARIZONA  2016  Rate_2015_2016  65.798584
3    ARKANSAS  2016  Rate_2015_2016  76.651163
4  CALIFORNIA  2016  Rate_2015_2016  76.375773


In [10]:
# Reset the index so 'STATE' and 'YEAR' become columns
state_funding_impact_df_reset = state_funding_impact_df.reset_index()

# Assuming we treat ActualSpendingPerPupilState as TotalBudget
melted_budget_df = state_funding_impact_df_reset.melt(
    id_vars=['STATE', 'YEAR'], 
    value_vars=['ActualSpendingPerPupilState'],
    var_name='Year_Budget', 
    value_name='TotalBudget'
)

# Display the first few rows to confirm
print(melted_budget_df.head())


        STATE  YEAR                  Year_Budget  TotalBudget
0     ALABAMA  2016  ActualSpendingPerPupilState     9243.286
1      ALASKA  2016  ActualSpendingPerPupilState    17631.310
2     ARIZONA  2016  ActualSpendingPerPupilState     7530.644
3    ARKANSAS  2016  ActualSpendingPerPupilState     9639.397
4  CALIFORNIA  2016  ActualSpendingPerPupilState    10993.270


In [12]:
# Combine the melted dataframes
combined_melted_df = pd.merge(melted_grad_rate_df, melted_budget_df, on=['STATE', 'YEAR'])

# Save the combined melted DataFrame to a CSV file
combined_melted_df.to_csv(r"C:\Users\user\Desktop\StateEdFundingImpact\data\state_funding_impact_melted.csv", index=False)

# Display the first few rows to confirm
print(combined_melted_df.head())


     STATE  YEAR   Year_GradRate  Grad_Rate                  Year_Budget  \
0  ALABAMA  2016  Rate_2015_2016  76.378635  ActualSpendingPerPupilState   
1  ALABAMA  2016  Rate_2016_2017  81.771401  ActualSpendingPerPupilState   
2  ALABAMA  2016  Rate_2017_2018  77.201247  ActualSpendingPerPupilState   
3  ALABAMA  2016  Rate_2018_2019  78.639080  ActualSpendingPerPupilState   
4  ALABAMA  2016  Rate_2019_2020  78.251689  ActualSpendingPerPupilState   

   TotalBudget  
0     9243.286  
1     9243.286  
2     9243.286  
3     9243.286  
4     9243.286  


# Split the DataFrame into Features and Target

In [14]:
# Define the features (X) and target (y)
features = combined_melted_df.drop(columns=['Grad_Rate'])
target = combined_melted_df['Grad_Rate']

# Convert categorical columns to dummy variables
features = pd.get_dummies(features, drop_first=True)

# Display the first few rows to confirm
print(features.head())


   YEAR  TotalBudget  STATE_ALASKA  STATE_ARIZONA  STATE_ARKANSAS  \
0  2016     9243.286         False          False           False   
1  2016     9243.286         False          False           False   
2  2016     9243.286         False          False           False   
3  2016     9243.286         False          False           False   
4  2016     9243.286         False          False           False   

   STATE_CALIFORNIA  STATE_COLORADO  STATE_CONNECTICUT  STATE_DELAWARE  \
0             False           False              False           False   
1             False           False              False           False   
2             False           False              False           False   
3             False           False              False           False   
4             False           False              False           False   

   STATE_DISTRICT OF COLUMBIA  ...  STATE_VERMONT  STATE_VIRGINIA  \
0                       False  ...          False           False   
1 

# Splitting the Data into Training and Testing Sets

In [16]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets to confirm
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


(1411, 55) (1411,)
(353, 55) (353,)


# Standardizing the Numeric Features

In [29]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the scaled arrays back to dataframes for easier inspection (optional)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Display the first few rows of the scaled training data
print(X_train_scaled.head())


       YEAR  TotalBudget  STATE_ALASKA  STATE_ARIZONA  STATE_ARKANSAS  \
0 -0.286235     1.065427     -0.147389      -0.149879       -0.144859   
1  0.300796     0.609174     -0.147389      -0.149879       -0.144859   
2 -0.286235    -0.126200     -0.147389      -0.149879       -0.144859   
3  0.887828    -0.380356     -0.147389      -0.149879       -0.144859   
4 -1.460298     0.003801     -0.147389      -0.149879       -0.144859   

   STATE_CALIFORNIA  STATE_COLORADO  STATE_CONNECTICUT  STATE_DELAWARE  \
0         -0.144859       -0.147389          -0.149879       -0.147389   
1         -0.144859       -0.147389          -0.149879       -0.147389   
2         -0.144859       -0.147389          -0.149879       -0.147389   
3         -0.144859       -0.147389          -0.149879       -0.147389   
4         -0.144859       -0.147389          -0.149879       -0.147389   

   STATE_DISTRICT OF COLUMBIA  ...  STATE_VERMONT  STATE_VIRGINIA  \
0                   -0.139673  ...      -0.1473

# Save Results

In [30]:
# Save the preprocessed data (optional)
X_train_scaled.to_csv('X_train_preprocessed.csv', index=False)
X_test_scaled.to_csv('X_test_preprocessed.csv', index=False)
y_train.to_csv('y_train_preprocessed.csv', index=False)
y_test.to_csv('y_test_preprocessed.csv', index=False)

print("Preprocessing complete. Data saved and ready for modeling.")

Preprocessing complete. Data saved and ready for modeling.
