In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns


In [9]:
# Load the datasets
completions = pd.read_csv('IPEDS_Completions.csv')
institutions = pd.read_csv('IPEDS_IC.csv')
Directory = pd.read_csv('IPEDS_Directory.csv')
demographics = pd.read_csv('IPEDS_EFFY.csv')


In [10]:
institutions.head()

Unnamed: 0,UNITID,PEO1ISTR,PEO2ISTR,PEO3ISTR,PEO4ISTR,PEO5ISTR,PEO6ISTR,PEO7ISTR,CNTLAFFI,PUBPRIME,...,ASSOC5,ASSOC6,SPORT1,CONFNO1,SPORT2,CONFNO2,SPORT3,CONFNO3,SPORT4,CONFNO4
0,100654,0,1,0,0,0,0,0,1,2,...,0,0,1,133,1,133,1,133,1,133
1,100663,0,1,1,0,0,0,0,1,2,...,0,0,1,111,1,111,1,111,1,111
2,100690,0,1,0,0,0,0,0,4,-2,...,0,0,2,-2,2,-2,2,-2,2,-2
3,100706,0,1,1,1,0,0,0,1,2,...,0,0,2,-2,1,146,1,146,1,146
4,100724,1,1,0,0,0,1,1,1,2,...,0,0,1,133,1,133,1,133,1,133


In [11]:
demographics.head()

Unnamed: 0,UNITID,EFFYALEV,EFFYLEV,LSTUDY,XEYTOTLT,EFYTOTLT,XEYTOTLM,EFYTOTLM,XEYTOTLW,EFYTOTLW,...,XEYNRALW,EFYNRALW,XEFYGUUN,EFYGUUN,XEFYGUAN,EFYGUAN,XEFYGUTO,EFYGUTOT,XEFYGUKN,EFYGUKN
0,100654,1,1,999,R,6681,R,2666,R,4015,...,R,48,R,14.0,A,,R,14.0,R,6667.0
1,100654,2,2,1,R,5663,R,2337,R,3326,...,R,24,R,8.0,A,,R,8.0,R,5655.0
2,100654,3,-2,1,R,5621,R,2323,R,3298,...,R,24,A,,A,,A,,A,
3,100654,4,-2,1,R,1680,R,722,R,958,...,R,4,A,,A,,A,,A,
4,100654,5,-2,1,R,3941,R,1601,R,2340,...,R,20,A,,A,,A,,A,


In [12]:
completions.head()

Unnamed: 0,UNITID,CIPCODE,MAJORNUM,AWLEVEL,XCTOTALT,CTOTALT,XCTOTALM,CTOTALM,XCTOTALW,CTOTALW,...,XCUNKNM,CUNKNM,XCUNKNW,CUNKNW,XCNRALT,CNRALT,XCNRALM,CNRALM,XCNRALW,CNRALW
0,100654,1.0999,1,5,R,9,R,3,R,6,...,Z,0,Z,0,Z,0,Z,0,Z,0
1,100654,1.1001,1,5,R,7,R,3,R,4,...,Z,0,Z,0,Z,0,Z,0,Z,0
2,100654,1.1001,1,7,R,7,R,1,R,6,...,Z,0,R,2,R,2,R,1,R,1
3,100654,1.1001,1,17,R,3,R,1,R,2,...,Z,0,Z,0,Z,0,Z,0,Z,0
4,100654,1.9999,1,5,R,1,R,1,Z,0,...,Z,0,Z,0,Z,0,Z,0,Z,0


In [17]:
# Filter for public universities
public_universities = institutions[institutions['Sector'].str.contains('Public', na=False)]
public_university_ids = public_universities['UNITID'].unique()

KeyError: 'Sector'

In [4]:

# Filter completions for public universities and STEM fields
stem_fields = ['CIP code for STEM fields']  # Replace with actual CIP codes for STEM fields
completions_stem = completions[(completions['Institution_ID'].isin(public_university_ids)) &
                               (completions['CIP_code'].isin(stem_fields))]


KeyError: 'Institution_ID'

In [16]:

# Merge completions with demographic data
completions_demographics = pd.merge(completions_stem, demographics, on='Institution_ID')

# Feature Engineering
features = completions_demographics[['Year', 'Gender', 'Ethnicity', 'Age', 'Institution_Size', 'Funding']]
target = completions_demographics['Completions']


NameError: name 'completions_stem' is not defined

In [None]:

# Handle categorical variables
categorical_features = ['Gender', 'Ethnicity']
numerical_features = ['Year', 'Age', 'Institution_Size', 'Funding']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)])

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Define the model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Plot the results
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred)
plt.xlabel('Actual Completions')
plt.ylabel('Predicted Completions')
plt.title('Actual vs Predicted Completions')
plt.show()
