In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
import ast
from collections import Counter

# Load the datasets
df = pd.read_csv("ML - Curricular Analytics - PIDM ONLY & Fixed Repeat IND.csv")
parsed = pd.read_csv("formatted_data.csv")


# Convert string representations of lists into actual lists
parsed['Semester Grades'] = parsed['Semester Grades'].apply(lambda x: ast.literal_eval(x))
parsed['Semester Points'] = parsed['Semester Points'].apply(lambda x: ast.literal_eval(x))
parsed['Classes'] = parsed['Classes'].apply(lambda x: ast.literal_eval(x))

# Group the parsed data by 'Pidm' and aggregate relevant fields
data = parsed.groupby('Pidm').agg({
    "ACTE": 'first',
    "ACTR": 'first',
    "ACTS": 'first',
    "ACTM": 'first',
    "EACT": 'first',
    "SAT-ERW": 'first',
    "SATM": 'first',
    "SAT_TOTAL": 'first',
    "HS GPA": 'first',
    'Final GPA': 'first',
    'Semester GPA': lambda x: list(x),
    'Semester Grades': lambda x: sum(x, []),
    'Semester Points': lambda x: sum(x, []),
    'Classes': lambda x: sum(x, []),
}).reset_index()



  df = pd.read_csv("ML - Curricular Analytics - PIDM ONLY & Fixed Repeat IND.csv")


In [19]:
# Display sample data for a specific student
display(df[df["Pidm"] == 454253])
display(parsed[parsed["Pidm"] == 454253])

Unnamed: 0,Pidm,Admit_Code,Admit_Desc,Admit_Term,Admit_Level,Admit_College,Admit_Major_Code,Major_Desc,Trump_Race,Trump_Race_Desc,...,EACT,SAT-ERW,SATM,SAT_TOTAL,Term,CRN,SUBJ,CRSE_NUMB,REPEAT_IND,FINAL_GRADE
3168,454253,SB,Second Baccalaureate,201908,UG,SA,PNR,Pre-Nursing,W,White,...,-,-,-,-,201908.0,82359.0,CHM,2023,,A+
3169,454253,SB,Second Baccalaureate,201908,UG,SA,PNR,Pre-Nursing,W,White,...,-,-,-,-,202101.0,21150.0,NUR,3125,,A
3170,454253,SB,Second Baccalaureate,201908,UG,SA,PNR,Pre-Nursing,W,White,...,-,-,-,-,202101.0,21152.0,NUR,3825,,A+
3171,454253,SB,Second Baccalaureate,201908,UG,SA,PNR,Pre-Nursing,W,White,...,-,-,-,-,202101.0,21153.0,NUR,3066,,A+
3172,454253,SB,Second Baccalaureate,201908,UG,SA,PNR,Pre-Nursing,W,White,...,-,-,-,-,202101.0,21154.0,NUR,3026,,A
3173,454253,SB,Second Baccalaureate,201908,UG,SA,PNR,Pre-Nursing,W,White,...,-,-,-,-,202101.0,21155.0,NUR,3066L,,S
3174,454253,SB,Second Baccalaureate,201908,UG,SA,PNR,Pre-Nursing,W,White,...,-,-,-,-,202101.0,21160.0,NUR,3027L,,S
3175,454253,SB,Second Baccalaureate,201908,UG,SA,PNR,Pre-Nursing,W,White,...,-,-,-,-,202101.0,25163.0,NUR,3026L,,S
3176,454253,SB,Second Baccalaureate,201908,UG,SA,PNR,Pre-Nursing,W,White,...,-,-,-,-,202105.0,51080.0,NUR,3145,,A
3177,454253,SB,Second Baccalaureate,201908,UG,SA,PNR,Pre-Nursing,W,White,...,-,-,-,-,202105.0,56567.0,NUR,3225,,A


Unnamed: 0,Pidm,Semester,Admit_Code,Admit_Level,Admit_College,Admit_Major_Code,Major_Desc,Trump_Race,Trump_Race_Desc,MULTI,...,ACTS,EACT,SAT-ERW,SATM,SAT_TOTAL,Final GPA,Semester GPA,Semester Grades,Semester Points,Classes
1436,454253,201908,SB,UG,SA,PNR,Pre-Nursing,W,White,,...,-,-,-,-,-,3.98,4.0,[A+],[4.0],[CHM2023]
1437,454253,202101,SB,UG,SA,PNR,Pre-Nursing,W,White,,...,-,-,-,-,-,3.98,4.0,"[A, A+, A+, A, S, S, S, A, A+, A+, A, S, S, S]","[4.0, 4.0, 4.0, 4.0, 0.0, 0.0, 0.0, 4.0, 4.0, ...","[NUR3125, NUR3825, NUR3066, NUR3026, NUR3066L,..."
1438,454253,202105,SB,UG,SA,PNR,Pre-Nursing,W,White,,...,-,-,-,-,-,3.98,4.0,"[A, A, A, S, S, A, A, A, S, S]","[4.0, 4.0, 4.0, 0.0, 0.0, 4.0, 4.0, 4.0, 0.0, ...","[NUR3145, NUR3225, NUR4467, NUR4467L, NUR3225L..."
1439,454253,202108,SB,UG,SA,PNR,Pre-Nursing,W,White,,...,-,-,-,-,-,3.98,3.92,"[A-, A+, S, S, A+, A+, A-, A+, S, S, A+, A+]","[3.67, 4.0, 0.0, 0.0, 4.0, 4.0, 3.67, 4.0, 0.0...","[NUR4535, NUR4165, NUR4227L, NUR4535L, NUR4888..."
1440,454253,202201,SB,UG,SA,PNR,Pre-Nursing,W,White,,...,-,-,-,-,-,3.98,4.0,"[S, A+, S, A+, S, A+, S, A+]","[0.0, 4.0, 0.0, 4.0, 0.0, 4.0, 0.0, 4.0]","[NUR4948L, NUR4635, NUR4635L, NUR4827, NUR4948..."


In [20]:
data

Unnamed: 0,Pidm,ACTE,ACTR,ACTS,ACTM,EACT,SAT-ERW,SATM,SAT_TOTAL,HS GPA,Final GPA,Semester GPA,Semester Grades,Semester Points,Classes
0,285,-,-,-,-,-,-,-,-,,3.75,"[3.75, nan]","[A, A-, B+, A, W]","[4.0, 3.67, 3.33, 4.0, 0.0]","[LIS3361, LIS3353, COP2030, CIS4510, LIS4365]"
1,432,-,-,-,-,-,-,-,-,,0.00,[0.0],"[IF, F, F]","[0.0, 0.0, 0.0]","[MUS4930, CCJ3117, IDS4934]"
2,705,31,36,29,24,30,-,-,-,,4.00,[4.0],"[A, A+, A, A, A, A]","[4.0, 4.0, 4.0, 4.0, 4.0, 4.0]","[MCB2000, MCB2000L, HUN2201, BSC2085, BSC2085L..."
3,794,25,25,22,26,25,-,-,-,3.30,3.98,"[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 3.84, 4.0, 4.0,...","[A+, A+, A+, A+, A+, A+, A+, A+, A+, A+, A+, A...","[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, ...","[NUR3805, NUR3078, NUR4895, NUR4828C, NUR4128,..."
4,1089,-,-,-,-,-,-,-,-,,4.00,"[4.0, 4.0, 4.0]","[A, A, A, A, A, A, A]","[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]","[CCJ3621, PSY3213, EXP4680C, CLP4433, CLP4143,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98768,3984703,-,-,-,-,-,510,510,1020,3.64,3.50,[3.5],"[A-, B+]","[3.67, 3.33]","[SLS2901, THE2000]"
98769,3984725,-,-,-,-,-,580,590,1170,3.64,3.34,[3.34],"[B, A-]","[3.0, 3.67]","[HSC2000, SPC2608]"
98770,3984750,18,18,20,17,18,410,340,750,3.29,2.33,[2.33],[C+],[2.33],[MMC3602]
98771,3984772,-,-,-,-,-,450,480,930,0.00,0.00,[0.0],[F],[0.0],[HUM1020]


In [21]:
data.columns

Index(['Pidm', 'ACTE', 'ACTR', 'ACTS', 'ACTM', 'EACT', 'SAT-ERW', 'SATM',
       'SAT_TOTAL', 'HS GPA', 'Final GPA', 'Semester GPA', 'Semester Grades',
       'Semester Points', 'Classes'],
      dtype='object')

In [22]:
# ACT to SAT conversion tables
act_to_sat_conversion_math = {
    "36": 800, "35": 780, "34": 760, '33': 740, "32": 720, "31": 710,
    "30": 700, "29": 680, "28": 660, "27": 640, '26': 610, "25": 590,
    "24": 580, "23": 560, "22": 540, '21': 530, "20": 520, '19': 510,
    "18": 500, "17": 470, '16': 430, "15": 400, "14": 330, "13": 330,
    "12": 310, "11": 280, "10": 260,
}

act_to_sat_conversion_eng = {
    "72": 790, '71': 770, '70': 750, '69': 740, '68': 730, '67': 720,
    "66": 710, '65': 700, '64': 700, '63': 690, '62': 680, '61': 680,
    # More conversions...
}

# Convert ACTM to SATM and keep the highest SATM
def convert_actm_to_satm(row):
    actm = row['ACTM']
    satm = row['SATM']
    if actm != '-' and actm in act_to_sat_conversion_math:
        converted_satm = act_to_sat_conversion_math[actm]
        if satm == '-' or converted_satm > int(satm):
            return converted_satm
    return int(satm) if satm != '-' else '-'

# Convert ACTR and ACTE to SAT-ERW and keep the highest
def convert_actr_acte_to_saterw(row):
    actr, acte = row['ACTR'], row['ACTE']
    sat_erw = row['SAT-ERW']
    if pd.isna(sat_erw):
        sat_erw = '-'
    if actr != '-' and acte != '-' and actr in act_to_sat_conversion_eng and acte in act_to_sat_conversion_eng:
        converted_sum = act_to_sat_conversion_eng[str(int(actr) + int(acte))]
        if sat_erw == '-' or converted_sum > int(sat_erw):
            return converted_sum
    return int(sat_erw) if sat_erw != '-' else '-'

# Apply the conversion functions to the dataset
data['SATM'] = data.apply(convert_actm_to_satm, axis=1)
data['SAT-ERW'] = data.apply(convert_actr_acte_to_saterw, axis=1)

# Filter out rows with missing SAT or HS GPA values and remove duplicates
data = data[(data['SAT-ERW'] != '-') & (data['SATM'] != '-') & (data['HS GPA'].notnull())]
data = data.drop_duplicates(subset='Pidm')


In [23]:
# Count occurrences of grades
print(Counter([letter for sublist in data['Semester Grades'] for letter in sublist]))

# Define failing grades and create a binary 'Fail' column
fail_grades = ['F', 'IF', 'W', 'D-', 'D+', 'FF', 'Z', 'F+', 'F-', 'I']
data['Fail'] = data['Semester Grades'].apply(lambda grades: any(grade in fail_grades for grade in grades)).astype(int)


Counter({'A': 537670, 'B': 224633, 'A+': 200723, 'A-': 156917, 'B+': 107847, 'C': 95581, 'B-': 66417, 'S': 59072, 'C+': 45307, 'F': 44379, 'W': 36865, 'D': 20961, 'C-': 20061, 'WC': 15280, 'D+': 7962, 'D-': 4809, 'U': 3904, 'F*': 1779, 'WE': 1569, 'IF': 1234, 'CF': 1087, 'C*': 855, 'Z': 679, 'B*': 628, 'D*': 466, 'A*': 345, 'B-*': 334, 'C-*': 317, 'C+*': 293, 'FF': 230, 'A-*': 209, 'B+*': 197, 'I': 172, 'D+*': 167, 'CM': 163, 'D-*': 144, 'S*': 62, 'A+*': 61, 'CA': 51, 'CB': 48, 'IF*': 43, 'IU': 29, 'CC': 28, 'CD': 24, 'CB+': 23, 'CB-': 23, 'W*': 21, 'CA-': 16, 'CC-': 15, 'CD-': 13, 'N': 12, 'CC+': 12, 'CU': 11, 'Z*': 11, 'CD+': 8, '-': 4, 'CA+': 4, 'M': 4, 'U*': 3, 'CS': 3, 'C-#': 2, 'D#': 2, 'F#': 2, 'IB': 2, 'B+#': 1, 'B#': 1, 'A#': 1, 'CI': 1, 'IS': 1})


In [28]:
from sklearn.model_selection import train_test_split

# Define feature matrix X and target vector y
X = data[['SAT-ERW', 'SATM', 'HS GPA']] 
y = data['Fail']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [29]:

# Create a pipeline that scales, adds polynomial features, and performs logistic regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),  # Add polynomial features
    ('logreg', LogisticRegression(random_state=42))  # Logistic Regression
])

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'logreg__C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'logreg__solver': ['liblinear', 'lbfgs'],  # Different solvers
    'logreg__penalty': ['l2']  # L2 regularization
}

# Hyperparameter tuning with GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Perform grid search on the training data
grid_search.fit(X_train, y_train)

# Best parameters found by GridSearchCV
print(f"Best parameters: {grid_search.best_params_}")

# Use the best estimator from grid search
best_model = grid_search.best_estimator_

# Cross-validation scores
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(cv_scores):.2f}")

# Train the final model on the full training set
best_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Evaluate the model on the test set
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output results
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{cm}')
print(f'Classification Report:\n{report}')


Best parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear'}
Cross-Validation Accuracy: 0.61
Accuracy: 0.6096789883268483
Confusion Matrix:
[[4091 2329]
 [2486 3430]]
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.64      0.63      6420
           1       0.60      0.58      0.59      5916

    accuracy                           0.61     12336
   macro avg       0.61      0.61      0.61     12336
weighted avg       0.61      0.61      0.61     12336

