In [1]:
import pandas as pd
from sqlalchemy import create_engine, inspect,text
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Database path in the schema folder
db_path = '../Schema/student_database.db'

engine = create_engine(f'sqlite:///{db_path}')

with engine.connect() as conn:
    print("Connected to the database.")
    
    inspector = inspect(engine)
    tables = inspector.get_table_names()
    print("Tables in the database:", tables)

Connected to the database.
Tables in the database: ['student_performance']


In [5]:
query = "SELECT * FROM student_performance;"
df = pd.read_sql(query, con=engine)
df.head()

Unnamed: 0,Student ID,Gender,Age Group,Living Situation,Parent Education Level,Family Financial Status,Class,Department,Performance in English,Performance in Maths,...,School Learning Resources,Motivation for Attending School,Plan to Attend University,JAMB Goal Score,Extracurricular Hours,Free Time Activities,Language at Home,Involved in Work Outside School,Last Exam Performance,Foundational Knowledge
0,1001,Female,15 - 16,Both parents,Post-graduate education,We meet our needs but can't afford luxuries,SS2,Science,Excellent,Good,...,Library,To learn and gain knowledge,Yes,200 To 250,Less than 5 hours,Doing chores at home,Yoruba,Frequently,60% - 69%,Average
1,1002,Female,15 - 16,Both parents,Higher Education,We can afford some luxuries,SS3,Science,Good,Good,...,Library,To prepare for a good career,Yes,300 And Above,5 - 10 hours,Others,Yoruba,Occasionally,70% - 100%,Strong
2,1003,Male,15 - 16,Both parents,Post-graduate education,We meet our needs but can't afford luxuries,SS3,Science,Fair,Fair,...,Nil,To prepare for a good career,Not Sure,200 To 250,More than 10 hours,Playing sports,English,No,60% - 69%,Average
3,1004,Male,12 - 14,Both parents,Higher Education,We can afford some luxuries,SS3,Science,Good,Excellent,...,"Computer lab, Extra tutoring sessions",To learn and gain knowledge,Yes,300 And Above,Less than 5 hours,"Studying,Socializing with friends,Watching mov...",English,No,70% - 100%,Average
4,1005,Male,15 - 16,Single parent,Higher Education,We are financially comfortable,SS3,Science,Good,Fair,...,"Computer lab, Extra tutoring sessions",To prepare for a good career,Yes,200 To 250,Less than 5 hours,"Watching movies,Playing sports,Doing chores at...",English,No,60% - 69%,Strong


In [7]:
df.shape

(100, 47)

In [9]:
df['Last Exam Performance']

0      60% - 69%
1     70% - 100%
2      60% - 69%
3     70% - 100%
4      60% - 69%
         ...    
95     60% - 69%
96    70% - 100%
97     40% - 59%
98     60% - 69%
99     60% - 69%
Name: Last Exam Performance, Length: 100, dtype: object

In [11]:
# To categorize new performance column into Likely to Succeed or Likely to Fail based on last performance

df['Next Exam Prediction'] = None  # Initialize the column with None or empty values

# Update based on conditions using loc[]
df.loc[df['Last Exam Performance'] == 'Below 40%', 'Next Exam Prediction'] = 'Likely To Fail'
df.loc[df['Last Exam Performance'] == '40% To 59%', 'Next Exam Prediction'] = 'Likely To Fail'
df.loc[df['Last Exam Performance'] == '60% - 69%', 'Next Exam Prediction'] = 'Likely To Succeed'
df.loc[df['Last Exam Performance'] == '70% - 100%', 'Next Exam Prediction'] = 'Likely to Succeed'

# Display the updated DataFrame
df

Unnamed: 0,Student ID,Gender,Age Group,Living Situation,Parent Education Level,Family Financial Status,Class,Department,Performance in English,Performance in Maths,...,Motivation for Attending School,Plan to Attend University,JAMB Goal Score,Extracurricular Hours,Free Time Activities,Language at Home,Involved in Work Outside School,Last Exam Performance,Foundational Knowledge,Next Exam Prediction
0,1001,Female,15 - 16,Both parents,Post-graduate education,We meet our needs but can't afford luxuries,SS2,Science,Excellent,Good,...,To learn and gain knowledge,Yes,200 To 250,Less than 5 hours,Doing chores at home,Yoruba,Frequently,60% - 69%,Average,Likely To Succeed
1,1002,Female,15 - 16,Both parents,Higher Education,We can afford some luxuries,SS3,Science,Good,Good,...,To prepare for a good career,Yes,300 And Above,5 - 10 hours,Others,Yoruba,Occasionally,70% - 100%,Strong,Likely to Succeed
2,1003,Male,15 - 16,Both parents,Post-graduate education,We meet our needs but can't afford luxuries,SS3,Science,Fair,Fair,...,To prepare for a good career,Not Sure,200 To 250,More than 10 hours,Playing sports,English,No,60% - 69%,Average,Likely To Succeed
3,1004,Male,12 - 14,Both parents,Higher Education,We can afford some luxuries,SS3,Science,Good,Excellent,...,To learn and gain knowledge,Yes,300 And Above,Less than 5 hours,"Studying,Socializing with friends,Watching mov...",English,No,70% - 100%,Average,Likely to Succeed
4,1005,Male,15 - 16,Single parent,Higher Education,We are financially comfortable,SS3,Science,Good,Fair,...,To prepare for a good career,Yes,200 To 250,Less than 5 hours,"Watching movies,Playing sports,Doing chores at...",English,No,60% - 69%,Strong,Likely To Succeed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1096,Female,15 - 16,Both parents,Higher Education,We can afford some luxuries,SS2,Humanities,Good,Good,...,To learn and gain knowledge,Yes,250 To 300,5 - 10 hours,Doing chores at home,English,No,60% - 69%,Average,Likely To Succeed
96,1097,Male,15 - 16,Both parents,Secondary school,We meet our needs but can't afford luxuries,SS1,Humanities,Excellent,Fair,...,To learn and gain knowledge,Yes,250 To 300,Less than 5 hours,Watching movies,English,No,70% - 100%,Average,Likely to Succeed
97,1098,Female,15 - 16,Single parent,Secondary school,We meet our needs but can't afford luxuries,SS2,Humanities,Fair,Fair,...,To learn and gain knowledge,Yes,200 To 250,Less than 5 hours,Studying,Igbo,No,40% - 59%,Weak,
98,1099,Male,15 - 16,Both parents,Higher Education,We can afford some luxuries,SS1,Humanities,Good,Good,...,To learn and gain knowledge,Yes,200 To 250,Less than 5 hours,Doing chores at home,English,Frequently,60% - 69%,Strong,Likely To Succeed


In [13]:
df.shape

(100, 48)

In [15]:
from sklearn.model_selection import train_test_split

# Splitting 70% for training and 30% for temp (validation + testing)
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)

val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Check the number of rows in each set
print(f"Training set: {len(train_df)} rows")
print(f"Validation set: {len(val_df)} rows")
print(f"Test set: {len(test_df)} rows")

Training set: 70 rows
Validation set: 15 rows
Test set: 15 rows


## Identifying Input and Target Columns:

In [18]:
input_cols = [
    "Age Group",
    "Living Situation",
    "Parent Education Level",
    "Family Financial Status",
    "Class",
    "Department",
    "Performance in English",
    "Performance in Maths",
    "Study Hours per Week",
    "Extra Tutoring",
    "School Attendance",
    "Extracurricular Participation",
    "Considered Dropping Out",
    "Experienced Bullying",
    "Comfort in School Environment",
    "Stress About Schoolwork",
    "Confidence in Academic Ability",
    "Access to Counseling",
    "Motivation for Attending School",
    "Plan to Attend University",
    "JAMB Goal Score",
    "Foundational Knowledge",
    "Last Exam Performance"
]

In [22]:
input_cols = input_cols
target_col = "Next Exam Prediction"

Creating inputs and targets for the training, validation and test sets for further processing and model training.

In [25]:
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()

In [27]:
val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_col].copy()

In [29]:
test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_col].copy()

In [33]:
train_inputs.sample(5)

Unnamed: 0,Age Group,Living Situation,Parent Education Level,Family Financial Status,Class,Department,Performance in English,Performance in Maths,Study Hours per Week,Extra Tutoring,...,Experienced Bullying,Comfort in School Environment,Stress About Schoolwork,Confidence in Academic Ability,Access to Counseling,Motivation for Attending School,Plan to Attend University,JAMB Goal Score,Foundational Knowledge,Last Exam Performance
41,15 - 16,Both parents,Secondary school,We meet our needs but can't afford luxuries,SS2,Commercial,Good,Fair,Less than 5 hours,No,...,Occasionally,Somewhat Comfortable,Sometimes,Somewhat confident,I'm not sure,To avoid being bored at home,Yes,250 To 300,Weak,40% - 59%
61,15 - 16,Both parents,Post-graduate education,We are financially comfortable,SS3,Science,Fair,Excellent,5 - 10 hours,Occasionally,...,Never,Comfortable,Sometimes,Very confident,I'm not sure,To learn and gain knowledge,Yes,300 And Above,Strong,70% - 100%
38,15 - 16,Both parents,Post-graduate education,We can afford some luxuries,SS2,Humanities,Excellent,Poor,Less than 5 hours,Occasionally,...,Never,Somewhat Comfortable,Sometimes,Not confident,I'm not sure,To prepare for a good career,Yes,300 And Above,Strong,60% - 69%
97,15 - 16,Single parent,Secondary school,We meet our needs but can't afford luxuries,SS2,Humanities,Fair,Fair,Less than 5 hours,No,...,Never,Somewhat Comfortable,Sometimes,Somewhat confident,No,To learn and gain knowledge,Yes,200 To 250,Weak,40% - 59%
87,15 - 16,Both parents,Secondary school,We meet our needs but can't afford luxuries,SS1,Science,Good,Fair,Less than 5 hours,Regularly,...,Frequently,Comfortable,Sometimes,Somewhat confident,Yes,To learn and gain knowledge,Yes,250 To 300,Average,40% - 59%


In [35]:
train_targets

11    Likely To Succeed
47    Likely to Succeed
85    Likely To Succeed
28    Likely To Succeed
93    Likely to Succeed
            ...        
60                 None
71    Likely To Succeed
14    Likely to Succeed
92    Likely to Succeed
51    Likely to Succeed
Name: Next Exam Prediction, Length: 70, dtype: object

Identifing which of the columns are numerical and which ones are categorical

In [42]:
import numpy as np
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes('object').columns.tolist()

In [48]:
numeric_cols

[]

No  Numeric Cols

In [51]:
categorical_cols

['Age Group',
 'Living Situation',
 'Parent Education Level',
 'Family Financial Status',
 'Class',
 'Department',
 'Performance in English',
 'Performance in Maths',
 'Study Hours per Week',
 'Extra Tutoring',
 'School Attendance',
 'Extracurricular Participation',
 'Considered Dropping Out',
 'Experienced Bullying',
 'Comfort in School Environment',
 'Stress About Schoolwork',
 'Confidence in Academic Ability',
 'Access to Counseling',
 'Motivation for Attending School',
 'Plan to Attend University',
 'JAMB Goal Score',
 'Foundational Knowledge',
 'Last Exam Performance']

In [53]:
train_inputs[categorical_cols].nunique()

Age Group                          3
Living Situation                   3
Parent Education Level             5
Family Financial Status            4
Class                              3
Department                         3
Performance in English             3
Performance in Maths               4
Study Hours per Week               4
Extra Tutoring                     3
School Attendance                  3
Extracurricular Participation      3
Considered Dropping Out            3
Experienced Bullying               3
Comfort in School Environment      3
Stress About Schoolwork            4
Confidence in Academic Ability     3
Access to Counseling               3
Motivation for Attending School    5
Plan to Attend University          3
JAMB Goal Score                    3
Foundational Knowledge             3
Last Exam Performance              4
dtype: int64

Encoding Categorical Data

In [66]:
from sklearn.preprocessing import OneHotEncoder

# Using sparse_output instead of sparse
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(df[categorical_cols])

In [68]:
encoder.categories_

[array(['12 - 14', '15 - 16', '17 - 18'], dtype=object),
 array(['Both parents', 'Guardian', 'Single parent'], dtype=object),
 array(['Higher Education', 'No formal education',
        'Post-graduate education', 'Primary school', 'Secondary school'],
       dtype=object),
 array(['We are financially comfortable', 'We can afford some luxuries',
        "We meet our needs but can't afford luxuries",
        'We struggle to meet basic needs'], dtype=object),
 array(['SS1', 'SS2', 'SS3'], dtype=object),
 array(['Commercial', 'Humanities', 'Science'], dtype=object),
 array(['Excellent', 'Fair', 'Good'], dtype=object),
 array(['Excellent', 'Fair', 'Good', 'Poor'], dtype=object),
 array(['5 - 10 hours', 'Less than 5 hours', 'More than 10 hours',
        'More than 15 hours'], dtype=object),
 array(['No', 'Occasionally', 'Regularly'], dtype=object),
 array(['Every day', 'Most days', 'Occasionally'], dtype=object),
 array(['Not at all', 'Occasionally', 'Regularly'], dtype=object),
 array(['Neve

In [70]:
# We can generate column names for each individual category using get_feature_names_out.

encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
print(encoded_cols)

['Age Group_12 - 14', 'Age Group_15 - 16', 'Age Group_17 - 18', 'Living Situation_Both parents', 'Living Situation_Guardian', 'Living Situation_Single parent', 'Parent Education Level_Higher Education', 'Parent Education Level_No formal education', 'Parent Education Level_Post-graduate education', 'Parent Education Level_Primary school', 'Parent Education Level_Secondary school', 'Family Financial Status_We are financially comfortable', 'Family Financial Status_We can afford some luxuries', "Family Financial Status_We meet our needs but can't afford luxuries", 'Family Financial Status_We struggle to meet basic needs', 'Class_SS1', 'Class_SS2', 'Class_SS3', 'Department_Commercial', 'Department_Humanities', 'Department_Science', 'Performance in English_Excellent', 'Performance in English_Fair', 'Performance in English_Good', 'Performance in Maths_Excellent', 'Performance in Maths_Fair', 'Performance in Maths_Good', 'Performance in Maths_Poor', 'Study Hours per Week_5 - 10 hours', 'Study 

In [76]:
# All of the above columns will be added to train_inputs, val_inputs and test_inputs.

# To perform the encoding, we use the transform method of encoder.

train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
val_inputs[encoded_cols] = encoder.transform(val_inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])
# verifing that these new columns have been added to our training, test and validation sets.

pd.set_option('display.max_columns', None)

test_inputs.head()

(15, 102)

In [78]:
# Saving Processed Data to Disk

print('train_inputs:', train_inputs.shape)
print('train_targets:', train_targets.shape)
print('val_inputs:', val_inputs.shape)
print('val_targets:', val_targets.shape)
print('test_inputs:', test_inputs.shape)
print('test_targets:', test_targets.shape)

train_inputs: (70, 102)
train_targets: (70,)
val_inputs: (15, 102)
val_targets: (15,)
test_inputs: (15, 102)
test_targets: (15,)


In [80]:
!pip install pyarrow --quiet

train_inputs.to_parquet('train_inputs.parquet')
val_inputs.to_parquet('val_inputs.parquet')
test_inputs.to_parquet('test_inputs.parquet')

In [81]:
%%time
pd.DataFrame(train_targets).to_parquet('train_targets.parquet')
pd.DataFrame(val_targets).to_parquet('val_targets.parquet')
pd.DataFrame(test_targets).to_parquet('test_targets.parquet')

CPU times: total: 46.9 ms
Wall time: 445 ms


In [85]:
# Can read the data back using pd.read_parquet.
train_inputs = pd.read_parquet('train_inputs.parquet')
val_inputs = pd.read_parquet('val_inputs.parquet')
test_inputs = pd.read_parquet('test_inputs.parquet')

train_targets = pd.read_parquet('train_targets.parquet')[target_col]
val_targets = pd.read_parquet('val_targets.parquet')[target_col]
test_targets = pd.read_parquet('test_targets.parquet')[target_col]

In [86]:
# verifying that the data was loaded properly.

print('train_inputs:', train_inputs.shape)
print('train_targets:', train_targets.shape)
print('val_inputs:', val_inputs.shape)
print('val_targets:', val_targets.shape)
print('test_inputs:', test_inputs.shape)
print('test_targets:', test_targets.shape)

train_inputs: (70, 102)
train_targets: (70,)
val_inputs: (15, 102)
val_targets: (15,)
test_inputs: (15, 102)
test_targets: (15,)


In [87]:
val_inputs.sample(3)

Unnamed: 0,Age Group,Living Situation,Parent Education Level,Family Financial Status,Class,Department,Performance in English,Performance in Maths,Study Hours per Week,Extra Tutoring,School Attendance,Extracurricular Participation,Considered Dropping Out,Experienced Bullying,Comfort in School Environment,Stress About Schoolwork,Confidence in Academic Ability,Access to Counseling,Motivation for Attending School,Plan to Attend University,JAMB Goal Score,Foundational Knowledge,Last Exam Performance,Age Group_12 - 14,Age Group_15 - 16,Age Group_17 - 18,Living Situation_Both parents,Living Situation_Guardian,Living Situation_Single parent,Parent Education Level_Higher Education,Parent Education Level_No formal education,Parent Education Level_Post-graduate education,Parent Education Level_Primary school,Parent Education Level_Secondary school,Family Financial Status_We are financially comfortable,Family Financial Status_We can afford some luxuries,Family Financial Status_We meet our needs but can't afford luxuries,Family Financial Status_We struggle to meet basic needs,Class_SS1,Class_SS2,Class_SS3,Department_Commercial,Department_Humanities,Department_Science,Performance in English_Excellent,Performance in English_Fair,Performance in English_Good,Performance in Maths_Excellent,Performance in Maths_Fair,Performance in Maths_Good,Performance in Maths_Poor,Study Hours per Week_5 - 10 hours,Study Hours per Week_Less than 5 hours,Study Hours per Week_More than 10 hours,Study Hours per Week_More than 15 hours,Extra Tutoring_No,Extra Tutoring_Occasionally,Extra Tutoring_Regularly,School Attendance_Every day,School Attendance_Most days,School Attendance_Occasionally,Extracurricular Participation_Not at all,Extracurricular Participation_Occasionally,Extracurricular Participation_Regularly,Considered Dropping Out_Never,Considered Dropping Out_Often,Considered Dropping Out_Sometimes,Experienced Bullying_Frequently,Experienced Bullying_Never,Experienced Bullying_Occasionally,Comfort in School Environment_Comfortable,Comfort in School Environment_Somewhat Comfortable,Comfort in School Environment_Uncomfortable,Stress About Schoolwork_Always,Stress About Schoolwork_Never,Stress About Schoolwork_Rarely,Stress About Schoolwork_Sometimes,Confidence in Academic Ability_Not confident,Confidence in Academic Ability_Somewhat confident,Confidence in Academic Ability_Very confident,Access to Counseling_I'm not sure,Access to Counseling_No,Access to Counseling_Yes,Motivation for Attending School_Others,Motivation for Attending School_To avoid being bored at home,Motivation for Attending School_To learn and gain knowledge,Motivation for Attending School_To meet family expectations,Motivation for Attending School_To prepare for a good career,Plan to Attend University_No,Plan to Attend University_Not Sure,Plan to Attend University_Yes,JAMB Goal Score_200 To 250,JAMB Goal Score_250 To 300,JAMB Goal Score_300 And Above,JAMB Goal Score_Below 200,Foundational Knowledge_Average,Foundational Knowledge_Strong,Foundational Knowledge_Weak,Last Exam Performance_40% - 59%,Last Exam Performance_60% - 69%,Last Exam Performance_70% - 100%,Last Exam Performance_Below 40%
40,12 - 14,Both parents,Post-graduate education,We are financially comfortable,SS1,Science,Excellent,Excellent,More than 10 hours,Regularly,Every day,Occasionally,Never,Occasionally,Comfortable,Sometimes,Very confident,Yes,To prepare for a good career,Yes,300 And Above,Strong,70% - 100%,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
45,15 - 16,Both parents,Higher Education,We meet our needs but can't afford luxuries,SS2,Humanities,Excellent,Good,5 - 10 hours,Regularly,Every day,Regularly,Never,Frequently,Comfortable,Always,Very confident,Yes,To learn and gain knowledge,Yes,250 To 300,Strong,70% - 100%,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
72,15 - 16,Both parents,Higher Education,We meet our needs but can't afford luxuries,SS3,Commercial,Excellent,Poor,Less than 5 hours,Regularly,Every day,Regularly,Never,Never,Comfortable,Sometimes,Not confident,I'm not sure,To learn and gain knowledge,Yes,250 To 300,Strong,70% - 100%,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [91]:
val_targets

26    Likely To Succeed
53    Likely to Succeed
70       Likely To Fail
15    Likely to Succeed
45    Likely to Succeed
88    Likely to Succeed
40    Likely to Succeed
12    Likely To Succeed
72    Likely to Succeed
55    Likely to Succeed
80    Likely to Succeed
18    Likely to Succeed
90                 None
31    Likely to Succeed
22    Likely To Succeed
Name: Next Exam Prediction, dtype: object

## Training a Logistic Regression Model

In [96]:
from sklearn.linear_model import LogisticRegression

In [98]:
model = LogisticRegression(solver='liblinear')

In [100]:
model = LogisticRegression(solver='liblinear')

In [102]:
print(numeric_cols + encoded_cols)

['Age Group_12 - 14', 'Age Group_15 - 16', 'Age Group_17 - 18', 'Living Situation_Both parents', 'Living Situation_Guardian', 'Living Situation_Single parent', 'Parent Education Level_Higher Education', 'Parent Education Level_No formal education', 'Parent Education Level_Post-graduate education', 'Parent Education Level_Primary school', 'Parent Education Level_Secondary school', 'Family Financial Status_We are financially comfortable', 'Family Financial Status_We can afford some luxuries', "Family Financial Status_We meet our needs but can't afford luxuries", 'Family Financial Status_We struggle to meet basic needs', 'Class_SS1', 'Class_SS2', 'Class_SS3', 'Department_Commercial', 'Department_Humanities', 'Department_Science', 'Performance in English_Excellent', 'Performance in English_Fair', 'Performance in English_Good', 'Performance in Maths_Excellent', 'Performance in Maths_Fair', 'Performance in Maths_Good', 'Performance in Maths_Poor', 'Study Hours per Week_5 - 10 hours', 'Study 

## Making Predictions and Evaluating the Model

In [119]:
# We can now use the trained model to make predictions on the training, test

X_train = train_inputs[numeric_cols + encoded_cols]
X_val = val_inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]

In [121]:
train_preds = model.predict(X_train)

NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.