<h2><strong>Import Libraries</strong></h2>

In [230]:
import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import VarianceThreshold, RFE
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [223]:
# Change Pandas Display Options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)  

<h2><strong>Import and Verify CSV</strong></h2>

In [180]:
df = pd.read_csv('../dataset.csv')
print(df.head())

   Marital status  Application mode  Application order  Course  Daytime/evening attendance  Previous qualification  Nacionality  Mother's qualification  Father's qualification  Mother's occupation  Father's occupation  Displaced  Educational special needs  Debtor  Tuition fees up to date  Gender  Scholarship holder  Age at enrollment  International  Curricular units 1st sem (credited)  Curricular units 1st sem (enrolled)  Curricular units 1st sem (evaluations)  Curricular units 1st sem (approved)  Curricular units 1st sem (grade)  Curricular units 1st sem (without evaluations)  Curricular units 2nd sem (credited)  Curricular units 2nd sem (enrolled)  Curricular units 2nd sem (evaluations)  Curricular units 2nd sem (approved)  Curricular units 2nd sem (grade)  Curricular units 2nd sem (without evaluations)  Unemployment rate  Inflation rate   GDP    Target
0               1                 8                  5       2                           1                       1            1     

<h2><strong>Data Prep for Model Building</strong></h2>

<h5>Map Target to be 0, 1, 2 Depending on its Category</h5>

In [181]:
target_map = {'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}
df['Target'] = df['Target'].map(target_map)

<h5>Set X and y</h5>

In [182]:
X = df.drop(columns=['Target']).copy()
y = df['Target'].copy()

print(X.head())


   Marital status  Application mode  Application order  Course  Daytime/evening attendance  Previous qualification  Nacionality  Mother's qualification  Father's qualification  Mother's occupation  Father's occupation  Displaced  Educational special needs  Debtor  Tuition fees up to date  Gender  Scholarship holder  Age at enrollment  International  Curricular units 1st sem (credited)  Curricular units 1st sem (enrolled)  Curricular units 1st sem (evaluations)  Curricular units 1st sem (approved)  Curricular units 1st sem (grade)  Curricular units 1st sem (without evaluations)  Curricular units 2nd sem (credited)  Curricular units 2nd sem (enrolled)  Curricular units 2nd sem (evaluations)  Curricular units 2nd sem (approved)  Curricular units 2nd sem (grade)  Curricular units 2nd sem (without evaluations)  Unemployment rate  Inflation rate   GDP
0               1                 8                  5       2                           1                       1            1               

<h5>Map Numerical Values to Their Specified Categories</h5>

In [183]:
yes_no_map = {0: 'No', 1: 'Yes'}
martial_status_map = {1: 'Single', 2: 'Married', 3: 'Widower', 4: 'Divorced', 5: 'Common-law Marriage', 6: 'Legally Separated'}
application_mode_map = {
    1: "1st phase — general contingent",
    2: "Ordinance No. 612/93",
    3: "1st phase — special contingent (Azores Island)",
    4: "Holders of other higher courses",
    5: "Ordinance No. 854-B/99",
    6: "International student (bachelor)",
    7: "1st phase — special contingent (Madeira Island)",
    8: "2nd phase — general contingent",
    9: "3rd phase — general contingent",
    10: "Ordinance No. 533-A/99, item b2) (Different Plan)",
    11: "Ordinance No. 533-A/99, item b3 (Other Institution)",
    12: "Over 23 years old",
    13: "Transfer",
    14: "Change in course",
    15: "Technological specialization diploma holders",
    16: "Change in institution/course",
    17: "Short cycle diploma holders",
    18: "Change in institution/course (International)"
}
course_map = {
    1: "Biofuel Production Technologies",
    2: "Animation and Multimedia Design",
    3: "Social Service (evening attendance)",
    4: "Agronomy",
    5: "Communication Design",
    6: "Veterinary Nursing",
    7: "Informatics Engineering",
    8: "Equiniculture",
    9: "Management",
    10: "Social Service",
    11: "Tourism",
    12: "Nursing",
    13: "Oral Hygiene",
    14: "Advertising and Marketing Management",
    15: "Journalism and Communication",
    16: "Basic Education",
    17: "Management (evening attendance)"
}
education_map = {
    1: "Secondary education",
    2: "Higher education—bachelor’s degree",
    3: "Higher education—degree",
    4: "Higher education—master’s degree",
    5: "Higher education—doctorate",
    6: "Frequency of higher education",
    7: "12th year of schooling—not completed",
    8: "11th year of schooling—not completed",
    9: "Other—11th year of schooling",
    10: "10th year of schooling",
    11: "10th year of schooling—not completed",
    12: "Basic education 3rd cycle (9th/10th/11th year) or equivalent",
    13: "Basic education 2nd cycle (6th/7th/8th year) or equivalent",
    14: "Technological specialization course",
    15: "Higher education—degree (1st cycle)",
    16: "Professional higher technical course",
    17: "Higher education—master’s degree (2nd cycle)"
}
nationality_map = {
    1: "Portuguese",
    2: "German",
    3: "Spanish",
    4: "Italian",
    5: "Dutch",
    6: "English",
    7: "Lithuanian",
    8: "Angolan",
    9: "Cape Verdean",
    10: "Guinean",
    11: "Mozambican",
    12: "Santomean",
    13: "Turkish",
    14: "Brazilian",
    15: "Romanian",
    16: "Moldova (Republic of)",
    17: "Mexican",
    18: "Ukrainian",
    19: "Russian",
    20: "Cuban",
    21: "Colombian"
}
qual_map = {
    1: "Secondary Education—12th Year of Schooling or Equivalent",
    2: "Higher Education—bachelor’s degree",
    3: "Higher Education—degree",
    4: "Higher Education—master’s degree",
    5: "Higher Education—doctorate",
    6: "Frequency of Higher Education",
    7: "12th Year of Schooling—not completed",
    8: "11th Year of Schooling—not completed",
    9: "7th Year (Old)",
    10: "Other—11th Year of Schooling",
    11: "2nd year complementary high school course",
    12: "10th Year of Schooling",
    13: "General commerce course",
    14: "Basic Education 3rd Cycle (9th/10th/11th Year) or Equivalent",
    15: "Complementary High School Course",
    16: "Technical-professional course",
    17: "Complementary High School Course—not concluded",
    18: "7th year of schooling",
    19: "2nd cycle of the general high school course",
    20: "9th Year of Schooling—not completed",
    21: "8th year of schooling",
    22: "General Course of Administration and Commerce",
    23: "Supplementary Accounting and Administration",
    24: "Unknown",
    25: "Cannot read or write",
    26: "Can read without having a 4th year of schooling",
    27: "Basic education 1st cycle (4th/5th year) or equivalent",
    28: "Basic Education 2nd Cycle (6th/7th/8th Year) or equivalent",
    29: "Technological specialization course",
    30: "Higher education—degree (1st cycle)",
    31: "Specialized higher studies course",
    32: "Professional higher technical course",
    33: "Higher Education—master’s degree (2nd cycle)",
    34: "Higher Education—doctorate (3rd cycle)"
}
ocupation_map = {
    1: "Student",
    2: "Representatives of the Legislative Power and Executive Bodies, Directors, Directors and Executive Managers",
    3: "Specialists in Intellectual and Scientific Activities",
    4: "Intermediate Level Technicians and Professions",
    5: "Administrative staff",
    6: "Personal Services, Security and Safety Workers, and Sellers",
    7: "Farmers and Skilled Workers in Agriculture, Fisheries, and Forestry",
    8: "Skilled Workers in Industry, Construction, and Craftsmen",
    9: "Installation and Machine Operators and Assembly Workers",
    10: "Unskilled Workers",
    11: "Armed Forces Professions",
    12: "Other Situation; 13—(blank)",
    13: "Armed Forces Officers",
    14: "Armed Forces Sergeants",
    15: "Other Armed Forces personnel",
    16: "Directors of administrative and commercial services",
    17: "Hotel, catering, trade, and other services directors",
    18: "Specialists in the physical sciences, mathematics, engineering, and related techniques",
    19: "Health professionals",
    20: "Teachers",
    21: "Specialists in finance, accounting, administrative organization, and public and commercial relations",
    22: "Intermediate level science and engineering technicians and professions",
    23: "Technicians and professionals of intermediate level of health",
    24: "Intermediate level technicians from legal, social, sports, cultural, and similar services",
    25: "Information and communication technology technicians",
    26: "Office workers, secretaries in general, and data processing operators",
    27: "Data, accounting, statistical, financial services, and registry-related operators",
    28: "Other administrative support staff",
    29: "Personal service workers",
    30: "Sellers",
    31: "Personal care workers and the like",
    32: "Protection and security services personnel",
    33: "Market-oriented farmers and skilled agricultural and animal production workers",
    34: "Farmers, livestock keepers, fishermen, hunters and gatherers, and subsistence",
    35: "Skilled construction workers and the like, except electricians",
    36: "Skilled workers in metallurgy, metalworking, and similar",
    37: "Skilled workers in electricity and electronics",
    38: "Workers in food processing, woodworking, and clothing and other industries and crafts",
    39: "Fixed plant and machine operators",
    40: "Assembly workers",
    41: "Vehicle drivers and mobile equipment operators",
    42: "Unskilled workers in agriculture, animal production, and fisheries and forestry",
    43: "Unskilled workers in extractive industry, construction, manufacturing, and transport",
    44: "Meal preparation assistants",
    45: "Street vendors (except food) and street service provider"
}
gender_map = {0: 'Female', 1: 'Male'}
attendance = {0: 'Evening', 1: 'Daytime'}


In [184]:
X['Marital status'] = X['Marital status'].map(martial_status_map)
X['Application mode'] = X['Application mode'].map(application_mode_map)
X['Course'] = X['Course'].map(course_map)
X['Previous qualification'] = X['Previous qualification'].map(qual_map)
X['Nacionality'] = X['Nacionality'].map(nationality_map)
X['Mother\'s qualification'] = X['Mother\'s qualification'].map(qual_map)
X['Father\'s qualification'] = X['Father\'s qualification'].map(qual_map)
X['Mother\'s occupation'] = X['Mother\'s occupation'].map(ocupation_map)
X['Father\'s occupation'] = X['Father\'s occupation'].map(ocupation_map)
X['Displaced'] = X['Displaced'].map(yes_no_map)
X['Debtor'] = X['Debtor'].map(yes_no_map)
X['Gender'] = X['Gender'].map(gender_map)
X['Scholarship holder'] = X['Scholarship holder'].map(yes_no_map)
X['Daytime/evening attendance'] = X['Daytime/evening attendance'].map(attendance)
X['Educational special needs'] = X['Educational special needs'].map(yes_no_map)
X['Tuition fees up to date'] = X['Tuition fees up to date'].map(yes_no_map)
X['International'] = X['International'].map(yes_no_map)

print(X.head())

  Marital status                  Application mode  Application order                               Course Daytime/evening attendance                             Previous qualification Nacionality                             Mother's qualification                             Father's qualification                                Mother's occupation                             Father's occupation Displaced Educational special needs Debtor Tuition fees up to date  Gender Scholarship holder  Age at enrollment International  Curricular units 1st sem (credited)  Curricular units 1st sem (enrolled)  Curricular units 1st sem (evaluations)  Curricular units 1st sem (approved)  Curricular units 1st sem (grade)  Curricular units 1st sem (without evaluations)  Curricular units 2nd sem (credited)  Curricular units 2nd sem (enrolled)  Curricular units 2nd sem (evaluations)  Curricular units 2nd sem (approved)  Curricular units 2nd sem (grade)  Curricular units 2nd sem (without evaluations)  \
0     

<h5>One Hot Encode Categorical Values</h5>

In [185]:
categorical_cols = [
    'Marital status', 'Application mode', 'Course', 'Daytime/evening attendance', 'Previous qualification', 'Nacionality', 'Mother\'s qualification', 'Father\'s qualification', 
    'Mother\'s occupation', 'Father\'s occupation', 'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'International']
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X[categorical_cols])

encoded_cols = encoder.get_feature_names_out(categorical_cols)
X_encoded_df = pd.DataFrame(X_encoded, columns=encoded_cols, index=X.index)

numeric_cols = X.drop(columns=categorical_cols).columns
X_numeric = X[numeric_cols]

X = pd.concat([X_numeric, X_encoded_df], axis=1)
print(X.head())

   Application order  Age at enrollment  Curricular units 1st sem (credited)  Curricular units 1st sem (enrolled)  Curricular units 1st sem (evaluations)  Curricular units 1st sem (approved)  Curricular units 1st sem (grade)  Curricular units 1st sem (without evaluations)  Curricular units 2nd sem (credited)  Curricular units 2nd sem (enrolled)  Curricular units 2nd sem (evaluations)  Curricular units 2nd sem (approved)  Curricular units 2nd sem (grade)  Curricular units 2nd sem (without evaluations)  Unemployment rate  Inflation rate   GDP  Marital status_Common-law Marriage  Marital status_Divorced  Marital status_Legally Separated  Marital status_Married  Marital status_Single  Marital status_Widower  Application mode_1st phase — general contingent  Application mode_1st phase — special contingent (Azores Island)  Application mode_1st phase — special contingent (Madeira Island)  Application mode_2nd phase — general contingent  Application mode_3rd phase — general contingent  \
0     

<h5>Remove Features With Little Variance</h5>

In [186]:
selector = VarianceThreshold(threshold=0.1)
X = pd.DataFrame(selector.fit_transform(X), columns=X.columns[selector.get_support()], index=X.index)

correlation_threshold = 0.1
correlations = X.apply(lambda col: col.corr(y)).abs()
features_to_keep = correlations[correlations >= correlation_threshold].index.tolist()

X = X[features_to_keep]

print(X.head())

   Age at enrollment  Curricular units 1st sem (enrolled)  Curricular units 1st sem (approved)  Curricular units 1st sem (grade)  Curricular units 2nd sem (enrolled)  Curricular units 2nd sem (approved)  Curricular units 2nd sem (grade)  Marital status_Single  Application mode_1st phase — general contingent  Application mode_Over 23 years old  Course_Nursing  Previous qualification_Secondary Education—12th Year of Schooling or Equivalent  Displaced_No  Displaced_Yes  Debtor_No  Debtor_Yes  Tuition fees up to date_No  Tuition fees up to date_Yes  Gender_Female  Gender_Male  Scholarship holder_No  Scholarship holder_Yes
0               20.0                                  0.0                                  0.0                          0.000000                                  0.0                                  0.0                          0.000000                    1.0                                              0.0                                 0.0             0.0              

<h5>Train Test Split</h5>

In [187]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<h2><strong>Feature Selection</strong></h2>

<h5>Try Sequential Forward Floating Selection to Find Optimal Features</h5>

In [None]:
# Takes a long time to run (15~20> mins)
accs = []
features = []
for i in range(X.shape[1], X.shape[1] // 2, -1):
    estimator = LogisticRegression(max_iter=1000)
    sbs = SequentialFeatureSelector(estimator, k_features=i, forward=False, floating=True, scoring='accuracy', cv=5)
    sbs.fit(X_train, y_train)
    accs.append(float(sbs.subsets_[i]['avg_score']))
    features.append(i)

In [203]:
print(accs.index(max(accs)))
print(features[10])
print(accs)

10
12
[0.7688610265384892, 0.7708388272241268, 0.7731003124525528, 0.7736640855368829, 0.7739469709682834, 0.7739469709682834, 0.7739469709682834, 0.7742310550667657, 0.7750793118052726, 0.7747968259295663, 0.7753617976809788]


In [201]:
estimator = LogisticRegression(max_iter=1000)
sbs = SequentialFeatureSelector(estimator, k_features=12, forward=False, floating=True, scoring='accuracy', cv=5)
sbs.fit(X_train, y_train)
print(sbs.subsets_[i]['avg_score'])
print(sbs.subsets_[i]['feature_names'])

0.7753617976809788
('Curricular units 1st sem (approved)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Application mode_1st phase — general contingent', 'Application mode_Over 23 years old', 'Displaced_Yes', 'Debtor_No', 'Tuition fees up to date_Yes', 'Gender_Female', 'Scholarship holder_No', 'Scholarship holder_Yes')


<h5>Try Sequential Backward Floating Selection to Find Optimal Features</h5>

In [None]:
# Takes a long time to run (15~20> mins)
accs_sfs = []
features_sfs = []
for i in range(2, (X.shape[1] + 1) // 2):
    estimator = LogisticRegression(max_iter=1000)
    sbs = SequentialFeatureSelector(estimator, k_features=i, forward=True, floating=True, scoring='accuracy', cv=5)
    sbs.fit(X_train, y_train)
    accs_sfs.append(float(sbs.subsets_[i]['avg_score']))
    features_sfs.append(i)

In [216]:
print(accs_sfs.index(max(accs_sfs)))
print(max(accs_sfs))
print(features_sfs[9])
print(accs_sfs)

11
0.7750793118052726
11
[0.7253446167861337, 0.7482403567233236, 0.7558670758116974, 0.7575631897330168, 0.763212108135753, 0.7654719951414027, 0.7702758532511848, 0.7699929678197844, 0.7685801388855593, 0.7731023102310232, 0.7731023102310232, 0.7750793118052726, 0.7747968259295663, 0.7747968259295663, 0.7747968259295663, 0.7739485691910596, 0.7739469709682834, 0.7736640855368829, 0.7731003124525528, 0.7708388272241268, 0.7688610265384892]


<h5>Try Recursive Feature Elimination to Find Optimal Features</h5>

In [210]:
accs_rfe = []
features_rfe = []
for i in range(2, X.shape[1] + 1):
    estimator = LogisticRegression(max_iter=1000)
    rfe = RFE(estimator=estimator, n_features_to_select=i)
    rfe.fit(X_train, y_train)
    accs_rfe.append(float(rfe.score(X_train, y_train)))
    features_rfe.append(i)

In [213]:
print(max(accs_rfe))
print(accs_rfe.index(max(accs_rfe)))
print(features_rfe[16])

0.7750777055665442
16
18


<h2><strong>Model Building</strong></h2>

<h5>Use Recursive Feature Elimination to Find Optimal Features (Recursive Feature Elimination Provided the Best Results)</h5>

In [None]:
estimator = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=estimator, n_features_to_select=18)
rfe.fit(X_train, y_train)
print(rfe.score(X_train, y_train))
cols = [f for (f,support) in zip(X.columns, rfe.support_) if support]

0.7750777055665442
['Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (approved)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Marital status_Single', 'Application mode_1st phase — general contingent', 'Application mode_Over 23 years old', 'Course_Nursing', 'Previous qualification_Secondary Education—12th Year of Schooling or Equivalent', 'Displaced_Yes', 'Debtor_No', 'Debtor_Yes', 'Tuition fees up to date_No', 'Tuition fees up to date_Yes', 'Gender_Male', 'Scholarship holder_No', 'Scholarship holder_Yes']


<h5>Build Logistic Regression Model</h5>

In [225]:
X = X[cols]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lrModel = LogisticRegression(max_iter=1000)
lrModel.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [228]:
acc = accuracy_score(y_test, lrModel.predict(X_test)) * 100
print(f'Logistic Regression Model Accuracy: {acc:.2f}%')

Logistic Regression Model Accuracy: 75.25%


<h5>Export Model</h5>

In [232]:
joblib.dump(lrModel, 'lr_model.joblib')

['lr_model.joblib']