In [62]:
import pandas as pd
import numpy as np
from numpy import inf
from sklearn import preprocessing, linear_model
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# read the training and scoring CSVs (paths are relative to the notebook's working directory)
df_train, df_scoring = map(
    pd.read_csv,
    ('br_takehome_exam_2022_training.csv', 'br_takehome_exam_2022_scoring.csv'),
)

# show the first five training rows as a quick sanity check
df_train.head(5)


Unnamed: 0,job_aptitude_exam,same_industry,unexcused_absences,hs_gpa,job_offered,good_behavior,high_school,enrolled_late,instructor,sensitive_01,...,sensitive_14,sensitive_15,sensitive_17,sensitive_18,sensitive_19,sensitive_20,sensitive_22,sensitive_23,sensitive_24,sensitive_25
0,96.0,0,1,2.255,1,0.0,0,0,inst_9,0.25,...,-0.6843,1,0,1,0.82,1,0,0.33,1,0.44
1,88.0,0,2,2.772,1,1.0,1,0,inst_5,0.36,...,-3.93832,1,0,0,0.91,0,0,-0.35,0,0.83
2,90.0,0,0,3.74,1,0.0,2,0,inst_6,0.77,...,-1.3366,1,1,1,-0.88,1,0,-1.18,1,-0.82
3,122.0,0,2,3.206,1,0.0,3,0,inst_5,0.39,...,-0.32674,0,1,1,-0.98,1,0,-0.74,0,-0.63
4,82.0,0,1,2.837,1,1.0,4,0,inst_2,0.85,...,0.90079,1,0,0,-0.41,0,1,-0.56,1,1.63


In [63]:
# count missing values per column in the raw training data
df_train.isna().sum()

job_aptitude_exam       0
same_industry           0
unexcused_absences      0
hs_gpa                 20
job_offered             0
good_behavior         100
high_school             0
enrolled_late           0
instructor             50
sensitive_01            0
sensitive_02            0
sensitive_03            0
sensitive_04            0
sensitive_05            0
sensitive_06            0
sensitive_07            0
sensitive_08            0
sensitive_09            0
sensitive_10          595
sensitive_11            0
sensitive_12            0
sensitive_13            0
sensitive_14            0
sensitive_15            0
sensitive_17            0
sensitive_18            0
sensitive_19            0
sensitive_20            0
sensitive_22            0
sensitive_23            0
sensitive_24            0
sensitive_25            0
dtype: int64

In [75]:
# clean dataset
def clean_dataset(df):
    """Return a cleaned copy of ``df``: missing values filled with 0, infinite rows dropped.

    Every missing value is replaced with 0, then any row that contains
    ``inf`` or ``-inf`` in any column is removed.  The input frame is left
    untouched so the raw data stays available for before/after comparisons.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean.

    Returns
    -------
    pd.DataFrame
        A new, independent frame holding the surviving rows under their
        original index labels.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # fillna without inplace=True builds a new frame instead of rewriting the caller's data
    filled = df.fillna(0)
    # NaN should already be gone after the fill; it stays in the list purely as a guard.
    # axis is passed by keyword because pandas 2.0 no longer accepts it positionally.
    has_bad_value = filled.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    # explicit copy so later column assignments on the result do not write into a slice
    return filled.loc[~has_bad_value].copy()

# clean the training data
df_train_clean = clean_dataset(df_train)

# hs_gpa showed 20 missing values in the raw training data; none should remain after cleaning

# confirm that no missing values remain in any column
df_train_clean.isna().sum()


job_aptitude_exam     0
same_industry         0
unexcused_absences    0
hs_gpa                0
job_offered           0
good_behavior         0
high_school           0
enrolled_late         0
instructor            0
sensitive_01          0
sensitive_02          0
sensitive_03          0
sensitive_04          0
sensitive_05          0
sensitive_06          0
sensitive_07          0
sensitive_08          0
sensitive_09          0
sensitive_10          0
sensitive_11          0
sensitive_12          0
sensitive_13          0
sensitive_14          0
sensitive_15          0
sensitive_17          0
sensitive_18          0
sensitive_19          0
sensitive_20          0
sensitive_22          0
sensitive_23          0
sensitive_24          0
sensitive_25          0
dtype: int64

In [65]:
# count missing values per column in the raw scoring data
df_scoring.isna().sum()

job_aptitude_exam        0
same_industry            0
unexcused_absences       0
hs_gpa                  89
good_behavior          479
high_school              0
enrolled_late            0
instructor             238
sensitive_01             0
sensitive_02             0
sensitive_03             0
sensitive_04             0
sensitive_05             0
sensitive_06             0
sensitive_07             0
sensitive_08             0
sensitive_09             0
sensitive_10          2883
sensitive_11             0
sensitive_12             0
sensitive_13             0
sensitive_14             0
sensitive_15             0
sensitive_17             0
sensitive_18             0
sensitive_19             0
sensitive_20             0
sensitive_22             0
sensitive_23             0
sensitive_24             0
sensitive_25             0
dtype: int64

In [76]:
# apply the same cleaning step to the scoring data
df_scoring_clean = clean_dataset(df_scoring)

# confirm that no missing values remain in any column
df_scoring_clean.isna().sum()

job_aptitude_exam     0
same_industry         0
unexcused_absences    0
hs_gpa                0
good_behavior         0
high_school           0
enrolled_late         0
instructor            0
sensitive_01          0
sensitive_02          0
sensitive_03          0
sensitive_04          0
sensitive_05          0
sensitive_06          0
sensitive_07          0
sensitive_08          0
sensitive_09          0
sensitive_10          0
sensitive_11          0
sensitive_12          0
sensitive_13          0
sensitive_14          0
sensitive_15          0
sensitive_17          0
sensitive_18          0
sensitive_19          0
sensitive_20          0
sensitive_22          0
sensitive_23          0
sensitive_24          0
sensitive_25          0
dtype: int64

In [77]:
# define feature columns
feature_cols = ['hs_gpa', 'high_school', 'unexcused_absences']

# The cleaned frames come from a boolean filter of the raw data; take explicit
# copies so the column assignment below writes into frames we own and does not
# raise SettingWithCopyWarning.
df_train_clean = df_train_clean.copy()
df_scoring_clean = df_scoring_clean.copy()

# scale features: fit on the training data only, then reuse the same parameters for scoring
# NOTE(review): the scaler is fitted on every training row, including the rows
# that the next cell holds out as the test split, so the reported test accuracy
# is slightly optimistic. Fitting inside a Pipeline after the split would avoid this.
scaler = preprocessing.StandardScaler()
df_train_clean[feature_cols] = scaler.fit_transform(df_train_clean[feature_cols])
df_scoring_clean[feature_cols] = scaler.transform(df_scoring_clean[feature_cols])


In [78]:
# define X and y
X = df_train_clean[feature_cols]
y = df_train_clean['job_offered']

# one seed shared by every stochastic step so Restart & Run All reproduces the same numbers
RANDOM_STATE = 42

# split data into train and test sets (30% of the rows are held out for evaluation)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# train logistic regression model
logreg = linear_model.LogisticRegression()
logreg.fit(X_train, y_train)

# train decision tree model
# The tree permutes the features at every split, so without a fixed
# random_state equally good splits can resolve differently between runs and
# the reported accuracy drifts.
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt.fit(X_train, y_train)


DecisionTreeClassifier()

In [79]:
# score both fitted models on the held-out split (logistic regression first, then the tree)
logreg_acc, dt_acc = (estimator.score(X_test, y_test) for estimator in (logreg, dt))

# report the two accuracies side by side
print("Logistic regression accuracy:", logreg_acc)
print("Decision tree accuracy:", dt_acc)


Logistic regression accuracy: 0.92
Decision tree accuracy: 0.8033333333333333
