# Predictive Model

In [243]:
# Import the relevant libraries
import pandas as pd
import numpy as np

In [244]:
# Read preprocessed data
data_preprocessed = pd.read_csv('data.csv')

### Removing unnecessary columns and converting strings into integers

In [245]:
data_preprocessed = data_preprocessed.drop (['Unnamed: 0'], axis = 1)

In [246]:
data_preprocessed=data_preprocessed.replace(to_replace=['A', 'F','M','A1', 'F1','M1','A2', 'F2','M2'], value=[1,2,3,11,21,31,12,22,32])

In [247]:
data_preprocessed.head()

Unnamed: 0,Grade,Age,Date,Month,Weekday,Subject,Teacher,Absent,Reason 1,Reason 2,Reason 3,Reason 4
0,4,9,2022-01-03,1,0,2,21,1,1,0,0,0
1,4,9,2022-01-03,1,0,1,11,1,0,1,0,0
2,5,10,2022-01-04,1,1,3,31,0,0,0,0,0
3,4,8,2022-01-04,1,1,3,31,0,0,0,0,0
4,5,10,2022-01-05,1,2,3,31,0,0,0,0,0


In [248]:
targets = data_preprocessed['Absent']

In [249]:
targets.sum()/ targets.shape[0]

0.30612244897959184

In [250]:
data_preprocessed = data_preprocessed.drop (['Absent'], axis = 1)
data_preprocessed = data_preprocessed.drop (['Reason 1'], axis = 1)
data_preprocessed = data_preprocessed.drop (['Reason 2'], axis = 1)
data_preprocessed = data_preprocessed.drop (['Reason 3'], axis = 1)
data_preprocessed = data_preprocessed.drop (['Reason 4'], axis = 1)

In [251]:
data_with_targets is data_preprocessed


False

In [252]:
data_with_targets.head()

Unnamed: 0,Grade,Age,Month,Weekday,Subject,Teacher,Reason 1,Reason 2,Reason 3,Reason 4
0,4,9,1,0,2,21,1,0,0,0
1,4,9,1,0,1,11,0,1,0,0
2,5,10,1,1,3,31,0,0,0,0
3,4,8,1,1,3,31,0,0,0,0
4,5,10,1,2,3,31,0,0,0,0


In [253]:
#Selecting input for regression
data_with_targets.shape

(49, 10)

In [254]:
data_with_targets.iloc[:,0:12]

Unnamed: 0,Grade,Age,Month,Weekday,Subject,Teacher,Reason 1,Reason 2,Reason 3,Reason 4
0,4,9,1,0,2,21,1,0,0,0
1,4,9,1,0,1,11,0,1,0,0
2,5,10,1,1,3,31,0,0,0,0
3,4,8,1,1,3,31,0,0,0,0
4,5,10,1,2,3,31,0,0,0,0
5,6,10,1,3,1,11,0,0,0,0
6,6,11,1,4,1,11,1,0,0,0
7,6,11,1,2,2,21,0,0,1,0
8,4,8,1,2,2,21,1,0,0,0
9,6,11,1,2,1,11,0,0,0,0


In [255]:
unscaled_inputs = data_with_targets.iloc[:,0:12]

### Scaling inputs

In [256]:
from sklearn.preprocessing import StandardScaler
absenteeism_scaler = StandardScaler()

In [257]:
unscaled_inputs.columns.values


array(['Grade', 'Age', 'Month', 'Weekday', 'Subject', 'Teacher',
       'Reason 1', 'Reason 2', 'Reason 3', 'Reason 4'], dtype=object)

In [258]:
columns_to_omit = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4']

In [259]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]


In [260]:
absenteeism_scaler = StandardScaler(columns_to_scale)




In [261]:
absenteeism_scaler.fit(unscaled_inputs)

StandardScaler(copy=['Grade', 'Age', 'Month', 'Weekday', 'Subject', 'Teacher'])

In [262]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [263]:
scaled_inputs


array([[-1.2004901 , -0.71628495, -0.8304548 , -1.27670784,  0.02653724,
        -0.02372285,  2.96647939, -0.25537696, -0.33709993, -0.20628425],
       [-1.2004901 , -0.71628495, -0.8304548 , -1.27670784, -1.27378774,
        -1.31530019, -0.33709993,  3.91578004, -0.33709993, -0.20628425],
       [ 0.        ,  0.25865846, -0.8304548 , -0.69204724,  1.32686223,
         1.26785449, -0.33709993, -0.25537696, -0.33709993, -0.20628425],
       [-1.2004901 , -1.69122836, -0.8304548 , -0.69204724,  1.32686223,
         1.26785449, -0.33709993, -0.25537696, -0.33709993, -0.20628425],
       [ 0.        ,  0.25865846, -0.8304548 , -0.10738664,  1.32686223,
         1.26785449, -0.33709993, -0.25537696, -0.33709993, -0.20628425],
       [ 1.2004901 ,  0.25865846, -0.8304548 ,  0.47727396, -1.27378774,
        -1.31530019, -0.33709993, -0.25537696, -0.33709993, -0.20628425],
       [ 1.2004901 ,  1.23360186, -0.8304548 ,  1.06193456, -1.27378774,
        -1.31530019,  2.96647939, -0.25537696

In [264]:
scaled_inputs.shape


(49, 10)

In [265]:
# Split the data into train and test
from sklearn.model_selection import train_test_split

In [266]:
train_test_split(scaled_inputs, targets)


[array([[ 0.        ,  0.25865846, -0.8304548 , -1.27670784,  0.02653724,
         -0.02372285, -0.33709993, -0.25537696, -0.33709993, -0.20628425],
        [-1.2004901 , -1.69122836, -0.8304548 , -0.10738664,  0.02653724,
         -0.02372285,  2.96647939, -0.25537696, -0.33709993, -0.20628425],
        [-1.2004901 , -0.71628495, -0.8304548 , -1.27670784,  0.02653724,
         -0.02372285, -0.33709993, -0.25537696, -0.33709993, -0.20628425],
        [ 1.2004901 ,  1.23360186,  1.20415946, -0.10738664,  0.02653724,
          0.10543488, -0.33709993, -0.25537696, -0.33709993, -0.20628425],
        [-1.2004901 , -0.71628495, -0.8304548 , -1.27670784, -1.27378774,
         -1.31530019, -0.33709993, -0.25537696, -0.33709993, -0.20628425],
        [ 0.        ,  0.25865846, -0.8304548 , -0.10738664,  1.32686223,
          1.26785449, -0.33709993, -0.25537696, -0.33709993, -0.20628425],
        [ 0.        , -0.71628495, -0.8304548 ,  1.06193456,  0.02653724,
         -0.02372285, -0.3370999

In [267]:
X_train, X_test, y_train, y_test=train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)


In [268]:
print (X_train.shape, y_train.shape)


(39, 10) (39,)


In [269]:
print (X_test.shape, y_test.shape)

(10, 10) (10,)


In [270]:
#Logistic regression with sklearn 
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [271]:
reg = LogisticRegression()

In [272]:
reg.fit(X_train, y_train)

LogisticRegression()

In [273]:
reg.score(X_train, y_train)

1.0

In [274]:
#Manual Check
model_outputs = reg.predict(X_train)
model_outputs

array([0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0], dtype=int64)

In [275]:
model_outputs == y_train

17    True
0     True
33    True
18    True
46    True
41    True
36    True
10    True
3     True
40    True
13    True
44    True
2     True
25    True
38    True
5     True
42    True
45    True
30    True
6     True
29    True
16    True
19    True
37    True
43    True
21    True
32    True
34    True
7     True
22    True
11    True
20    True
9     True
47    True
28    True
31    True
15    True
26    True
35    True
Name: Absent, dtype: bool

In [276]:
np.sum(model_outputs == y_train)


39

In [277]:
model_outputs.shape[0]


39

In [278]:
np.sum(model_outputs == y_train) / model_outputs.shape[0]


1.0

In [279]:
#Finding intercept and coefficients
reg.intercept_

array([-1.43515923])

In [280]:
reg.coef_

array([[-0.18597342,  0.00631942, -0.30831336, -0.18489297,  0.09435894,
         0.07405217,  1.48772641,  0.97679802,  1.45810243,  0.96348725]])

In [281]:
feature_name = unscaled_inputs.columns.values

In [282]:
summary_table = pd.DataFrame (columns=['Feature name'], data = feature_name)

summary_table['Coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature name,Coefficient
0,Grade,-0.185973
1,Age,0.006319
2,Month,-0.308313
3,Weekday,-0.184893
4,Subject,0.094359
5,Teacher,0.074052
6,Reason 1,1.487726
7,Reason 2,0.976798
8,Reason 3,1.458102
9,Reason 4,0.963487


In [283]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summay_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
1,Grade,-0.185973
2,Age,0.006319
3,Month,-0.308313
4,Weekday,-0.184893
5,Subject,0.094359
6,Teacher,0.074052
7,Reason 1,1.487726
8,Reason 2,0.976798
9,Reason 3,1.458102
10,Reason 4,0.963487


In [284]:
summary_table['Odd ratio'] = np.exp(summary_table.Coefficient)

In [285]:
summary_table 


Unnamed: 0,Feature name,Coefficient,Odd ratio
1,Grade,-0.185973,0.830296
2,Age,0.006319,1.006339
3,Month,-0.308313,0.734685
4,Weekday,-0.184893,0.831193
5,Subject,0.094359,1.098954
6,Teacher,0.074052,1.076863
7,Reason 1,1.487726,4.427019
8,Reason 2,0.976798,2.655938
9,Reason 3,1.458102,4.297796
10,Reason 4,0.963487,2.62082


In [286]:
summary_table.sort_values('Odd ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odd ratio
7,Reason 1,1.487726,4.427019
9,Reason 3,1.458102,4.297796
8,Reason 2,0.976798,2.655938
10,Reason 4,0.963487,2.62082
5,Subject,0.094359,1.098954
6,Teacher,0.074052,1.076863
2,Age,0.006319,1.006339
4,Weekday,-0.184893,0.831193
1,Grade,-0.185973,0.830296
3,Month,-0.308313,0.734685


In [287]:
#Test the model
reg.score(X_test, y_test)

1.0

In [289]:
predicted_proba = reg.predict_proba(X_test)

In [290]:
predicted_proba.shape

(10, 2)

In [291]:
predicted_proba[:,1]

array([0.08472737, 0.92605655, 0.0845614 , 0.04749746, 0.84438312,
       0.10295378, 0.06920129, 0.07634013, 0.03480008, 0.57576578])