In [39]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.datasets import make_classification 
from imblearn.over_sampling import SMOTE 
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

In [40]:
# Loading data
attendance_model_df = pd.read_csv("./Resources/attendance_model_data.csv")
attendance_model_df.head()

Unnamed: 0,year,month,member,small_family_group,medium_family_group,large_family_group,adult,student,military,teen,child,two_and_under,senior,salt_lake_county,out_of_state,utah_county,other_in_state_county,davis_county,unknown_county
0,2022,2,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
1,2022,2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
2,2022,2,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
3,2022,2,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
4,2022,2,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0


In [41]:
# Define features set
X = attendance_model_df.copy()
X.drop("member", axis=1, inplace=True)
X.head()

Unnamed: 0,year,month,small_family_group,medium_family_group,large_family_group,adult,student,military,teen,child,two_and_under,senior,salt_lake_county,out_of_state,utah_county,other_in_state_county,davis_county,unknown_county
0,2022,2,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
1,2022,2,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
2,2022,2,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
3,2022,2,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
4,2022,2,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0


In [42]:
# Define target vector
y = attendance_model_df["member"].ravel()
y[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [43]:
# define dataset 
x, y = make_classification(n_samples=5000) 
smote = SMOTE() 
x_smote, y_smote = smote.fit_resample(x, y) 
  
# print the features and the labels 
print('x_smote:\n', x_smote) 
print('y_smote:\n', y_smote) 

x_smote:
 [[ 1.70935668 -0.45897228  0.65137255 ... -1.4930817   0.224908
  -0.50630468]
 [-0.35191743 -0.36028028  1.08723356 ... -1.23923129  0.72404851
   1.24254108]
 [-0.98040519 -1.32817969  0.01009789 ...  2.06773885 -1.57774854
  -0.39447474]
 ...
 [ 2.73086601 -1.08649777  1.01756933 ...  0.01225409  0.3604994
  -0.65946254]
 [-0.85207561  0.17028021  0.96594904 ...  0.65771031  1.13027911
  -0.81371798]
 [ 0.52886869 -1.03599509 -0.08718251 ...  1.00724422  1.1305199
   0.17340104]]
y_smote:
 [0 0 1 ... 1 1 1]


In [44]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(x_smote, y_smote, random_state=24)

In [45]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [46]:
# Fitting the Random Forest Model
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=5, random_state=24)

In [47]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [48]:
# Making Predictions Using the Random Forest Model
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [49]:
# Model Evaluation
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

In [50]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,597,32
Actual 1,42,581


Accuracy Score : 0.9408945686900958
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       629
           1       0.95      0.93      0.94       623

    accuracy                           0.94      1252
   macro avg       0.94      0.94      0.94      1252
weighted avg       0.94      0.94      0.94      1252



In [51]:
#Tune to optimize performance and accuracy score with higher n_samples (not 150,000 lol) and different n_estimators (probably between 20 and 5)
#Document iterations in excel document (reporting classification scores for each parameter)

In [None]:
#Then explore feature importance