# Phase 2 — Baseline Models

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
import os
from pathlib import Path
import pandas as pd

### Loading the Dataset

In [4]:
# loading data
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")
test_df = pd.read_csv("test.csv")

# takeing a peak at data 
train_df.head()

# turning them into matreses 
X_train = train_df[[f"p{i}" for i in range(1, 43)] + ["turn"]].values.astype("int")
y_train = train_df["label_move_col"].values.astype("int")

X_val = val_df[[f"p{i}" for i in range(1, 43)] + ["turn"]].values.astype("int")
y_val = val_df["label_move_col"].values.astype("int")

X_test = test_df[[f"p{i}" for i in range(1, 43)] + ["turn"]].values.astype("int")



### Standardization of Features

In [5]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)
X_test_scaled = scaler.fit_transform(X_test)

train_df["label_move_col"].value_counts().sort_index()

label_move_col
0     7136
1     4111
2     4923
3    17653
4     4923
5     4111
6     7136
Name: count, dtype: int64

# Logistic Regression Model
### Training and Evaluation

In [6]:
# training logistic regression

Logistic_regression_model =  LogisticRegression(max_iter = 3000 , solver ="lbfgs" ,class_weight="balanced" , C=1 ,random_state=42 )

Logistic_regression_model.fit(X_train_scaled , y_train)

# predict on validation data
y_val_pred = Logistic_regression_model.predict(X_val_scaled)

#accuracy
val_acc = accuracy_score(y_val , y_val_pred)
print("validation accuracy: ", val_acc)

# confusion matrix
print("\nconfusion matrix: \n", confusion_matrix(y_val , y_val_pred))
print("\n\nclassification report: \n", classification_report(y_val , y_val_pred , digits =3))

validation accuracy:  0.17672173751624282

confusion matrix: 
 [[194 208 259 185 249 219 136]
 [ 83 207 169  85 133 148 100]
 [ 90 133 281 123 175 137  94]
 [364 575 769 542 769 575 364]
 [ 94 137 175 123 281 133  90]
 [100 148 133  85 169 207  83]
 [138 219 249 185 259 208 192]]


classification report: 
               precision    recall  f1-score   support

           0      0.183     0.134     0.154      1450
           1      0.127     0.224     0.162       925
           2      0.138     0.272     0.183      1033
           3      0.408     0.137     0.205      3958
           4      0.138     0.272     0.183      1033
           5      0.127     0.224     0.162       925
           6      0.181     0.132     0.153      1450

    accuracy                          0.177     10774
   macro avg      0.186     0.199     0.172     10774
weighted avg      0.247     0.177     0.180     10774



 decision tree Model
Training and Evaluation

In [7]:
# training decision tree
Decision_tree_model = DecisionTreeClassifier(random_state=42, class_weight="balanced")
Decision_tree_model.fit(X_train_scaled, y_train)

# predict on validation data
y_val_pred = Decision_tree_model.predict(X_val_scaled)

# accuracy
val_acc = accuracy_score(y_val, y_val_pred)
print("Validation accuracy: ", val_acc)

# confusion matrix
print("\nconfusion matrix: \n", confusion_matrix(y_val, y_val_pred))


Validation accuracy:  0.48422127343604976

confusion matrix: 
 [[ 515  110  122  294  118   68  223]
 [ 103  359   72  155   79   72   85]
 [ 107   63  421  175   80   66  121]
 [ 280  183  193 2619  189  193  301]
 [ 135   68   73  172  425   60  100]
 [  93   68   78  157   65  357  107]
 [ 229   89  104  299   99  109  521]]


# Random Forest Model
### Training and Evaluation

In [8]:
#  Training the Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluating on the validation set
y_pred = rf.predict(X_val)
print("Random Forest Accuracy is:", accuracy_score(y_val, y_pred))

# Compute confusion matrix
cm = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(cm)

Random Forest Accuracy is: 0.5825134583255986
Confusion Matrix:
[[ 667   50   70  369   76   37  181]
 [ 100  345   40  272   55   39   74]
 [  98   38  440  267   46   36  108]
 [ 162   56   72 3401   71   54  142]
 [ 120   48   45  252  445   30   93]
 [  84   37   59  277   33  333  102]
 [ 189   42   75  371   72   56  645]]
