In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the training sheet from the Excel workbook located in the
# current working directory.
df_train = pd.read_excel(io="./train.xlsx")

In [3]:
# Name of the label column; every later cell refers to the label
# through this variable.
target_column = "target"

In [4]:
# Handle missing values: replace NaNs in each numeric column with that
# column's mean.
# numeric_only=True keeps string-valued columns (the class label holds
# values such as 'A1') out of the mean computation. Without it,
# pandas >= 2.0 raises TypeError and older releases emit a FutureWarning.
# The result is rebound instead of using inplace=True so the cell is a
# plain, re-runnable assignment.
df_train = df_train.fillna(df_train.mean(numeric_only=True))

  df_train.fillna(df_train.mean(), inplace=True)


In [5]:
# Training features: every column of the training frame except the label.
X_train = df_train.drop(target_column, axis=1)

In [6]:
# Load the test sheet and fill missing numeric values with column means.
df_test = pd.read_excel("test.xlsx")
# numeric_only=True restricts the mean to numeric columns, so any
# string-valued column in the sheet cannot make mean() raise TypeError
# (pandas >= 2.0) or trigger a FutureWarning (older releases).
# NOTE(review): this imputes with the test set's own column means, not the
# training means - confirm that is intended.
df_test = df_test.fillna(df_test.mean(numeric_only=True))


In [7]:
# Labels for the training rows.
y_train = df_train[target_column]

# Test features: drop the label column when the test sheet has one;
# errors='ignore' turns a missing label column into a no-op.
X_test = df_test.drop(columns=[target_column], errors='ignore')

# Test labels exist only if the test sheet ships with the label column.
if target_column in df_test.columns:
    y_test = df_test[target_column]
else:
    y_test = None

In [8]:
# Restrict both feature frames to their numeric columns; non-numeric
# columns are left out of the matrices passed to the scaler.
import numpy as np

X_train_numeric = X_train.select_dtypes(include=np.number)
X_test_numeric = X_test.select_dtypes(include=np.number)

In [9]:
# Standardize the features. The scaler learns its statistics from the
# training matrix only and then applies that same transform to both sets.
scaler = StandardScaler().fit(X_train_numeric)
X_train_scaled = scaler.transform(X_train_numeric)
X_test_scaled = scaler.transform(X_test_numeric)

In [10]:
# Build the random forest classifier (fitting happens in the next cell).
# random_state is pinned so repeated runs are reproducible.
models_random = RandomForestClassifier(
    random_state=0,
)
    

In [11]:

# Fit the forest on the standardized training features and their labels.
models_random.fit(X=X_train_scaled, y=y_train)

RandomForestClassifier(random_state=0)

In [12]:
# Random forest labels for the standardized test features.
rf_test_predictions = models_random.predict(X_test_scaled)

# Keep the predictions as a column on the test frame; nothing is written
# to disk in this cell.
df_test = df_test.assign(RandomForest_Prediction=rf_test_predictions)


In [13]:
# Training-set metrics for the random forest.
# The model is scored on the same rows it was fitted on, so these numbers
# describe fit quality, not generalization.
# Predict once and reuse the result for both metrics instead of running
# the forest over the training matrix twice.
rf_train_predictions = models_random.predict(X_train_scaled)
rf_train_accuracy = accuracy_score(y_train, rf_train_predictions)
print(f"Random Forest Train Accuracy: {rf_train_accuracy:.4f}")
print(f"Random Forest Classification Report:\n{classification_report(y_train, rf_train_predictions)}")

Random Forest Train Accuracy: 0.9994
Random Forest Classification Report:
              precision    recall  f1-score   support

          A1       1.00      1.00      1.00       215
         A10       1.00      1.00      1.00       204
         A11       1.00      1.00      1.00       212
         A12       1.00      1.00      1.00       203
         A13       1.00      1.00      1.00       219
         A14       1.00      1.00      1.00       418
         A15       1.00      1.00      1.00       413
         A16       1.00      1.00      1.00       210
         A17       1.00      1.00      1.00       204
         A18       1.00      1.00      1.00       189
         A19       1.00      1.00      1.00       208
          A2       1.00      1.00      1.00       204
         A20       1.00      1.00      1.00       205
         A21       1.00      1.00      1.00       411
         A22       1.00      1.00      1.00       210
         A23       1.00      1.00      1.00       202
       

# LOGISTIC REGRESSION

In [14]:
# Fit a logistic regression on the standardized training data.
# max_iter is raised to 1000 and the seed is pinned; fit() returns the
# estimator itself, so construction and fitting can be chained.
model_logistic = LogisticRegression(max_iter=1000, random_state=0).fit(
    X_train_scaled, y_train
)

# Logistic regression labels for the standardized test features.
lr_test_predictions = model_logistic.predict(X_test_scaled)

# Keep the predictions as a column on the test frame.
df_test = df_test.assign(LogisticRegression_Prediction=lr_test_predictions)


In [15]:

# Training-set metrics for the logistic regression.
# The model is scored on the same rows it was fitted on, so these numbers
# describe fit quality, not generalization.
# Predict once and reuse the result for both metrics instead of scoring
# the training matrix twice.
lr_train_predictions = model_logistic.predict(X_train_scaled)
lr_train_accuracy = accuracy_score(y_train, lr_train_predictions)
print(f"Logistic Regression Train Accuracy: {lr_train_accuracy:.4f}")
print(f"Logistic Regression Classification Report:\n{classification_report(y_train, lr_train_predictions)}")

Logistic Regression Train Accuracy: 0.9775
Logistic Regression Classification Report:
              precision    recall  f1-score   support

          A1       0.93      0.92      0.93       215
         A10       0.83      0.85      0.84       204
         A11       0.95      1.00      0.97       212
         A12       1.00      1.00      1.00       203
         A13       1.00      1.00      1.00       219
         A14       0.99      1.00      1.00       418
         A15       0.99      0.95      0.97       413
         A16       1.00      1.00      1.00       210
         A17       0.97      0.93      0.95       204
         A18       0.99      1.00      1.00       189
         A19       0.99      1.00      0.99       208
          A2       1.00      1.00      1.00       204
         A20       1.00      1.00      1.00       205
         A21       0.89      0.93      0.91       411
         A22       1.00      1.00      1.00       210
         A23       1.00      1.00      1.00      

# SUPPORT VECTOR MACHINE

In [16]:
# Fit a support vector classifier on the standardized training data.
# fit() returns the estimator itself, so construction and fitting are
# chained into one statement.
svc_model = SVC(random_state=0).fit(X_train_scaled, y_train)

# SVC labels for the standardized test features.
svc_test_predictions = svc_model.predict(X_test_scaled)

In [17]:
# Keep the SVC predictions as a column on the test frame.
df_test = df_test.assign(SVC_Prediction=svc_test_predictions)



In [18]:
# Training-set metrics for the support vector classifier.
# The model is scored on the same rows it was fitted on, so these numbers
# describe fit quality, not generalization.
# Predict once and reuse the result for both metrics instead of scoring
# the training matrix twice.
svc_train_predictions = svc_model.predict(X_train_scaled)
svc_train_accuracy = accuracy_score(y_train, svc_train_predictions)
print(f"Support Vector Classifier Train Accuracy: {svc_train_accuracy:.4f}")
print(f"Support Vector Classifier Classification Report:\n{classification_report(y_train, svc_train_predictions)}")

Support Vector Classifier Train Accuracy: 0.9901
Support Vector Classifier Classification Report:
              precision    recall  f1-score   support

          A1       0.98      0.93      0.95       215
         A10       0.79      1.00      0.88       204
         A11       0.96      1.00      0.98       212
         A12       1.00      1.00      1.00       203
         A13       1.00      1.00      1.00       219
         A14       1.00      1.00      1.00       418
         A15       1.00      0.97      0.98       413
         A16       1.00      1.00      1.00       210
         A17       1.00      0.99      1.00       204
         A18       0.99      1.00      1.00       189
         A19       1.00      1.00      1.00       208
          A2       1.00      1.00      1.00       204
         A20       1.00      1.00      1.00       205
         A21       0.96      0.99      0.97       411
         A22       1.00      1.00      1.00       210
         A23       1.00      1.00    

# KNN

In [19]:
# Fit a k-nearest-neighbours classifier (library defaults) on the
# standardized training data. fit() returns the estimator itself.
knn_model = KNeighborsClassifier().fit(X_train_scaled, y_train)

# KNN labels for the standardized test features.
knn_test_predictions = knn_model.predict(X_test_scaled)

# Keep the predictions as a column on the test frame.
df_test = df_test.assign(KNN_Prediction=knn_test_predictions)

In [21]:
# Save the test frame, including the prediction columns added so far, to
# Excel. The path is relative to the current working directory.
# NOTE(review): presumably the Desktop folder must already exist there -
# confirm before running from another directory.
df_test.to_excel("./Desktop/predictions.xlsx", index=False)

# Training-set metrics for the k-nearest-neighbours classifier.
# The model is scored on the same rows it was fitted on, so these numbers
# describe fit quality, not generalization.
# Predict once and reuse the result for both metrics; for KNN every
# predict() call repeats the neighbour search over the training set.
knn_train_predictions = knn_model.predict(X_train_scaled)
knn_train_accuracy = accuracy_score(y_train, knn_train_predictions)
print(f"K-Nearest Neighbors Train Accuracy: {knn_train_accuracy:.4f}")
print(f"K-Nearest Neighbors Classification Report:\n{classification_report(y_train, knn_train_predictions)}")

K-Nearest Neighbors Train Accuracy: 0.9900
K-Nearest Neighbors Classification Report:
              precision    recall  f1-score   support

          A1       0.93      0.98      0.96       215
         A10       0.95      0.99      0.97       204
         A11       0.99      1.00      1.00       212
         A12       1.00      1.00      1.00       203
         A13       1.00      1.00      1.00       219
         A14       1.00      1.00      1.00       418
         A15       1.00      0.96      0.98       413
         A16       1.00      1.00      1.00       210
         A17       0.99      0.98      0.98       204
         A18       0.99      0.99      0.99       189
         A19       0.98      1.00      0.99       208
          A2       1.00      1.00      1.00       204
         A20       0.98      1.00      0.99       205
         A21       0.96      0.99      0.97       411
         A22       1.00      1.00      1.00       210
         A23       1.00      1.00      1.00      