<a href="https://colab.research.google.com/github/Adam-Aber/Evaluation-of-11-Machine-Learning-Models-on-the-OEDI-Dataset/blob/main/Evaluation_of_11_Machine_Learning_Models_on_the_OEDI_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Load the cleaned OEDI dataset.
df = pd.read_csv("cleaned_oedi_dataset.csv")

# Convert the True/False one-hot columns to 1s and 0s.
# An explicit dtype cast is used instead of `df.replace({True: 1, False: 0})`:
# the replace form relies on pandas' silent downcasting, which is deprecated and
# emits a FutureWarning. Only bool-dtype columns are touched; every other column
# is copied through unchanged.
bool_columns = df.select_dtypes(include="bool").columns
converted_df = df.astype({column: "int64" for column in bool_columns})

# Persist the numeric version so the modelling cells can load it directly.
converted_df.to_csv('converted_dataset.csv', index=False)

  converted_df = df.replace({True: 1, False: 0})


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Load the 0/1-encoded dataset written by the conversion cell.
df_encoded = pd.read_csv("converted_dataset.csv")

# Drop rows with a missing label BEFORE building X and y, so the features and the
# target are derived once from the same filtered frame. (The previous version
# built X and y twice; the first pair was discarded immediately.)
df_encoded = df_encoded.dropna(subset=['theft_binary'])

X = df_encoded.drop(columns=["theft_binary"])  # features: every column except the label
y = df_encoded["theft_binary"]  # label / target for theft detection

# 80/20 split; stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")
print(f"Class distribution in training set:\n{y_train.value_counts(normalize=True)}")
print(f"Class distribution in testing set:\n{y_test.value_counts(normalize=True)}")

Training set shape: (448524, 24)
Testing set shape: (112131, 24)
Class distribution in training set:
theft_binary
0    0.59185
1    0.40815
Name: proportion, dtype: float64
Class distribution in testing set:
theft_binary
0    0.591852
1    0.408148
Name: proportion, dtype: float64


In [None]:
# Decision Tree: fit on the training split, then report on the held-out split.
# Uses X_train / X_test / y_train / y_test created by the train/test split cell.

from sklearn.tree import DecisionTreeClassifier

# random_state is pinned to 42; fit() returns the estimator, so it can be chained.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# target_names are applied in label order — assumes 0 = Normal, 1 = Theft.
dt_report = classification_report(y_test, y_pred_dt, target_names=["Normal", "Theft"])
dt_confusion = confusion_matrix(y_test, y_pred_dt)

print("Decision Tree Classification Report:", dt_report, sep="\n")
print("\nConfusion Matrix:", dt_confusion, sep="\n")

Decision Tree Classification Report:
              precision    recall  f1-score   support

      Normal       0.93      0.94      0.94     66365
       Theft       0.91      0.90      0.91     45766

    accuracy                           0.92    112131
   macro avg       0.92      0.92      0.92    112131
weighted avg       0.92      0.92      0.92    112131


Confusion Matrix:
[[62319  4046]
 [ 4588 41178]]


In [None]:
# Random Forest: fit on the training split, then report on the held-out split.
# Uses X_train / X_test / y_train / y_test created by the train/test split cell.

from sklearn.ensemble import RandomForestClassifier

# random_state is pinned to 42; n_jobs=-1 asks scikit-learn to use all available cores.
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# target_names are applied in label order — assumes 0 = Normal, 1 = Theft.
rf_report = classification_report(y_test, y_pred_rf, target_names=["Normal", "Theft"])
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print("Random Forest Classification Report:", rf_report, sep="\n")
print("\nConfusion Matrix:", rf_confusion, sep="\n")

Random Forest Classification Report:
              precision    recall  f1-score   support

      Normal       0.93      0.95      0.94     66365
       Theft       0.93      0.90      0.91     45766

    accuracy                           0.93    112131
   macro avg       0.93      0.93      0.93    112131
weighted avg       0.93      0.93      0.93    112131


Confusion Matrix:
[[63124  3241]
 [ 4562 41204]]


In [None]:
# Gaussian Naive Bayes: fit on the training split, then report on the held-out split.
# Uses X_train / X_test / y_train / y_test created by the train/test split cell.

from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

# target_names are applied in label order — assumes 0 = Normal, 1 = Theft.
nb_report = classification_report(y_test, y_pred_nb, target_names=["Normal", "Theft"])
nb_confusion = confusion_matrix(y_test, y_pred_nb)

print("Naive Bayes Classification Report:", nb_report, sep="\n")
print("\nConfusion Matrix:", nb_confusion, sep="\n")

# Accuracy on both splits, printed side by side so a train/test gap is easy to spot.
nb_train_accuracy = nb_model.score(X_train, y_train)
print('Accuracy of GaussianNB classifier on training set: {:.6f}'.format(nb_train_accuracy))
nb_test_accuracy = nb_model.score(X_test, y_test)
print('Accuracy of GaussianNB classifier on test set: {:.6f}'.format(nb_test_accuracy))

Naive Bayes Classification Report:
              precision    recall  f1-score   support

      Normal       0.62      1.00      0.76     66365
       Theft       1.00      0.11      0.19     45766

    accuracy                           0.64    112131
   macro avg       0.81      0.55      0.48    112131
weighted avg       0.77      0.64      0.53    112131


Confusion Matrix:
[[66365     0]
 [40894  4872]]
Accuracy of GaussianNB classifier on training set: 0.635482
Accuracy of GaussianNB classifier on test set: 0.635302


In [None]:
# SVM with a linear kernel, trained on the split produced by the train/test split cell.
# NOTE(review): the features reach the SVM unscaled here; SVMs are generally
# sensitive to feature scale — confirm this is intended.

from sklearn.svm import SVC

# fit() returns the estimator, so construction and training are chained.
clf = SVC(kernel="linear").fit(X_train, y_train)

# Bare last expression: the notebook displays the test-set accuracy as the cell output.
clf.score(X_test, y_test)

0.7973174233708787

In [None]:
# Report for whichever SVC is currently bound to `clf`
# (the linear-kernel model when the cells are run top to bottom).
y_pred_svm = clf.predict(X_test)

# target_names are applied in label order — assumes 0 = Normal, 1 = Theft.
svm_report = classification_report(y_test, y_pred_svm, target_names=["Normal", "Theft"])
svm_confusion = confusion_matrix(y_test, y_pred_svm)

print("SVM Classification Report:", svm_report, sep="\n")
print("\nConfusion Matrix:", svm_confusion, sep="\n")

SVM Classification Report:
              precision    recall  f1-score   support

      Normal       0.76      0.95      0.85     66365
       Theft       0.89      0.58      0.70     45766

    accuracy                           0.80    112131
   macro avg       0.83      0.76      0.77    112131
weighted avg       0.81      0.80      0.79    112131


Confusion Matrix:
[[63020  3345]
 [19382 26384]]


In [None]:
# SVM with an RBF kernel.
# Rebinds `clf`, replacing the previously fitted SVM; `SVC` itself is imported
# in the linear-kernel cell.

clf = SVC(kernel="rbf").fit(X_train, y_train)

rbf_accuracy = clf.score(X_test, y_test)  # accuracy on the held-out split
print("rbf accuracy: ")
print(rbf_accuracy)

rbf accuracy: 
0.9097662555403947


In [None]:
# Report for whichever SVC is currently bound to `clf`
# (the RBF-kernel model when the cells are run top to bottom).
y_pred_svm = clf.predict(X_test)

# target_names are applied in label order — assumes 0 = Normal, 1 = Theft.
svm_report = classification_report(y_test, y_pred_svm, target_names=["Normal", "Theft"])
svm_confusion = confusion_matrix(y_test, y_pred_svm)

print("SVM Classification Report:", svm_report, sep="\n")
print("\nConfusion Matrix:", svm_confusion, sep="\n")

SVM Classification Report:
              precision    recall  f1-score   support

      Normal       0.87      0.99      0.93     66365
       Theft       0.99      0.79      0.88     45766

    accuracy                           0.91    112131
   macro avg       0.93      0.89      0.90    112131
weighted avg       0.92      0.91      0.91    112131


Confusion Matrix:
[[65960   405]
 [ 9713 36053]]


In [None]:
# SVM with a degree-3 polynomial kernel.
# Rebinds `clf`, replacing the previously fitted SVM; `SVC` itself is imported
# in the linear-kernel cell.

clf = SVC(kernel="poly", degree=3).fit(X_train, y_train)

poly_accuracy = clf.score(X_test, y_test)  # accuracy on the held-out split
print("poly accuracy: ")
print(poly_accuracy)

poly accuracy: 
0.8927326073967057


In [None]:
# Report for whichever SVC is currently bound to `clf`
# (the polynomial-kernel model when the cells are run top to bottom).
y_pred_svm = clf.predict(X_test)

# target_names are applied in label order — assumes 0 = Normal, 1 = Theft.
svm_report = classification_report(y_test, y_pred_svm, target_names=["Normal", "Theft"])
svm_confusion = confusion_matrix(y_test, y_pred_svm)

print("SVM Classification Report:", svm_report, sep="\n")
print("\nConfusion Matrix:", svm_confusion, sep="\n")

SVM Classification Report:
              precision    recall  f1-score   support

      Normal       0.86      0.99      0.92     66365
       Theft       0.97      0.76      0.85     45766

    accuracy                           0.89    112131
   macro avg       0.91      0.87      0.88    112131
weighted avg       0.90      0.89      0.89    112131


Confusion Matrix:
[[65377   988]
 [11040 34726]]


In [None]:
from sklearn.svm import SVC

# SVM with a sigmoid kernel.
# Rebinds `clf`, replacing the previously fitted SVM.
clf = SVC(kernel="sigmoid").fit(X_train, y_train)

sigmoid_accuracy = clf.score(X_test, y_test)  # accuracy on the held-out split
print("Sigmoid accuracy: ")
print(sigmoid_accuracy)

Sigmoid accuracy: 
0.5463163621121724


## Second Part: KNN, Logistic Regression, Linear Regression, Neural Network

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix


df = pd.read_csv("cleaned_oedi_dataset.csv")  # load the cleaned dataset

# Convert the True/False one-hot columns to 1/0 integers with an explicit dtype
# cast. `df.replace({True: 1, False: 0})` relies on pandas' deprecated silent
# downcasting and emits a FutureWarning.
bool_columns = df.select_dtypes(include="bool").columns
df = df.astype({column: "int64" for column in bool_columns})

df = df.dropna(subset=['theft_binary'])  # remove rows with a missing target value

X = df.drop(columns=["theft_binary"])  # features: every column except the label
y = df["theft_binary"]  # target the model learns to predict

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/variance applied to the training data (data
# leakage), which biases the evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)  # test rows reuse the training statistics

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)  # train the model

y_pred = knn.predict(X_test)  # predict on the held-out split

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


  df = df.replace({True: 1, False: 0}) # converts the true/false into boolean values


Confusion Matrix:
 [[66096   269]
 [ 4816 40950]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96     66365
           1       0.99      0.89      0.94     45766

    accuracy                           0.95    112131
   macro avg       0.96      0.95      0.95    112131
weighted avg       0.96      0.95      0.95    112131



In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

df = pd.read_csv("cleaned_oedi_dataset.csv")

# Convert the True/False one-hot columns to 1/0 integers with an explicit dtype
# cast. `df.replace({True: 1, False: 0})` relies on pandas' deprecated silent
# downcasting and emits a FutureWarning.
bool_columns = df.select_dtypes(include="bool").columns
df = df.astype({column: "int64" for column in bool_columns})

df = df.dropna(subset=['theft_binary'])  # remove rows with a missing target value

X = df.drop(columns=["theft_binary"]).astype(np.float32)  # features as float32
y = df["theft_binary"]  # target

# Split BEFORE scaling (80/20, stratified). Fitting the scaler on the full
# dataset would let the test rows influence the mean/variance applied to the
# training data (data leakage), which biases the evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)  # test rows reuse the training statistics

# max_iter raised to 1000 to give the solver more iterations to converge.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)  # train the model

y_pred = logreg.predict(X_test)  # predict on the held-out split

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


  df = df.replace({True: 1, False: 0})


Confusion Matrix:
 [[60411  5954]
 [17457 28309]]

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.91      0.84     66365
           1       0.83      0.62      0.71     45766

    accuracy                           0.79    112131
   macro avg       0.80      0.76      0.77    112131
weighted avg       0.80      0.79      0.78    112131



In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, classification_report

df = pd.read_csv("cleaned_oedi_dataset.csv")

# Convert the True/False one-hot columns to 1/0 integers with an explicit dtype
# cast. `df.replace({True: 1, False: 0})` relies on pandas' deprecated silent
# downcasting and emits a FutureWarning.
bool_columns = df.select_dtypes(include="bool").columns
df = df.astype({column: "int64" for column in bool_columns})

df = df.dropna(subset=['theft_binary'])  # remove rows with a missing target value
X = df.drop(columns=["theft_binary"]).astype(np.float32)  # features as float32
y = df["theft_binary"]  # target

# Split BEFORE scaling (80/20, stratified). Fitting the scaler on the full
# dataset would let the test rows influence the mean/variance applied to the
# training data (data leakage), which biases the evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)  # test rows reuse the training statistics

linreg = LinearRegression()
linreg.fit(X_train, y_train)  # train the model

# Linear regression outputs a continuous score; threshold it at 0.5 to obtain
# a 0/1 class prediction.
y_pred_continuous = linreg.predict(X_test)
y_pred = (y_pred_continuous >= 0.5).astype(int)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


  df = df.replace({True: 1, False: 0})


Confusion Matrix:
 [[65187  1178]
 [25564 20202]]

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.98      0.83     66365
           1       0.94      0.44      0.60     45766

    accuracy                           0.76    112131
   macro avg       0.83      0.71      0.72    112131
weighted avg       0.81      0.76      0.74    112131



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

df = pd.read_csv("cleaned_oedi_dataset.csv")

# Convert the True/False one-hot columns to 1/0 integers with an explicit dtype
# cast. `df.replace({True: 1, False: 0})` relies on pandas' deprecated silent
# downcasting and emits a FutureWarning.
bool_columns = df.select_dtypes(include="bool").columns
df = df.astype({column: "int64" for column in bool_columns})

df = df.dropna(subset=['theft_binary'])  # remove rows with a missing target value
X = df.drop(columns=["theft_binary"]).astype(np.float32)  # features as float32
y = df["theft_binary"]  # target

# Split BEFORE scaling (80/20, stratified). Fitting the scaler on the full
# dataset would let the test rows influence the mean/variance applied to the
# training data (data leakage), which biases the evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)  # test rows reuse the training statistics

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Explicit Input layer: passing input_shape= to the first Dense layer
    # triggers a Keras warning.
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),   # first hidden layer: 64 neurons, ReLU activation
    Dense(32, activation='relu'),   # second hidden layer: 32 neurons, ReLU activation
    Dense(1, activation='sigmoid')  # output layer: 1 neuron, sigmoid keeps values in (0, 1)
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# validation_split=0.2 holds out part of the training data for per-epoch validation.
history = model.fit(X_train, y_train, validation_split=0.2, epochs=10, batch_size=256, verbose=1)

# Threshold the sigmoid output at 0.5 to obtain a 0/1 class prediction.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob >= 0.5).astype(int)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


  df = df.replace({True: 1, False: 0})
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1402/1402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.7798 - loss: 0.4706 - val_accuracy: 0.8782 - val_loss: 0.3262
Epoch 2/10
[1m1402/1402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8829 - loss: 0.3134 - val_accuracy: 0.9018 - val_loss: 0.2780
Epoch 3/10
[1m1402/1402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9016 - loss: 0.2787 - val_accuracy: 0.9095 - val_loss: 0.2603
Epoch 4/10
[1m1402/1402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.9097 - loss: 0.2602 - val_accuracy: 0.9144 - val_loss: 0.2485
Epoch 5/10
[1m1402/1402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9143 - loss: 0.2500 - val_accuracy: 0.9194 - val_loss: 0.2421
Epoch 6/10
[1m1402/1402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.9192 - loss: 0.2402 - val_accuracy: 0.9230 - val_loss: 0.2341
Epoch 7/10
[1