# Machine Learning Project: Classification 5
Calvin Wong<br>
Mohammed Hussain

In [1]:
# Useful libraries
import pandas as pd
import numpy as np
import sys

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report
from sklearn import metrics  
from sklearn import tree

# Data preprocessing

In [2]:
Test = pd.read_csv("TestData5.csv", header=None)

#Replace all 1.000000e+99 with nan
Test[Test > 100] = np.nan

# Separate numerical columns
numerical_cols = Test.select_dtypes(include=['float64', 'int64']).columns

# Imput missing values for numerical columns using the mean
for col in numerical_cols:
    if Test[col].isnull().sum() > 0:
        mean_value = Test[col].mean()
        Test[col] = Test[col].fillna(mean_value)
        #print(f"Imputed missing values in '{col}' with mean: {mean_value}")
Test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,7.9,0.600,0.06,1.6,0.069,15.0,59.000000,0.996,3.30,0.46,9.4
1,7.5,0.500,0.36,6.1,0.071,17.0,40.056541,0.998,3.35,0.80,10.5
2,7.8,0.610,0.29,1.6,0.114,9.0,29.000000,0.997,3.26,1.56,9.1
3,8.5,0.280,0.56,1.8,0.092,35.0,40.056541,0.997,3.30,0.75,10.5
4,8.1,0.560,0.28,1.7,0.368,16.0,56.000000,0.997,3.11,1.28,9.3
...,...,...,...,...,...,...,...,...,...,...,...
475,7.5,0.520,0.40,2.2,0.060,12.0,20.000000,0.995,3.26,0.64,11.8
476,6.2,0.560,0.09,1.7,0.053,24.0,32.000000,0.994,3.54,0.60,11.3
477,7.4,0.350,0.33,2.4,0.068,9.0,26.000000,0.995,3.36,0.60,11.9
478,6.2,0.560,0.09,1.7,0.053,24.0,32.000000,0.994,3.54,0.60,11.3


In [3]:
Train = pd.read_csv("TrainData5.csv", header=None)
Label = pd.read_csv("TrainLabel5.csv", header=None)
df = pd.concat([Label,Train], axis=1, ignore_index=True)

#Replace all 1.000000e+99 with nan
df[df > 100] = np.nan

# Separate numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Imput missing values for numerical columns using the mean
for col in numerical_cols:
    if df[col].isnull().sum() > 0:
        mean_value = df[col].mean()
        df[col] = df[col].fillna(mean_value)
        #print(f"Imputed missing values in '{col}' with mean: {mean_value}")
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,5,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.998,3.51,0.56,9.4
1,5,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.997,3.20,0.68,9.8
2,5,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,6,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,5,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.998,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...,...
1114,6,6.8,0.62,0.08,1.9,0.068,28.0,38.0,0.997,3.42,0.82,9.5
1115,5,6.2,0.60,0.08,2.0,0.090,32.0,44.0,0.995,3.45,0.58,10.5
1116,6,5.9,0.55,0.10,2.2,0.062,39.0,51.0,0.995,3.52,0.76,11.2
1117,6,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.996,3.42,0.75,11.0


# SVM Model

In [4]:
# Split into features (X) and target variable (y)
X = df.drop(df.columns[0], axis=1)
y = df[df.columns[0]]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create an SVM model with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_scaled)

# Evaluate the model's accuracy
f1 = f1_score(y_test, y_pred,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred,average='weighted', zero_division= np.nan)
precision = precision_score(y_test, y_pred,average='weighted', zero_division= np.nan)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 0.5803571428571429
F1 Score: 0.525058810556495
Recall: 0.5803571428571429
Precision: 0.5924946495781162


# KNN Model

In [5]:
# Split into features (X) and target variable (y)
X = df.drop(df.columns[0], axis=1)
y = df[df.columns[0]]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Initialize the KNN classifier with k=5
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_test_scaled)

# Evaluate the model's accuracy
f1 = f1_score(y_test, y_pred,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred,average='weighted')
precision = precision_score(y_test, y_pred,average='weighted', zero_division= np.nan)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)


Accuracy: 0.5803571428571429
F1 Score: 0.5637251166573912
Recall: 0.5803571428571429
Precision: 0.594673888512751


# Predict labels for testing data

In [6]:
# Split into features (X) and target variable (y)
X = df.drop(df.columns[0], axis=1)
y = df[df.columns[0]]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Test_scaled = scaler.transform(Test)

# Create an SVM model with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(Test_scaled)


print("\nPrediction labels for test set")
print(y_pred)

#print("Actual labels for training set")
#y_test_array = y_test.to_numpy()
#print(y_test_array)


Prediction labels for test set
[5 6 6 6 5 5 5 5 5 6 6 5 6 5 5 5 5 5 5 6 6 5 6 6 5 5 5 6 5 5 5 5 5 5 5 6 5
 6 5 3 6 5 5 5 5 5 6 6 5 6 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 5 5 6 5 6 6 6 5 5
 5 5 5 6 6 5 5 5 6 6 6 6 6 5 5 6 5 5 5 6 5 6 6 6 6 5 6 6 6 5 5 6 6 6 6 6 6
 5 5 5 5 5 5 6 5 6 6 6 5 6 6 5 6 6 6 6 6 5 6 5 5 5 6 6 6 5 5 5 6 6 6 6 6 6
 5 6 5 5 6 6 5 6 6 6 6 5 5 6 5 6 5 6 6 6 5 5 6 6 6 6 5 5 5 5 6 6 6 5 5 5 5
 6 6 6 5 5 6 6 5 5 5 6 5 5 5 6 5 6 6 5 6 5 5 5 5 5 5 5 5 5 6 5 5 6 5 6 5 5
 6 6 5 5 6 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 5 5 5 6 6 6 6 6 6 5 5 6 6
 6 6 6 5 5 6 5 6 6 6 6 6 6 5 6 6 5 5 5 6 6 6 6 6 6 6 6 6 6 6 6 5 6 6 6 6 5
 6 6 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6 6 5 5 6 6 6 6 5 6 6 6 5 6 6 6 5 5 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 5 5 6 6 6 6 6 5 6 6 6 5 6 5 5 5 6 6 6 6 6 6 5 6
 6 5 5 5 6 6 5 6 6 5 5 6 6 5 6 6 5 6 5 6 5 6 6 5 5 5 5 5 6 5 5 6 6 6 5 5 5
 5 6 5 5 5 5 6 6 5 5 5 5 6 5 5 6 6 6 6 6 5 5 6 6 5 5 5 6 5 6 5 5 6 5 6 5 5
 6 5 5 6 6 6 6 6 6 5 5 5 6 6 5 6 6 5 6 6 6 6 5 6 6 5 5 6 6 5 6 6 6 6