# Machine Learning Project: Classification 3
Calvin Wong<br>
Mohammed Hussain

In [1]:
# Useful libraries
import pandas as pd
import numpy as np
import sys

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report
from sklearn import metrics  
from sklearn import tree

# Data preprocessing

In [2]:
Test = pd.read_csv("TestData3.csv", header=None)

#Replace all 1.000000e+99 with nan
Test[Test > 100] = np.nan

# Separate numerical columns
numerical_cols = Test.select_dtypes(include=['float64', 'int64']).columns

# Imput missing values for numerical columns using the mean
for col in numerical_cols:
    if Test[col].isnull().sum() > 0:
        mean_value = Test[col].mean()
        Test[col] = Test[col].fillna(mean_value)
        #print(f"Imputed missing values in '{col}' with mean: {mean_value}")
Test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1,5.0,3,4.0,1.0,5.0,1,3.0,0,1.0,1.0,7.0,1.0
1,2,1.0,4,3.0,5.0,5.0,3,2.0,0,1.0,1.0,7.0,1.0
2,1,5.0,2,3.0,9.0,2.0,1,4.0,1,3.0,1.0,7.0,1.0
3,1,2.0,2,2.0,3.0,2.0,1,4.0,2,3.0,3.0,7.0,1.0
4,2,5.0,3,4.0,1.0,2.0,1,1.0,0,2.0,2.0,7.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2688,2,5.0,1,1.0,2.0,5.0,1,3.0,2,3.0,1.0,7.0,1.0
2689,1,5.0,2,4.0,1.0,5.0,1,4.0,0,3.0,1.0,7.0,1.0
2690,2,5.0,1,2.0,1.0,5.0,1,3.0,2,3.0,1.0,7.0,1.0
2691,1,1.0,6,4.0,3.0,5.0,2,3.0,1,2.0,3.0,7.0,1.0


In [3]:
Train = pd.read_csv("TrainData3.csv", header=None)
Label = pd.read_csv("TrainLabel3.csv", header=None)
df = pd.concat([Label,Train], axis=1, ignore_index=True)

#Replace all 1.000000e+99 with nan
df[df > 1000] = np.nan

# Total number of entries
total = df.size

# Count of NaN values
nan_count = df.isna().sum().sum()

# Count of Non-NaN values
non_nan_count = total - nan_count
print("Missing value percentage:",nan_count/total * 100,"%")

# Initialize the KNNImputer
imputer = KNNImputer(n_neighbors=2) 

# Fit and transform the data
imputed_data = imputer.fit_transform(df)

# Convert the result back to a DataFrame
df = pd.DataFrame(imputed_data, columns=df.columns)

df

Missing value percentage: 2.1383219954648527 %


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,9.0,2.0,1.0,5.0,4.0,5.0,5.0,3.0,3.0,0.0,1.0,1.0,7.0,1.0
1,9.0,1.0,1.0,5.0,5.0,5.0,5.0,3.0,5.0,2.0,1.0,1.0,7.0,1.0
2,9.0,2.0,1.0,3.0,5.0,1.0,5.0,2.0,3.0,1.0,2.0,3.0,7.0,1.0
3,1.0,2.0,5.0,1.0,2.0,6.0,5.0,1.0,4.0,2.0,3.0,1.0,7.0,1.0
4,1.0,2.0,5.0,1.0,2.0,6.0,3.0,1.0,4.0,2.0,3.0,1.0,7.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6295,7.0,1.0,5.0,4.0,5.0,3.0,3.5,1.0,1.0,0.0,2.0,3.0,7.0,1.0
6296,9.0,2.0,1.0,3.0,3.0,9.0,5.0,3.0,4.0,2.0,1.0,1.0,8.0,1.0
6297,5.0,2.0,1.0,4.0,4.0,6.0,5.0,3.0,4.0,2.0,1.0,1.0,7.0,1.0
6298,9.0,1.0,3.0,3.0,4.0,1.0,5.0,1.0,1.0,0.0,1.0,1.0,7.0,1.0


# SVM Model

In [4]:
# Split into features (X) and target variable (y)
X = df.drop(df.columns[0], axis=1)
y = df[df.columns[0]]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create an SVM model with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_scaled)

# Evaluate the model's accuracy
f1 = f1_score(y_test, y_pred,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred,average='weighted', zero_division= np.nan)
precision = precision_score(y_test, y_pred,average='weighted', zero_division= np.nan)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 0.3388888888888889
F1 Score: 0.2573438567547597
Recall: 0.3388888888888889
Precision: 0.26347318998637914


# KNN Model

In [5]:
# Split into features (X) and target variable (y)
X = df.drop(df.columns[0], axis=1)
y = df[df.columns[0]]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Initialize the KNN classifier with k=5
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_test_scaled)

# Evaluate the model's accuracy
f1 = f1_score(y_test, y_pred,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred,average='weighted')
precision = precision_score(y_test, y_pred,average='weighted', zero_division= np.nan)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)


Accuracy: 0.32142857142857145
F1 Score: 0.30034679634441624
Recall: 0.32142857142857145
Precision: 0.29451070990001726


# Predict labels for testing data

In [6]:
# Split into features (X) and target variable (y)
X = df.drop(df.columns[0], axis=1)
y = df[df.columns[0]]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Test_scaled = scaler.transform(Test)

# Create an SVM model with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(Test_scaled)

np.set_printoptions(threshold=sys.maxsize)
print("\nPrediction labels for test set 3")
print(y_pred)

#print("Actual labels for training set")
#y_test_array = y_test.to_numpy()
#print(y_test_array)


Prediction labels for test set 3
[8. 8. 1. 1. 6. 1. 1. 7. 8. 8. 8. 8. 1. 8. 6. 6. 6. 8. 8. 8. 8. 8. 8. 2.
 4. 8. 1. 1. 1. 1. 1. 1. 8. 8. 1. 1. 1. 1. 7. 6. 6. 8. 6. 8. 6. 1. 1. 8.
 1. 6. 1. 7. 1. 4. 6. 3. 8. 1. 3. 1. 8. 8. 8. 7. 1. 1. 1. 8. 8. 8. 8. 8.
 1. 8. 8. 8. 8. 6. 8. 1. 8. 1. 8. 1. 6. 8. 1. 1. 6. 6. 4. 8. 8. 6. 1. 8.
 8. 1. 1. 1. 8. 1. 8. 6. 6. 8. 1. 8. 1. 8. 8. 6. 1. 6. 1. 1. 1. 4. 6. 8.
 1. 1. 8. 1. 4. 1. 1. 1. 1. 6. 1. 8. 1. 8. 8. 4. 4. 6. 6. 1. 8. 8. 1. 1.
 1. 6. 4. 8. 1. 6. 8. 8. 1. 6. 1. 8. 1. 8. 1. 4. 8. 6. 1. 1. 3. 8. 1. 6.
 1. 1. 8. 6. 8. 1. 8. 4. 7. 6. 6. 8. 6. 1. 4. 7. 1. 1. 8. 6. 8. 6. 8. 6.
 1. 8. 1. 1. 1. 4. 6. 1. 1. 8. 6. 1. 8. 1. 1. 8. 8. 8. 4. 3. 6. 6. 3. 1.
 4. 6. 8. 4. 6. 6. 8. 8. 8. 6. 1. 8. 6. 8. 8. 1. 1. 8. 2. 7. 8. 6. 8. 8.
 1. 8. 6. 6. 8. 1. 7. 1. 8. 1. 7. 7. 1. 1. 7. 8. 6. 7. 6. 8. 8. 7. 6. 8.
 1. 4. 8. 3. 1. 6. 1. 6. 8. 2. 6. 8. 1. 8. 6. 6. 4. 8. 8. 7. 6. 4. 8. 8.
 8. 4. 1. 8. 8. 7. 1. 1. 1. 8. 1. 7. 4. 4. 6. 6. 6. 8. 7. 8. 7. 6. 6. 8.
 1. 3. 5. 2. 1. 7