# Machine Learning Project: Classification 1
Calvin Wong<br>
Mohammed Hussain

In [1]:
# Useful libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report
from sklearn import metrics  
from sklearn import tree

# Data preprocessing

In [2]:
Test = pd.read_csv("TestData1.csv", header=None)

#Replace all 1.000000e+99 with nan
Test[Test > 1000] = np.nan

# Separate numerical columns
numerical_cols = Test.select_dtypes(include=['float64', 'int64']).columns

# Imput missing values for numerical columns using the mean
for col in numerical_cols:
    if Test[col].isnull().sum() > 0:
        mean_value = Test[col].mean()
        Test[col] = Test[col].fillna(mean_value)
        #print(f"Imputed missing values in '{col}' with median: {mean_value}")
Test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3302,3303,3304,3305,3306,3307,3308,3309,3310,3311
0,3.841106,1.732474,1.711807,2.415107,3.008562,3.068572,3.336568,3.475879,3.591656,3.705524,...,1.833975,1.874598,1.397636,2.136625,2.924662,2.018034,2.674641,1.888965,2.721975,1.87944
1,3.812389,2.126927,1.942529,2.591743,3.55261,3.408792,3.557571,3.60642,3.681815,3.661349,...,1.381205,1.43353,1.631495,2.222555,3.314882,1.729286,2.509639,2.238134,2.425284,1.815777
2,3.840183,1.897517,1.897517,2.412864,3.209561,3.181998,3.316767,3.520033,3.595931,3.613428,...,1.75074,1.778947,1.0,2.165185,2.732659,1.707996,2.599151,2.182814,2.406421,2.022923
3,3.893489,2.065841,1.959947,2.60622,3.401351,3.545868,3.690366,3.254935,3.429752,3.637449,...,1.78311,1.847511,2.262593,2.061716,3.370104,1.757775,2.52577,1.0,2.280373,1.610767
4,3.822854,1.644242,1.736715,2.404782,3.388622,3.304796,3.344968,3.480182,3.567112,3.603737,...,1.842047,1.754272,1.0,2.146903,3.12006,1.907196,2.70682,1.997954,2.348733,2.11284
5,3.821707,2.123525,2.149927,2.571441,3.2515,3.258747,3.412988,3.617221,3.646515,3.67769,...,1.827628,1.704151,1.0,2.195789,2.983198,2.31971,2.568178,1.0,2.858224,2.195789
6,3.851158,1.798616,1.788699,2.602689,3.402186,3.217792,3.334842,3.548319,3.666192,3.674317,...,1.03862,1.220762,1.0,2.205475,3.050658,1.636688,2.42282,1.950779,2.425284,2.081797
7,3.830171,1.0,1.328583,2.300921,3.338239,3.435537,3.584471,3.412608,3.554056,3.588417,...,2.168114,1.587037,1.0,1.794279,3.229003,2.054766,2.441192,1.1529,2.063709,2.179552
8,3.807966,2.064795,2.002123,2.674733,3.609705,3.596401,3.624641,3.517717,3.632171,3.64395,...,1.0,1.521792,1.0,2.355452,3.351129,1.539829,2.516985,1.0,2.6267,1.341237
9,3.943649,2.005481,1.988693,2.526404,2.873989,2.954146,3.119754,3.255569,3.468497,3.693295,...,1.77122,2.029465,1.640481,2.107651,2.733438,2.025552,2.518922,1.924279,2.599064,2.162535


In [3]:
Train = pd.read_csv("TrainData1.csv", header=None)
Label = pd.read_csv("TrainLabel1.csv", header=None)
df = pd.concat([Label,Train], axis=1, ignore_index=True)

#Replace all 1.000000e+99 with nan
df[df > 1000] = np.nan

# Total number of entries
total = df.size

# Count of NaN values
nan_count = df.isna().sum().sum()

# Count of Non-NaN values
non_nan_count = total - nan_count
print("Missing value percentage:",nan_count/total * 100,"%")

# Initialize the KNNImputer
imputer = KNNImputer(n_neighbors=5) 

# Fit and transform the data
imputed_data = imputer.fit_transform(df)

# Convert the result back to a DataFrame
df = pd.DataFrame(imputed_data, columns=df.columns)
df

Missing value percentage: 1.9993963175369758 %


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3303,3304,3305,3306,3307,3308,3309,3310,3311,3312
0,1.0,3.824254,1.923762,1.918450,2.352067,3.117298,3.051735,3.307977,3.430222,3.586667,...,1.836830,1.855640,1.142389,2.054345,2.808224,1.782186,2.665703,2.468214,2.478581,2.308842
1,1.0,3.904190,2.309524,2.152930,2.553395,3.532368,3.524866,3.677791,3.636671,3.696868,...,1.951532,1.442323,1.000000,2.127914,2.979658,1.961089,2.519027,2.054383,2.689903,2.090928
2,2.0,3.750908,1.161068,1.017033,2.347993,3.381889,3.393096,3.509134,3.512466,3.622203,...,1.000000,1.584105,1.000000,1.945321,3.257004,1.965061,2.536066,1.449324,2.605230,1.368659
3,1.0,3.809383,1.912355,1.856940,2.498944,3.289406,3.371232,3.541995,3.491048,3.473179,...,1.869965,1.481658,1.000000,2.155032,3.270371,1.928473,2.618074,2.154013,2.530046,2.185514
4,1.0,3.893561,2.094192,1.881271,2.785707,3.344339,3.274417,3.485872,3.516527,3.642358,...,1.480725,1.510545,1.000000,2.094192,3.246666,1.824516,2.562317,1.942256,2.598517,1.764624
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,1.0,3.901178,1.672421,1.869554,2.234758,3.174569,3.469689,3.565574,3.653364,3.720899,...,2.140555,1.960423,1.240300,2.149958,3.042383,1.799134,2.772116,2.460484,2.413685,2.018971
146,1.0,3.865257,1.796990,1.643650,2.307902,3.411704,3.410870,3.527868,3.488609,3.580376,...,2.183981,2.519171,1.000000,2.174641,2.902840,1.844850,2.647755,1.000000,2.219480,1.992730
147,1.0,3.860198,2.097778,1.969556,2.438788,3.365076,3.391729,3.461051,3.408433,3.531799,...,2.118074,2.478032,1.358316,2.125741,3.041140,1.779308,2.585771,1.000000,2.282543,2.055703
148,4.0,3.907102,1.000000,1.473633,2.158604,3.459391,3.478222,3.621380,3.480155,3.595473,...,2.232310,1.380573,1.000000,2.105578,2.902655,2.156428,2.272538,1.000000,1.968483,2.318502


# SVM Model

In [4]:
# Split into features (X) and target variable (y)
X = df.drop(df.columns[0], axis=1)
y = df[df.columns[0]]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create an SVM model with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_scaled)

# Evaluate the model's accuracy
f1 = f1_score(y_test, y_pred,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred,average='weighted', zero_division= np.nan)
precision = precision_score(y_test, y_pred,average='weighted', zero_division= np.nan)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 0.9666666666666667
F1 Score: 0.9637037037037037
Recall: 0.9666666666666667
Precision: 0.9681159420289855


# KNN Model

In [5]:
# Split into features (X) and target variable (y)
X = df.drop(df.columns[0], axis=1)
y = df[df.columns[0]]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Initialize the KNN classifier with k=5
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_test_scaled)

# Evaluate the model's accuracy
f1 = f1_score(y_test, y_pred,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred,average='weighted')
precision = precision_score(y_test, y_pred,average='weighted', zero_division= np.nan)
print("Scores for KNN 1st data set")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)


Scores for KNN 1st data set
Accuracy: 0.9
F1 Score: 0.8798581560283687
Recall: 0.9
Precision: 0.9089655172413793


# Predict labels for testing data

In [6]:
# Split into features (X) and target variable (y)
X = df.drop(df.columns[0], axis=1)
y = df[df.columns[0]]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Test_scaled = scaler.transform(Test)

# Create an SVM model with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(Test_scaled)


print("\nPrediction labels for test set 1")
print(y_pred)

#print("Actual labels for training set")
#y_test_array = y_test.to_numpy()
#print(y_test_array)


Prediction labels for test set 1
[2. 1. 1. 1. 1. 2. 1. 1. 3. 1. 3. 1. 1. 3. 5. 1. 1. 1. 1. 1. 1. 4. 3. 3.
 4. 1. 5. 4. 1. 3. 1. 1. 4. 1. 3. 1. 1. 4. 3. 5. 1. 1. 4. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1.]
