In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt

drive.mount('/content/drive')

training_path1 = '/content/drive/My Drive/FML/Assignment Data/training1.csv'
training_path2 = '/content/drive/My Drive/FML/Assignment Data/training2.csv'
testing_path = '/content/drive/My Drive/FML/Assignment Data/test.csv'

Mounted at /content/drive


In [3]:
# loading the data needed for the project
train_data1 = pd.read_csv(training_path1)
train_data2 = pd.read_csv(training_path2)
test_data = pd.read_csv(testing_path)

# concatenating the training data to improve the training process
train_data = pd.concat([train_data1, train_data2], axis=0)

# separate the data features, variables and confidence values
trainX = train_data.iloc[:, :-2]
trainY = train_data.iloc[:, -2]
confidence_values = train_data.iloc[:, -1]
# turned the confidence values into binary data
contidence_values = (confidence_values - 0.66) / 0.34

# split the training data into 80% / 20% validation data for evaluation after training
X_train, X_val, Y_train,  Y_val, confidence_train, confidence_val = train_test_split(trainX, trainY, confidence_values, test_size=0.2, random_state=31)

# missing data was replaced with mean value of the training data using imputing
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
test_data = imputer.transform(test_data)

# normalisation was done using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
test_data = scaler.transform(test_data)

# random foreest classifier was used to pick the best and valuable data that is going to be used for feature selection
clf = RandomForestClassifier(n_estimators=150, random_state=31)
clf.fit(X_train, Y_train, confidence_train)

# feature selection was done to all of the data using SelectFromModel
sfm = SelectFromModel(clf, threshold='median')
X_train_selected = sfm.fit_transform(X_train, Y_train)
X_val_selected = sfm.transform(X_val)
test_data_selected = sfm.transform(test_data)

# SVC was used as the main classifier to train the final data
clf2 = SVC(kernel='rbf', C = 0.16)
clf2.fit(X_train_selected, Y_train, confidence_train)

# predictions were made on data using the trained data 
val_predictions= clf2.predict(X_val_selected)
training_predictions= clf2.predict(X_train_selected)
test_predictions= clf2.predict(test_data_selected)
#these predictions were then evaluated on validation data
val_accuracy = accuracy_score(Y_val, val_predictions)*100
training_accuracy = accuracy_score(Y_train, training_predictions)*100
print("Validation Accuracy:", val_accuracy)
print("Training Accuracy:", training_accuracy)

print(test_predictions.shape)

predictions = pd.DataFrame({'prediction': test_predictions})
predictions.to_csv('/content/drive/My Drive/FML/Assignment Data/predictions.csv', index=False)


Validation Accuracy: 74.83333333333333
Training Accuracy: 77.16666666666666
(1000,)
