In [None]:
# ************************************************
# Author: Gordon Dou
# Date: May 9th, 2022
# Description: This notebook trains a Support Vector Machine (SVM) on the Kaggle dataset and predicts the
# labels for our dataset. It saves the prediction results as a CSV file named "SVM_predicted_label.csv" in
# the output folder.
#
# side note: it took us about 20 minutes to finish running.
# ************************************************

from pathlib import Path

import pandas as pd
from sklearn.svm import SVC

In [None]:
# Choose a linear-kernel SVM as the classifier used to predict our dataset
clf = SVC(
    C=0.025,
    kernel="linear",
)

In [None]:
# Load the dataset to train on and pull out its "label" column as a plain list
df = pd.read_csv("data/embedded_whats_cooking/embedded_train.csv")
y_labels = df["label"].tolist()

# Create a dictionary that maps each distinct true label to an integer code
# (0 .. number_of_labels - 1).
# The distinct labels are sorted first: iterating a bare set of strings yields a
# different order on every interpreter start, which would make the encoding (and
# everything derived from it) change between kernel restarts.
# Assumes the labels are mutually comparable (e.g. all strings).
label_dict = {label: code for code, label in enumerate(sorted(set(y_labels)))}
print("="*20)
print(label_dict)

In [None]:
# Translate every training label into its integer code via label_dict
encoded_labels = [label_dict[name] for name in y_labels]
print(len(encoded_labels))
print(encoded_labels[:20])

In [None]:
# Clean and reformat the dataframe: attach the integer targets as column "y",
# discard the 'label' and 'id' columns, then rotate the last column ("y") to
# the front so that it becomes column 0.
df["y"] = encoded_labels
df = df.drop(columns=['label', 'id'])
column_order = list(df.columns)
column_order = [column_order[-1], *column_order[:-1]]
df = df[column_order]
print(df)

In [None]:
# Convert the frame to a plain array: column 0 holds the encoded target,
# every remaining column is a feature.
data = df.to_numpy()
y, X = data[:, 0], data[:, 1:]

In [None]:
# Fit the classifier on the training features X and the encoded targets y
clf.fit(X=X, y=y)

In [None]:
# Read the testing dataset.
# If you want to load the sample corpus, uncomment the following line
# x_test = pd.read_csv("sample_data/embedded_recipe.csv")
# and comment out the following line:
x_test = pd.read_csv("data/embedded_recipe.csv")
ids = x_test["id"] # the "id" column of the testing dataset, one entry per row
x_test = x_test.drop(["id"], axis=1)

# The classifier was fitted on a plain NumPy array, so pass a plain array here
# too; handing it a DataFrame makes scikit-learn warn that X has feature names
# the model was fitted without.
# NOTE(review): this relies on the remaining test columns being in the same
# order as the training feature columns — confirm against the embedding step.
y_pred = clf.predict(x_test.to_numpy()) # the predicted integer code for each row of the testing dataset
print(len(y_pred))

In [None]:
# Invert the label dictionary so that each integer code maps back to its true label
reverse_label_dict = {code: name for name, code in label_dict.items()}
print(reverse_label_dict)

In [None]:
# Translate each predicted integer code back into its corresponding true label
y_test_label = [reverse_label_dict[code] for code in y_pred]
print(y_test_label)

In [None]:
# Build a one-column table of the predicted labels, indexed by the test-set ids
recording = {"SVM Predicted label": y_test_label}
df_record = pd.DataFrame(recording, index = ids)

# to_csv does not create missing parent directories; make sure the output
# folder exists so the (long) run does not fail at the very last step.
output_path = Path("output/SVM_predicted_label.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_record.to_csv(output_path) # save the SVM predicted results as a CSV file