In [30]:
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay


In [31]:
# Load the COG term table and tag every listed row with value=1, then pivot it
# into a key x COG matrix: 1 where a key/COG pair is listed, 0 where it is not.
COGs = pd.read_csv('terms_COG.csv', index_col=0).assign(value=1)
X = COGs.pivot_table(values='value', index='key', columns='COG', fill_value=0)

In [32]:
# Load the trait table (semicolon-separated)
traits = pd.read_csv('reducedDataset.csv', sep=';')

def _most_common(values):
    """Return the most frequent value in ``values`` (first entry of value_counts)."""
    return values.value_counts().index[0]

# Rows without an oxygen value are dropped; rows sharing a key (some species may
# have strains with different traits) are collapsed to their most frequent oxygen label.
traits_with_oxygen = traits.dropna(subset=['oxygen'])
y = traits_with_oxygen.groupby('key').agg({'oxygen': _most_common})

In [33]:
# Keys present in both X and y; the two can differ because rows with an empty
# oxygen value were removed from traits.
common_keys = X.index.intersection(other=y.index)

In [34]:
# Restrict the feature matrix and the labels to the shared keys, in the same key order
X_aligned = X.loc[common_keys]
Y = y.loc[common_keys].to_numpy().ravel()
# Sanity check: exactly one label per feature row
assert len(X_aligned) == Y.shape[0], "X and Y are not aligned"

In [35]:
# Print both shapes so the row counts can be compared by eye
for _label, _obj in (("Shape of X:", X_aligned), ("Shape of Y:", Y)):
    print(_label, _obj.shape)

Shape of X: (3256, 9744)
Shape of Y: (3256,)


In [36]:
# SVC with default hyper-parameters for now; the gamma value can be checked later
clf = svm.SVC()
# Train on every row except the last 326 (roughly 90% of the data); the rest is for predicting.
# NOTE(review): the split is positional and unshuffled, and 326 is hard-coded here and in
# the prediction cell - keep the two in sync if the dataset changes.
clf.fit(X_aligned.iloc[:-326], Y[:-326])

In [37]:
# True labels and model predictions for the held-out last 326 rows
actual_traits = Y[-326:]
test_predictions = clf.predict(X_aligned.iloc[-326:])

In [38]:
# Fraction of held-out rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=actual_traits, y_pred=test_predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7576687116564417


In [39]:
# Confusion matrix of true labels versus predicted labels on the held-out rows
confusion_matrix(y_true=actual_traits, y_pred=test_predictions)


array([[192,   2,   0,   2,   1,   0,   0],
       [  6,  32,   0,   2,   4,   0,   0],
       [  1,   0,   0,   0,   0,   0,   0],
       [ 31,   0,   0,   6,   5,   0,   0],
       [  2,   2,   0,   5,  17,   0,   0],
       [ 10,   0,   0,   0,   0,   0,   0],
       [  0,   6,   0,   0,   0,   0,   0]], dtype=int64)

In [40]:
# Per-class precision, recall and F1 on the held-out rows.
# Some classes are never predicted, so their precision is undefined. zero_division=0
# reports those scores as 0.0 explicitly; the default gives the same 0.0 but emits an
# UndefinedMetricWarning for each affected metric.
class_report = classification_report(actual_traits, test_predictions, zero_division=0)
print("Classification Report:\n", class_report)

Classification Report:
                     precision    recall  f1-score   support

           aerobic       0.79      0.97      0.87       197
         anaerobic       0.76      0.73      0.74        44
          conflict       0.00      0.00      0.00         1
       facultative       0.40      0.14      0.21        42
   microaerophilic       0.63      0.65      0.64        26
  obligate aerobic       0.00      0.00      0.00        10
obligate anaerobic       0.00      0.00      0.00         6

          accuracy                           0.76       326
         macro avg       0.37      0.36      0.35       326
      weighted avg       0.68      0.76      0.71       326



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
