In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB, ComplementNB, BernoulliNB

In [2]:
# Combine all PDBs into a single dataframe
dfs = []
for filename in os.listdir('data/features_ring'):
    dfs.append(pd.read_csv('data/features_ring/' + filename, sep='\t'))
df = pd.concat(dfs)

# Create a new class 'Missing' for unclassified contacts
df['Interaction'].fillna('Missing', inplace=True)
df

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_down,t_phi,t_psi,t_ss3,t_a1,t_a2,t_a3,t_a4,t_a5,Interaction
0,2osx,A,129,,K,E,0.234,18.0,23.0,-1.824,...,14.0,-1.684,-0.314,H,-0.591,-1.302,-0.733,1.570,-0.146,Missing
1,2osx,A,196,,N,T,0.318,13.0,16.0,1.056,...,8.0,-1.623,-0.216,H,1.831,-0.561,0.533,-0.277,1.648,Missing
2,2osx,A,416,,T,E,0.296,9.0,27.0,-1.413,...,21.0,-1.505,2.051,H,1.357,-1.453,1.477,0.113,-0.837,HBOND
3,2osx,A,416,,T,E,0.296,9.0,27.0,-1.413,...,21.0,-1.505,2.051,H,1.357,-1.453,1.477,0.113,-0.837,VDW
4,2osx,A,476,,P,-,0.463,9.0,17.0,-1.171,...,11.0,-2.002,2.294,H,-1.337,-0.279,-0.544,1.242,-1.262,Missing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532,2oln,A,208,,T,H,0.000,22.0,17.0,-0.918,...,12.0,-2.216,2.836,H,0.260,0.830,3.097,-0.838,1.512,HBOND
533,2oln,A,60,,D,H,0.196,9.0,15.0,-1.108,...,11.0,-1.263,-0.413,H,1.538,-0.055,1.502,0.440,2.897,IONIC
534,2oln,A,126,,I,H,0.000,25.0,11.0,-1.099,...,13.0,-1.976,-0.173,H,-1.006,-0.590,1.891,-0.397,0.412,HBOND
535,2oln,A,125,,D,H,0.221,12.0,12.0,-1.241,...,14.0,-1.115,-0.830,H,1.538,-0.055,1.502,0.440,2.897,HBOND


In [4]:
# Remove all rows with NaN in at least one column
df.dropna(inplace=True)

# Define ground truth values
y = df['Interaction'].astype('category')
y

0      Missing
1      Missing
2        HBOND
3          VDW
4      Missing
        ...   
532      HBOND
533      IONIC
534      HBOND
535      HBOND
536      HBOND
Name: Interaction, Length: 652574, dtype: category
Categories (7, object): ['HBOND', 'IONIC', 'Missing', 'PICATION', 'PIPISTACK', 'SSBOND', 'VDW']

In [5]:
# Define training features
X = df[['s_rsa', 's_up', 's_down', 's_phi', 's_psi', 's_a1', 's_a2', 's_a3', 's_a4', 's_a5', 
        't_rsa', 't_up', 't_down', 't_phi', 't_psi', 't_a1', 't_a2', 't_a3', 't_a4', 't_a5']]

# Calculate percentiles and tranform into categories
X = X.rank(pct=True).round(1).astype('category') 
X

Unnamed: 0,s_rsa,s_up,s_down,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,t_rsa,t_up,t_down,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5
0,0.7,0.6,0.8,0.3,0.8,1.0,0.3,0.5,0.3,0.9,0.3,0.6,0.3,0.3,0.4,0.4,0.1,0.3,1.0,0.5
1,0.8,0.3,0.4,1.0,0.6,0.8,0.8,0.6,0.4,0.7,0.9,0.2,0.1,0.3,0.5,1.0,0.3,0.5,0.3,0.9
2,0.7,0.2,1.0,0.4,0.7,0.5,0.7,0.9,0.7,0.8,0.8,0.1,0.8,0.4,0.6,0.9,0.1,0.7,0.4,0.4
3,0.7,0.2,1.0,0.4,0.7,0.5,0.7,0.9,0.7,0.8,0.8,0.1,0.8,0.4,0.6,0.9,0.1,0.7,0.4,0.4
4,0.9,0.2,0.5,0.6,0.9,0.6,1.0,0.2,0.5,0.2,0.5,0.7,0.2,0.2,0.7,0.0,0.5,0.4,0.8,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532,0.1,0.8,0.5,0.9,0.1,0.5,0.7,0.9,0.7,0.8,0.4,0.8,0.2,0.1,1.0,0.6,0.8,1.0,0.1,0.8
533,0.6,0.2,0.3,0.7,0.3,0.8,0.7,0.1,0.3,0.0,0.8,0.1,0.2,0.5,0.4,0.9,0.6,0.7,0.6,1.0
534,0.1,0.9,0.1,0.7,0.1,0.1,0.4,0.8,0.5,0.6,0.4,0.8,0.3,0.2,0.5,0.3,0.3,0.8,0.2,0.6
535,0.7,0.3,0.2,0.5,0.3,0.8,0.7,0.1,0.3,0.0,0.9,0.1,0.3,0.7,0.1,0.9,0.6,0.7,0.6,1.0


In [12]:
# Split the dataset to define training and testing examples
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [None]:
# Test different versions of the 

In [13]:
nb = GaussianNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 65258 points : 33437


In [14]:
nb = MultinomialNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 65258 points : 34276


In [15]:
nb = ComplementNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 65258 points : 31786


In [16]:
nb = BernoulliNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 65258 points : 34605


In [17]:
nb = CategoricalNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 65258 points : 34552
