In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import re
from sklearn.svm import SVC
import pickle
import joblib
from preprocess import pfeature_process

In [2]:
# Input locations, relative to the directory the notebook is run from.
# model_path  -> serialized model that is loaded with joblib in the next cell.
# cd_hit_path -> text file handed to pfeature_process to build the feature table.
model_path = "../models/svc_model.pkl"
cd_hit_path = "../data/processed/generated_seqs_cd_hit.txt"

In [3]:
# Deserialize the trained model from disk.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary
# code — only load model files that come from a trusted source.
model = joblib.load(filename=model_path)

In [4]:
# Display the feature names stored on the loaded model so they can be compared
# with the columns of the feature table built in the next cell.
model.feature_names_in_

array(['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9',
       'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17',
       'PC18'], dtype=object)

In [5]:
# Build the feature table for the generated sequences with the project helper
# `pfeature_process` (imported from preprocess; its internals are not visible here).
# NOTE(review): the saved output shows one row per sequence with columns PC1..PC18,
# the same names as model.feature_names_in_ — confirm the helper applies the same
# transformation that was used when the model was trained.
test_seqs = pfeature_process(cd_hit_path)
test_seqs


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18
LDLDDWYTVDRDAMSM,1.837949,4.840107,-1.346732,-2.656623,-7.338778,-2.767022,-2.165862,1.325178,2.312721,-0.523267,3.584726,-0.935799,-3.049087,0.430496,1.876711,-2.622317,-0.531721,1.522512
KEAKEGATEWCPIVIN,-0.568820,-0.202947,-5.232013,3.124897,-1.335624,0.048294,2.267393,-0.566828,-1.088184,1.337355,-0.223229,0.117899,1.453551,1.707601,0.713888,1.843985,0.381098,0.461066
IYMYQNPQADYQKTVV,-3.017279,-0.494642,-0.121134,-1.773381,3.953475,-0.026841,2.132448,0.567821,1.115801,0.202115,1.469380,2.944839,2.261731,-0.124776,-0.615567,0.557680,-1.846161,-0.128353
YYIENVMHVAMPMYYK,-9.005871,-3.115693,6.102775,-2.664138,1.976713,-2.158312,-2.036847,-1.237674,0.749734,0.945519,0.728870,-0.265956,-0.653033,1.762363,-0.942135,-0.639599,1.217786,1.182850
DPAMEFDNAEIIDDDD,8.979129,3.274629,-8.581540,-4.372976,-6.572749,-2.914252,-2.991454,-1.163224,1.694454,0.896028,-1.788821,0.308110,-0.014367,0.052919,-0.402434,-0.671586,-2.284984,1.034657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YVYMMYYMYMVRMCHD,-12.379661,-5.053110,12.332297,-5.327318,-1.421743,-3.704994,-2.403009,-1.897881,2.784063,-2.361427,0.045013,1.521183,-0.673484,1.190574,-2.282150,1.662655,-0.135265,-0.694698
DNKHYYDYDTKFNYVV,0.790335,-0.722205,5.335459,-7.882551,-2.099492,1.918258,2.882965,1.351603,-0.544123,0.956719,-1.562085,1.896733,0.519393,-1.007120,-0.163305,-0.746073,-1.001879,-0.329537
WEHEQQHDNQDDGKDN,15.646699,-1.677398,-1.730841,-3.529699,0.123181,-0.802544,-1.731994,-0.947546,-2.755935,-2.185756,1.237285,0.234506,1.676907,-0.482760,0.675301,-0.101538,-1.913603,-0.276086
YYCIMNKMTDKHFFAA,-5.927433,-0.592338,7.104879,-0.959287,-3.463211,-0.251893,-0.099849,-1.084611,0.107002,-2.813442,0.429898,0.811975,1.132685,1.122255,0.237902,0.597569,-1.754633,-0.767523


In [6]:
# Predict a label for every row of test_seqs with the loaded model; the saved
# output shows an array of 0/1 values.
# NOTE(review): this name is rebound to a DataFrame in the following cell; a
# distinct name for the raw array would make the cells safer to re-run.
predict_test = model.predict(test_seqs)
predict_test

array([1, 1, 1, ..., 0, 1, 0])

In [7]:
# Wrap the predictions in a DataFrame. No column names are supplied, so the single
# column is labelled 0; a later cell renames it to 'Label'.
predict_test = pd.DataFrame(data=predict_test)

In [10]:
# Attach the sequence identifiers (the index of test_seqs) to the predictions and
# expose them as an ordinary column.
#
# Only the last column — the predicted label — is carried over, and the result is
# built without in-place mutation, so this cell can be re-run safely. An in-place
# reset_index() here fails on a second run with "cannot insert index, already
# exists", because the frame already holds an 'index' column by then.
#
# assumes test_seqs.index is unnamed, so reset_index() names the new column 'index'
# (the rename in the next cell relies on that) — consistent with the saved output.
label_column = predict_test.iloc[:, [-1]]
predict_test = label_column.set_axis(test_seqs.index, axis=0).reset_index()
predict_test.shape

(1365, 2)

In [11]:
# Give the two columns their final names: the sequence column produced by
# reset_index() ('index') and the prediction column (0).
# NOTE: 'Secuence' is the spelling that ends up in the exported CSV header; it is
# kept unchanged so that anything reading that file is unaffected.
predict_test = predict_test.rename(
    columns={'index': 'Secuence', 0: 'Label'},
)



In [12]:
# Inspect the labelled predictions: one row per sequence, with the columns
# 'Secuence' and 'Label' set by the rename in the previous cell.
predict_test

Unnamed: 0,Secuence,Label
0,LDLDDWYTVDRDAMSM,1
1,KEAKEGATEWCPIVIN,1
2,IYMYQNPQADYQKTVV,1
3,YYIENVMHVAMPMYYK,1
4,DPAMEFDNAEIIDDDD,0
...,...,...
1360,YVYMMYYMYMVRMCHD,1
1361,DNKHYYDYDTKFNYVV,1
1362,WEHEQQHDNQDDGKDN,0
1363,YYCIMNKMTDKHFFAA,1


In [13]:
# Keep only the sequences whose predicted label is 1; the original row labels are
# preserved, which is why the displayed index has gaps.
predict_positive = predict_test.loc[predict_test['Label'].eq(1)]
predict_positive

Unnamed: 0,Secuence,Label
0,LDLDDWYTVDRDAMSM,1
1,KEAKEGATEWCPIVIN,1
2,IYMYQNPQADYQKTVV,1
3,YYIENVMHVAMPMYYK,1
9,LERGNRVEKRWCCCSR,1
...,...,...
1357,EFEFEYFKKDYMYNRI,1
1359,LDEKWHRDAKCYNIKN,1
1360,YVYMMYYMYMVRMCHD,1
1361,DNKHYYDYDTKFNYVV,1


In [14]:
# Persist the positive predictions. index=False leaves the integer row labels out,
# so the file contains only the 'Secuence' and 'Label' columns.
predict_positive.to_csv(
    path_or_buf='../data/processed/predicted_positive.csv',
    index=False,
)