In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import spearmanr
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV

In [2]:
df_train = pd.read_csv("data/train.csv")
df_updated = pd.read_csv("data/train_updates_20220929.csv")
df_test =  pd.read_csv("data/test.csv")

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31390 entries, 0 to 31389
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   seq_id            31390 non-null  int64  
 1   protein_sequence  31390 non-null  object 
 2   pH                31104 non-null  float64
 3   data_source       28043 non-null  object 
 4   tm                31390 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.2+ MB


In [4]:
df_updated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2434 entries, 0 to 2433
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   seq_id            2434 non-null   int64  
 1   protein_sequence  25 non-null     object 
 2   pH                25 non-null     float64
 3   data_source       0 non-null      float64
 4   tm                25 non-null     float64
dtypes: float64(3), int64(1), object(1)
memory usage: 95.2+ KB


In [5]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2413 entries, 0 to 2412
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   seq_id            2413 non-null   int64 
 1   protein_sequence  2413 non-null   object
 2   pH                2413 non-null   int64 
 3   data_source       2413 non-null   object
dtypes: int64(2), object(2)
memory usage: 75.5+ KB


In [6]:
for seq_id in df_updated.seq_id:
    df_train = df_train.drop(index = seq_id)

In [7]:
df_train.drop(columns=["seq_id","data_source"],inplace=True)

In [8]:
df_train.describe().T.style.background_gradient("YlOrRd")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pH,28670.0,6.872918,0.79255,1.99,7.0,7.0,7.0,11.0
tm,28956.0,51.360399,12.060858,25.1,43.6,48.8,54.6,130.0


In [9]:
df_train["pH"].describe()

count    28670.000000
mean         6.872918
std          0.792550
min          1.990000
25%          7.000000
50%          7.000000
75%          7.000000
max         11.000000
Name: pH, dtype: float64

In [10]:
df_train["tm"].value_counts()

50.0    200
45.0    178
49.7    176
49.0    175
48.4    174
       ... 
92.2      1
26.2      1
98.1      1
26.1      1
93.3      1
Name: tm, Length: 725, dtype: int64

In [11]:
arr = []
for i in range (len(df_train)):
    arr.append(len(df_train["protein_sequence"].iloc[i]))
df_train["length"] = arr
arr

[341,
 286,
 497,
 265,
 1451,
 380,
 380,
 301,
 287,
 163,
 217,
 265,
 55,
 1643,
 81,
 228,
 114,
 380,
 904,
 284,
 203,
 645,
 213,
 192,
 341,
 501,
 400,
 206,
 313,
 109,
 329,
 354,
 324,
 278,
 506,
 150,
 448,
 155,
 477,
 352,
 449,
 448,
 345,
 346,
 330,
 455,
 448,
 286,
 446,
 530,
 169,
 88,
 210,
 352,
 499,
 461,
 1417,
 530,
 1539,
 676,
 341,
 247,
 203,
 792,
 299,
 56,
 415,
 225,
 631,
 384,
 495,
 436,
 194,
 566,
 189,
 219,
 518,
 480,
 636,
 332,
 679,
 621,
 530,
 686,
 406,
 181,
 1515,
 194,
 308,
 306,
 516,
 514,
 348,
 291,
 168,
 867,
 761,
 255,
 389,
 301,
 402,
 469,
 322,
 247,
 1047,
 1041,
 1019,
 1006,
 340,
 722,
 789,
 329,
 462,
 90,
 57,
 942,
 339,
 319,
 487,
 300,
 223,
 327,
 411,
 534,
 271,
 547,
 555,
 248,
 369,
 358,
 740,
 109,
 343,
 186,
 129,
 372,
 261,
 313,
 227,
 442,
 210,
 297,
 152,
 141,
 335,
 223,
 415,
 447,
 399,
 243,
 514,
 345,
 178,
 510,
 137,
 588,
 393,
 481,
 630,
 280,
 100,
 261,
 472,
 233,
 666,
 110,
 

In [12]:
new_arr = []
for i in range (len(df_test)):
    new_arr.append(len(df_test["protein_sequence"].iloc[i]))
df_test["length"] = new_arr

In [13]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28956 entries, 0 to 31389
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   protein_sequence  28956 non-null  object 
 1   pH                28670 non-null  float64
 2   tm                28956 non-null  float64
 3   length            28956 non-null  int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 1.1+ MB


In [14]:
df_train.head()

Unnamed: 0,protein_sequence,pH,tm,length
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7,341
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5,286
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5,497
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2,265
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,49.5,1451


In [15]:
amino_acids= ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
for letter in amino_acids:
    df_train[letter] = df_train.protein_sequence.str.count(letter)
    df_test[letter] = df_test.protein_sequence.str.count(letter)

In [16]:
df_train.head()

Unnamed: 0,protein_sequence,pH,tm,length,A,C,D,E,F,G,...,M,N,P,Q,R,S,T,V,W,Y
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7,341,45,1,13,30,13,38,...,8,5,18,6,25,11,14,37,4,3
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5,286,28,0,10,52,6,18,...,2,6,8,22,30,14,12,13,3,3
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5,497,50,9,27,32,21,65,...,6,15,20,25,31,33,30,30,3,16
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2,265,20,5,19,29,12,16,...,2,9,16,9,10,16,19,14,3,4
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,49.5,1451,86,14,78,78,32,84,...,31,65,128,54,63,148,120,124,16,47


In [18]:
df_train = df_train[df_train["tm"]>51.36]
df_train.head()

Unnamed: 0,protein_sequence,pH,tm,length,A,C,D,E,F,G,...,M,N,P,Q,R,S,T,V,W,Y
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7,341,45,1,13,30,13,38,...,8,5,18,6,25,11,14,37,4,3
7,AAENRPPIPGSLGMLYDSTLCVGCQACVTKCQDINFPERNPQGEQT...,7.0,55.9,301,15,16,18,17,3,28,...,6,18,21,11,10,11,18,23,2,17
11,AAIDPNRIVALEWLPVELLLALGIVPYGVADTINYRLWVSEPPLPD...,7.0,60.6,265,27,1,17,13,13,17,...,11,10,20,7,16,14,11,18,7,7
12,AAIGIGILGGKFLEGAARQPDLIPLLRTQFFIVMGLVDAIPMIAVG...,7.0,62.8,55,7,0,2,1,4,8,...,3,0,3,2,2,0,1,5,0,1
13,AAKGDCGGSGCGKCDCHGVKGQKGERGLPGLQGVIGFPGMQGPEGP...,7.0,62.0,1643,51,20,57,68,47,477,...,31,19,321,79,44,75,47,54,5,14


In [18]:
df_train.shape

(10637, 24)

In [19]:
# fitting our model
X = df_train.drop(columns=["protein_sequence","tm"])
y = df_train["tm"]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [21]:
model = XGBRegressor().fit(X_train,y_train)
prediction = model.predict(X_test)
corr, p = spearmanr(y_test,prediction)
print("Spearman Correlation: ",corr)
X_test.shape

Spearman Correlation:  0.7289528964067167


(2128, 22)

In [22]:
model = XGBRegressor()
params = {"n_estimators":[100,200,300],"max_depth":[3,5,7],"learning_rate":[0.1,0.05,0.001]}
tuned_model = GridSearchCV(model,param_grid=params,cv=5,verbose=2,n_jobs=-1).fit(X_train,y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [23]:
# The best params are
tuned_model.best_params_

{'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 300}

In [24]:
final_model = XGBRegressor(n_estimators=300,learning_rate=0.05,max_depth=7).fit(X_train,y_train)
final_prediction = final_model.predict(X_test)
corr, p = spearmanr(y_test,final_prediction)
print("Spearman Correlation with Tuned Parameters: ",corr)
# as we can see, after tuning process we got 0.02 difference 

Spearman Correlation with Tuned Parameters:  0.749974796167916


In [25]:
submission =pd.DataFrame()

In [26]:
submission['seq_id']=df_test['seq_id']

In [27]:
X_test.head()

Unnamed: 0,pH,length,A,C,D,E,F,G,H,I,...,M,N,P,Q,R,S,T,V,W,Y
12198,7.0,587,26,8,40,39,18,38,11,57,...,15,32,23,13,32,47,22,38,1,28
12167,7.0,154,13,1,10,14,7,15,5,8,...,3,2,8,3,13,6,4,11,2,8
15740,3.0,155,15,3,7,12,2,14,4,7,...,4,7,5,8,10,4,10,9,6,5
31170,7.0,149,9,3,6,18,9,10,3,12,...,4,10,6,3,5,4,4,8,4,13
12009,7.0,198,18,0,9,22,4,6,1,7,...,6,4,9,9,16,17,2,8,4,9


In [28]:
df_test.head()

Unnamed: 0,seq_id,protein_sequence,pH,data_source,length,A,C,D,E,F,...,M,N,P,Q,R,S,T,V,W,Y
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,221,22,4,15,8,10,...,0,19,17,13,3,18,8,13,6,6
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,221,22,4,15,7,10,...,0,19,17,13,3,18,8,13,6,6
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes,220,22,4,15,7,10,...,0,19,17,13,3,18,8,13,6,6
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,221,22,5,15,7,10,...,0,19,17,13,3,18,8,13,6,6
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,221,22,4,15,7,11,...,0,19,17,13,3,18,8,13,6,6


In [29]:
df_test.drop(columns=["seq_id","protein_sequence","data_source"])

Unnamed: 0,pH,length,A,C,D,E,F,G,H,I,...,M,N,P,Q,R,S,T,V,W,Y
0,8,221,22,4,15,8,10,19,0,6,...,0,19,17,13,3,18,8,13,6,6
1,8,221,22,4,15,7,10,19,0,6,...,0,19,17,13,3,18,8,13,6,6
2,8,220,22,4,15,7,10,19,0,6,...,0,19,17,13,3,18,8,13,6,6
3,8,221,22,5,15,7,10,19,0,6,...,0,19,17,13,3,18,8,13,6,6
4,8,221,22,4,15,7,11,19,0,6,...,0,19,17,13,3,18,8,13,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,8,221,21,4,15,7,10,19,0,7,...,0,19,17,13,3,18,8,13,6,6
2409,8,221,21,4,15,7,10,19,0,6,...,0,19,17,13,3,18,8,13,6,6
2410,8,221,21,4,15,7,10,19,0,6,...,0,20,17,13,3,18,8,13,6,6
2411,8,221,21,4,15,7,10,19,0,6,...,0,19,18,13,3,18,8,13,6,6


In [30]:
submission['tm'] =final_model.predict(df_test.drop(columns=["seq_id","protein_sequence","data_source"]))

In [31]:
submission.to_csv('submission.csv',index=False)
submission

Unnamed: 0,seq_id,tm
0,31390,62.691925
1,31391,62.691925
2,31392,62.691925
3,31393,62.720409
4,31394,63.090912
...,...,...
2408,33798,62.775951
2409,33799,60.946922
2410,33800,62.775951
2411,33801,62.827885


[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   2.2s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=200; total time=   3.5s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=300; total time=   5.3s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=300; total time=   5.5s
[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=200; total time=   5.8s
[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=200; total time=   5.7s
[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=300; total time=   8.4s
[CV] END ...learning_rate=0.1, max_depth=7, n_estimators=100; total time=   4.0s
[CV] END ...learning_rate=0.1, max_depth=7, n_estimators=200; total time=   8.1s
[CV] END ...learning_rate=0.1, max_depth=7, n_estimators=300; total time=  12.1s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=100; total time=   1.7s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=100; total time=   1.7s
[CV] END ..learning_rate=0.0