In [11]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from scipy import signal

In [12]:
# Read the training table; the first CSV column is used as the row index.
train = '/kaggle/input/predict-dna-methylation/train.csv'
df = pd.read_csv(filepath_or_buffer=train, index_col=0)
# Preview the first five rows.
df.head(5)

In [13]:
# Number of rows per regulatory feature group.
df.loc[:, 'Regulatory_Feature_Group'].value_counts()

In [14]:
# Number of rows per CpG-island relation category.
df.loc[:, 'Relation_to_UCSC_CpG_Island'].value_counts()

In [15]:
# Half-width of the window examined around sequence position 1000.
m=500
window=slice(1000-m,1000+m)
# For each dinucleotide, store how many non-overlapping occurrences appear
# inside the window. Columns are created in tuple order: CG, TG, CA.
# Assumes every entry of `seq` is a string -- TODO confirm.
for dinuc in ('CG','TG','CA'):
    df[dinuc]=df.seq.apply(lambda s: s[window].count(dinuc))
# One-hot encode the regulatory feature group; the original column is replaced
# by one indicator column per category. Note this rebinds `df`.
df=pd.get_dummies(df,columns=['Regulatory_Feature_Group'])
df.head()

In [16]:
# Logistic-regression baseline on two features: the dinucleotide ratio
# `mutation` and the promoter-associated indicator.
# .copy() makes X an independent frame, so adding a column below no longer
# triggers pandas' SettingWithCopyWarning.
X=df[['CG','Beta','TG','CA','Regulatory_Feature_Group_Promoter_Associated']].copy()
# Ratio of (TG + CA) counts to twice the CG count.
# NOTE(review): a row with CG == 0 would give inf/NaN here -- confirm none exist.
X['mutation']=(X.TG+X.CA)/(2*X.CG)
y=X.Beta
X=X[['mutation','Regulatory_Feature_Group_Promoter_Associated']]
clf = LogisticRegression(random_state=0).fit(X, y)
y=y.values
# The model is scored on the same rows it was fitted on, so the numbers
# printed below are in-sample metrics.
prob=clf.predict_proba(X)
print('AUC: '+str(roc_auc_score(y,prob[:,1])))
pred=clf.predict(X)
print(classification_report(y, pred))

In [17]:
from sklearn.ensemble import RandomForestClassifier
# Random forest (depth-2 trees) on the same two features as the
# logistic-regression cell.
# .copy() makes X1 an independent frame, so adding a column below no longer
# triggers pandas' SettingWithCopyWarning.
X1=df[['CG','Beta','TG','CA','Regulatory_Feature_Group_Promoter_Associated']].copy()
# Ratio of (TG + CA) counts to twice the CG count.
# NOTE(review): a row with CG == 0 would give inf/NaN here -- confirm none exist.
X1['mutation']=(X1.TG+X1.CA)/(2*X1.CG)
y1=X1.Beta
X1=X1[['mutation','Regulatory_Feature_Group_Promoter_Associated']]
clf1 = RandomForestClassifier(max_depth=2, random_state=0).fit(X1, y1)
y1=y1.values
# Scored on the training rows themselves: in-sample metrics.
prob1=clf1.predict_proba(X1)
print('AUC: '+str(roc_auc_score(y1,prob1[:,1])))
pred1=clf1.predict(X1)
print(classification_report(y1, pred1))

In [19]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting (100 depth-1 trees, learning rate 1.0) on the same two
# features as the logistic-regression cell.
# .copy() makes X2 an independent frame, so adding a column below no longer
# triggers pandas' SettingWithCopyWarning.
X2=df[['CG','Beta','TG','CA','Regulatory_Feature_Group_Promoter_Associated']].copy()
# Ratio of (TG + CA) counts to twice the CG count.
# NOTE(review): a row with CG == 0 would give inf/NaN here -- confirm none exist.
X2['mutation']=(X2.TG+X2.CA)/(2*X2.CG)
y2=X2.Beta
X2=X2[['mutation','Regulatory_Feature_Group_Promoter_Associated']]
clf2 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(X2, y2)
y2=y2.values
# Scored on the training rows themselves: in-sample metrics.
prob2=clf2.predict_proba(X2)
# Use this model's own probabilities (prob2); the previous code passed `prob`
# from the logistic-regression cell and so reported that model's AUC.
print('AUC: '+str(roc_auc_score(y2,prob2[:,1])))
pred2=clf2.predict(X2)
print(classification_report(y2, pred2))

In [21]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbours classifier on the same two features as the
# logistic-regression cell.
# .copy() makes X3 an independent frame, so adding a column below no longer
# triggers pandas' SettingWithCopyWarning.
X3=df[['CG','Beta','TG','CA','Regulatory_Feature_Group_Promoter_Associated']].copy()
# Ratio of (TG + CA) counts to twice the CG count.
# NOTE(review): a row with CG == 0 would give inf/NaN here -- confirm none exist.
X3['mutation']=(X3.TG+X3.CA)/(2*X3.CG)
y3=X3.Beta
X3=X3[['mutation','Regulatory_Feature_Group_Promoter_Associated']]
clf3 = KNeighborsClassifier(n_neighbors=3).fit(X3, y3)
y3=y3.values
# Scored on the training rows themselves: in-sample metrics.
prob3=clf3.predict_proba(X3)
print('AUC: '+str(roc_auc_score(y3,prob3[:,1])))
pred3=clf3.predict(X3)
print(classification_report(y3, pred3))

In [22]:
import matplotlib.pyplot as plt
from sklearn import metrics

# metrics.plot_roc_curve was removed in scikit-learn 1.2. Prefer its
# replacement, RocCurveDisplay.from_estimator (available from 1.0), and fall
# back to the old helper on installs that predate it.
try:
    plot_roc = metrics.RocCurveDisplay.from_estimator
except AttributeError:
    plot_roc = metrics.plot_roc_curve

# One ROC figure per fitted model, each drawn from the features and labels
# that model was fitted on, in the same order as before.
for model, features, labels in (
    (clf, X, y),
    (clf1, X1, y1),
    (clf2, X2, y2),
    (clf3, X3, y3),
):
    plot_roc(model, features, labels)
    plt.show()

In [11]:
# Rebuild the model features on the test table and predict with `clf`, the
# logistic-regression model fitted earlier. This rebinds `df`, `X` and `pred`.
test = '/kaggle/input/predict-dna-methylation/test.csv'
df = pd.read_csv(filepath_or_buffer=test, index_col=0)
# Dinucleotide counts over sequence positions 500..1499, the same window the
# training cell uses with m=500. Columns are created in order CG, TG, CA.
for dinuc in ('CG', 'TG', 'CA'):
    df[dinuc] = df.seq.apply(lambda s: s[500:1500].count(dinuc))
# Ratio of (TG + CA) counts to twice the CG count.
df['mutation'] = (df['TG'] + df['CA']) / (2 * df['CG'])
# One-hot encode the regulatory feature group, then keep the two model inputs.
df = pd.get_dummies(df, columns=['Regulatory_Feature_Group'])
X = df.loc[:, ['mutation', 'Regulatory_Feature_Group_Promoter_Associated']]
pred = clf.predict(X)

In [12]:
# Store the predictions as the 'Beta' column, then write only that column
# (together with the row index) to solution.csv.
df['Beta'] = pred
df.loc[:, ['Beta']].to_csv(path_or_buf='solution.csv')
# Show the first row as a quick check.
df.head(n=1)