In [215]:
import csv
import xgboost

from IPython.display import display, Markdown
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

with open('./splice.data') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    df_li = []
    for row in csv_reader:
        df_li.append(row)
    df = pd.DataFrame(df_li, columns=["target", "instance_name", "sequence"])
df.head()

Unnamed: 0,target,instance_name,sequence
0,EI,ATRINS-DONOR-521,CCAGCTGCATCACAGGAGGCCAGCGAGCAGG...
1,EI,ATRINS-DONOR-905,AGACCCGCCGGGAGGCGGAGGACCTGCAGGG...
2,EI,BABAPOE-DONOR-30,GAGGTGAAGGACGTCCTTCCCCAGGAGCCGG...
3,EI,BABAPOE-DONOR-867,GGGCTGCGTTGCTGGTCACATTCCTGGCAGGT...
4,EI,BABAPOE-DONOR-2817,GCTCAGCCCCCAGGTCACCCAGGAACTGACGTG...


In [216]:
df.shape

(3190, 3)

In [217]:
df.target.value_counts()

N     1655
IE     768
EI     767
Name: target, dtype: int64

In [246]:
ze = np.zeros(10)
for i in list(range(10)):
    ze[i] = i

In [248]:
ze.mean()

4.5

In [218]:
for col in df.columns:
    df[col] = df[col].apply(lambda a : a.strip())

In [219]:
len(df["sequence"].unique())

3005

In [220]:
tmp = df.sequence.value_counts().keys()[0]
tmp

'GGCCTCTCCTTCTCTTCCTTCACTTTGCAGAGGCTGGAAGATGGCAGCCCCCGGACTGGG'

In [221]:
df[df.sequence == tmp]

Unnamed: 0,target,instance_name,sequence
1096,IE,HUMGH-ACCEPTOR-1572,GGCCTCTCCTTCTCTTCCTTCACTTTGCAGAGGCTGGAAGATGGCA...
1100,IE,HUMGHCSA-ACCEPTOR-6465,GGCCTCTCCTTCTCTTCCTTCACTTTGCAGAGGCTGGAAGATGGCA...
1112,IE,HUMGHCSA-ACCEPTOR-43393,GGCCTCTCCTTCTCTTCCTTCACTTTGCAGAGGCTGGAAGATGGCA...
1120,IE,HUMGHN-ACCEPTOR-1797,GGCCTCTCCTTCTCTTCCTTCACTTTGCAGAGGCTGGAAGATGGCA...
1121,IE,HUMGHVA-ACCEPTOR-741,GGCCTCTCCTTCTCTTCCTTCACTTTGCAGAGGCTGGAAGATGGCA...


In [222]:
len(df["instance_name"].unique())

3178

In [223]:
df["seq_len"] = df.sequence.apply(lambda seq: len(seq))
# for i in list(range(3)):
#     df["name_part_" + str(i)] = df.instance_name.apply(lambda name: name.split("-")[i])

In [224]:
df.head()

Unnamed: 0,target,instance_name,sequence,seq_len
0,EI,ATRINS-DONOR-521,CCAGCTGCATCACAGGAGGCCAGCGAGCAGGTCTGTTCCAAGGGCC...,60
1,EI,ATRINS-DONOR-905,AGACCCGCCGGGAGGCGGAGGACCTGCAGGGTGAGCCCCACCGCCC...,60
2,EI,BABAPOE-DONOR-30,GAGGTGAAGGACGTCCTTCCCCAGGAGCCGGTGAGAAGCGCAGTCG...,60
3,EI,BABAPOE-DONOR-867,GGGCTGCGTTGCTGGTCACATTCCTGGCAGGTATGGGGCGGGGCTT...,60
4,EI,BABAPOE-DONOR-2817,GCTCAGCCCCCAGGTCACCCAGGAACTGACGTGAGTGTCCCCATCC...,60


In [225]:
df.sequence.apply(lambda seq: list(set(seq)))[0]

['G', 'C', 'T', 'A']

In [226]:
df["seq_unique_letters"] = df.sequence.apply(lambda seq: "".join(sorted(list(set(seq)))))

In [227]:
df.sequence[0]

'CCAGCTGCATCACAGGAGGCCAGCGAGCAGGTCTGTTCCAAGGGCCTTCGAGCCAGTCTG'

In [228]:
df.tail()

Unnamed: 0,target,instance_name,sequence,seq_len,seq_unique_letters
3185,N,ORAHBPSBD-NEG-2881,TCTCTTCCCTTCCCCTCTCTCTTTCTTTCTTTTCTCTCCTCTTCTC...,60,CT
3186,N,ORAINVOL-NEG-2161,GAGCTCCCAGAGCAGCAAGAGGGCCAGCTGAAGCACCTGGAGAAGC...,60,ACGT
3187,N,ORARGIT-NEG-241,TCTCGGGGGCGGCCGGCGCGGCGGGGAGCGGTCCCCGGCCGCGGCC...,60,ACGT
3188,N,TARHBB-NEG-541,ATTCTACTTAGTAAACATAATTTCTTGTGCTAGATAACCAAATTAA...,60,ACGT
3189,N,TARHBD-NEG-1981,AGGCTGCCTATCAGAAGGTGGTGGCTGGTGTGGCTGCTGCTCTGGC...,60,ACGT


In [229]:
df.dtypes

target                object
instance_name         object
sequence              object
seq_len                int64
seq_unique_letters    object
dtype: object

In [230]:
df.seq_unique_letters.unique()

array(['ACGT', 'ACGNT', 'ACDGT', 'ACGRT', 'ACGST', 'AG', 'CGT', 'CT'],
      dtype=object)

In [231]:
df.seq_unique_letters.value_counts()

ACGT     3169
ACGNT      11
CGT         3
ACDGT       2
AG          2
ACGRT       1
CT          1
ACGST       1
Name: seq_unique_letters, dtype: int64

In [232]:
df.pivot_table("seq_len", "target", "seq_unique_letters", aggfunc=np.sum, fill_value=0)/60

seq_unique_letters,ACDGT,ACGNT,ACGRT,ACGST,ACGT,AG,CGT,CT
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
EI,0,5,0,0,762,0,0,0
IE,1,0,1,1,765,0,0,0
N,1,6,0,0,1642,2,3,1


In [233]:
cnt_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('mb', MultinomialNB()),
])

In [234]:
vect = CountVectorizer(analyzer="char").fit(df.sequence.values)
vect.vocabulary_

{'c': 1, 'a': 0, 'g': 3, 't': 7, 'n': 4, 'd': 2, 'r': 5, 's': 6}

In [23]:
X = df.sequence
y = df.target

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [27]:
%%time
cnt_pipeline.fit(X_train, y_train)

CPU times: user 20 ms, sys: 2.17 ms, total: 22.2 ms
Wall time: 20.6 ms


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [28]:
tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB(alpha=0.01)) 
])

parameters = {
#     'tfidf__max_df': tfidf_max_df_params,
#     'tfidf__min_df': tfidf_min_df_params,
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]
}

grid_search_tune = GridSearchCV(tfidf_pipeline, parameters, cv=10, n_jobs=3, verbose=3)

# %%time
grid_search_tune.fit(X_train, y_train)

Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV] tfidf__ngram_range=(1, 1) .......................................
[CV] tfidf__ngram_range=(1, 1) .......................................
[CV] tfidf__ngram_range=(1, 1) .......................................
[CV]  tfidf__ngram_range=(1, 1), score=0.6302083333333334, total=   0.0s
[CV] ........ tfidf__ngram_range=(1, 1), score=0.609375, total=   0.0s
[CV]  tfidf__ngram_range=(1, 1), score=0.5803108808290155, total=   0.0s
[CV] tfidf__ngram_range=(1, 1) .......................................
[CV] tfidf__ngram_range=(1, 1) .......................................
[CV] tfidf__ngram_range=(1, 1) .......................................
[CV]  tfidf__ngram_range=(1, 1), score=0.5989583333333334, total=   0.0s
[CV] tfidf__ngram_range=(1, 1) .......................................
[CV] .......... tfidf__ngram_range=(1, 1), score=0.5625, total=   0.0s
[CV]  tfidf__ngram_range=(1, 1), score=0.6197916666666666, total=   0.0s
[CV] tfi

[Parallel(n_jobs=3)]: Done  14 out of  30 | elapsed:    0.4s remaining:    0.4s


[CV]  tfidf__ngram_range=(1, 3), score=0.6197916666666666, total=   0.0s
[CV]  tfidf__ngram_range=(1, 3), score=0.6157894736842106, total=   0.0s


[Parallel(n_jobs=3)]: Done  30 out of  30 | elapsed:    0.6s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=3,
       param_grid={'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [29]:
grid_search_tune.best_params_

{'tfidf__ngram_range': (1, 1)}

In [30]:
tfidf_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])

In [31]:
cnt_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [32]:
cnt_y_pred = cnt_pipeline.predict(X_test)

In [33]:
tfidf_y_pred = tfidf_pipeline.predict(X_test)

In [34]:
print(classification_report(y_test, cnt_y_pred))

             precision    recall  f1-score   support

         EI       1.00      0.14      0.25       320
         IE       1.00      0.15      0.27       327
          N       0.53      1.00      0.70       629

avg / total       0.77      0.57      0.47      1276



In [35]:
print(classification_report(y_test, tfidf_y_pred))

             precision    recall  f1-score   support

         EI       1.00      0.14      0.25       320
         IE       1.00      0.15      0.27       327
          N       0.53      1.00      0.70       629

avg / total       0.77      0.57      0.47      1276



In [37]:
label_enc = LabelEncoder().fit(y)

In [38]:
cnt_vect = CountVectorizer(analyzer="char").fit(X)
label_enc = LabelEncoder().fit(y)

In [158]:
def get_encoded_target(y, label_enc):
    y_labeled = label_enc.transform(y)
    
    return y_labeled

def xgb_preprocess(x, y_label, cnt_vect):
    tmp = x
    if cnt_vect != None:
        tmp = cnt_vect.transform(x).toarray()
    
    return xgboost.DMatrix(tmp, label = y_label)

In [209]:
def runXgboost(x, y, cnt_vect=None, num_boost_round = 10000, lr = 0.01, max_delta_step = 4):
    y_labeled = get_encoded_target(y, label_enc)
    
    X_train, X_test, y_train, y_test = train_test_split(x, y_labeled, test_size=0.2, random_state=0)
    
    dtrain = xgb_preprocess(X_train, y_train, cnt_vect)
    dtest = xgb_preprocess(X_test, y_test, cnt_vect)
    
    params = {'objective': 'multi:softprob', 
              'eval_metric': 'mlogloss',
              'num_class': 3, 
              'max_delta_step': max_delta_step, 
              'eta': lr}

    evals = [(dtrain, 'train'), (dtest, 'eval')]

    xgb = xgboost.train(params=params,  
                    dtrain=dtrain, 
                    num_boost_round=num_boost_round, 
                    evals=evals,
                    early_stopping_rounds=10)
    return xgb

In [160]:
xgb = runXgboost(X, y, cnt_vect)

[0]	train-mlogloss:1.09515	eval-mlogloss:1.09564
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.09173	eval-mlogloss:1.09272
[2]	train-mlogloss:1.08835	eval-mlogloss:1.08986
[3]	train-mlogloss:1.08503	eval-mlogloss:1.08703
[4]	train-mlogloss:1.08174	eval-mlogloss:1.08426
[5]	train-mlogloss:1.07851	eval-mlogloss:1.08151
[6]	train-mlogloss:1.0753	eval-mlogloss:1.07883
[7]	train-mlogloss:1.07215	eval-mlogloss:1.07617
[8]	train-mlogloss:1.06903	eval-mlogloss:1.07362
[9]	train-mlogloss:1.06597	eval-mlogloss:1.07108
[10]	train-mlogloss:1.06295	eval-mlogloss:1.06861
[11]	train-mlogloss:1.05997	eval-mlogloss:1.06615
[12]	train-mlogloss:1.05702	eval-mlogloss:1.06373
[13]	train-mlogloss:1.05413	eval-mlogloss:1.06136
[14]	train-mlogloss:1.05126	eval-mlogloss:1.05905
[15]	train-mlogloss:1.04844	eval-mlogloss:1.05678
[16]	train-mlogloss:1.04569	eval-mlogloss:1.05454
[17]	train

[157]	train-mlogloss:0.84852	eval-mlogloss:0.910508
[158]	train-mlogloss:0.847774	eval-mlogloss:0.910016
[159]	train-mlogloss:0.847104	eval-mlogloss:0.90966
[160]	train-mlogloss:0.846445	eval-mlogloss:0.909252
[161]	train-mlogloss:0.8458	eval-mlogloss:0.908824
[162]	train-mlogloss:0.845147	eval-mlogloss:0.908432
[163]	train-mlogloss:0.844507	eval-mlogloss:0.908024
[164]	train-mlogloss:0.843879	eval-mlogloss:0.907677
[165]	train-mlogloss:0.843248	eval-mlogloss:0.907305
[166]	train-mlogloss:0.842625	eval-mlogloss:0.906914
[167]	train-mlogloss:0.842011	eval-mlogloss:0.906565
[168]	train-mlogloss:0.841403	eval-mlogloss:0.906171
[169]	train-mlogloss:0.840804	eval-mlogloss:0.905768
[170]	train-mlogloss:0.84021	eval-mlogloss:0.905376
[171]	train-mlogloss:0.839565	eval-mlogloss:0.905003
[172]	train-mlogloss:0.838924	eval-mlogloss:0.904577
[173]	train-mlogloss:0.838274	eval-mlogloss:0.904168
[174]	train-mlogloss:0.837597	eval-mlogloss:0.903774
[175]	train-mlogloss:0.836927	eval-mlogloss:0.90341

[313]	train-mlogloss:0.78167	eval-mlogloss:0.877789
[314]	train-mlogloss:0.781425	eval-mlogloss:0.877712
[315]	train-mlogloss:0.781213	eval-mlogloss:0.877661
[316]	train-mlogloss:0.781003	eval-mlogloss:0.877601
[317]	train-mlogloss:0.780772	eval-mlogloss:0.877479
[318]	train-mlogloss:0.780541	eval-mlogloss:0.877421
[319]	train-mlogloss:0.780322	eval-mlogloss:0.877373
[320]	train-mlogloss:0.780085	eval-mlogloss:0.877269
[321]	train-mlogloss:0.77986	eval-mlogloss:0.877211
[322]	train-mlogloss:0.779604	eval-mlogloss:0.877188
[323]	train-mlogloss:0.779354	eval-mlogloss:0.877136
[324]	train-mlogloss:0.779102	eval-mlogloss:0.877062
[325]	train-mlogloss:0.778853	eval-mlogloss:0.877027
[326]	train-mlogloss:0.778629	eval-mlogloss:0.876923
[327]	train-mlogloss:0.778377	eval-mlogloss:0.876861
[328]	train-mlogloss:0.778134	eval-mlogloss:0.876833
[329]	train-mlogloss:0.777891	eval-mlogloss:0.876793
[330]	train-mlogloss:0.77764	eval-mlogloss:0.876738
[331]	train-mlogloss:0.777406	eval-mlogloss:0.876

[469]	train-mlogloss:0.752108	eval-mlogloss:0.868588
[470]	train-mlogloss:0.75198	eval-mlogloss:0.868491
[471]	train-mlogloss:0.751861	eval-mlogloss:0.868435
[472]	train-mlogloss:0.751757	eval-mlogloss:0.868352
[473]	train-mlogloss:0.75165	eval-mlogloss:0.868302
[474]	train-mlogloss:0.751554	eval-mlogloss:0.868227
[475]	train-mlogloss:0.751424	eval-mlogloss:0.868152
[476]	train-mlogloss:0.751279	eval-mlogloss:0.868062
[477]	train-mlogloss:0.75116	eval-mlogloss:0.868006
[478]	train-mlogloss:0.75107	eval-mlogloss:0.867938
[479]	train-mlogloss:0.750945	eval-mlogloss:0.867866
[480]	train-mlogloss:0.750801	eval-mlogloss:0.86778
[481]	train-mlogloss:0.750691	eval-mlogloss:0.867726
[482]	train-mlogloss:0.750579	eval-mlogloss:0.867673
[483]	train-mlogloss:0.750462	eval-mlogloss:0.867595
[484]	train-mlogloss:0.750327	eval-mlogloss:0.867523
[485]	train-mlogloss:0.750233	eval-mlogloss:0.86746
[486]	train-mlogloss:0.75011	eval-mlogloss:0.867423
[487]	train-mlogloss:0.750016	eval-mlogloss:0.867355


[625]	train-mlogloss:0.734227	eval-mlogloss:0.862414
[626]	train-mlogloss:0.734089	eval-mlogloss:0.862357
[627]	train-mlogloss:0.733918	eval-mlogloss:0.862348
[628]	train-mlogloss:0.733753	eval-mlogloss:0.862288
[629]	train-mlogloss:0.733565	eval-mlogloss:0.86223
[630]	train-mlogloss:0.733416	eval-mlogloss:0.862169
[631]	train-mlogloss:0.733309	eval-mlogloss:0.862153
[632]	train-mlogloss:0.733189	eval-mlogloss:0.862102
[633]	train-mlogloss:0.733084	eval-mlogloss:0.862076
[634]	train-mlogloss:0.732953	eval-mlogloss:0.862042
[635]	train-mlogloss:0.732871	eval-mlogloss:0.861998
[636]	train-mlogloss:0.732803	eval-mlogloss:0.861985
[637]	train-mlogloss:0.73269	eval-mlogloss:0.861955
[638]	train-mlogloss:0.732597	eval-mlogloss:0.861914
[639]	train-mlogloss:0.732468	eval-mlogloss:0.86189
[640]	train-mlogloss:0.732359	eval-mlogloss:0.861844
[641]	train-mlogloss:0.732267	eval-mlogloss:0.861798
[642]	train-mlogloss:0.732116	eval-mlogloss:0.861759
[643]	train-mlogloss:0.731985	eval-mlogloss:0.861

In [42]:
test_y = get_encoded_target(y_test, label_enc)
test_X = xgb_preprocess(X_test, test_y, cnt_vect)

In [43]:
y_pred_proba = xgb.predict(test_X)

In [44]:
y_pred = [np.argmax(line) for line in y_pred_proba]

In [45]:
y_true = label_enc.inverse_transform(test_y)
y_pred = label_enc.inverse_transform(y_pred)

  if diff:
  if diff:


In [46]:
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

         EI       0.62      0.34      0.44       320
         IE       0.61      0.47      0.53       327
          N       0.59      0.79      0.68       629

avg / total       0.60      0.60      0.58      1276



In [235]:
tmp = df.sequence.apply(lambda seq: list(seq))

In [236]:
zeros = np.zeros((3190, 60))

In [237]:
total = ""
for letter in df.seq_unique_letters.unique():
    total += letter
letter_bags = list(set(list(total)))

In [238]:
letter_bags

['G', 'T', 'R', 'S', 'D', 'A', 'C', 'N']

In [151]:
for idx, z in enumerate(zeros):
    converted = map(convert_to_int, list(tmp.values[idx]))
    zeros[idx] = list(converted)

In [152]:
new_df = pd.DataFrame(zeros)

In [188]:
X_train, X_test, y_train, y_test = train_test_split(new_df, y, test_size=0.4, random_state=0)

In [154]:
multi_model = MultinomialNB(alpha=0.01)

In [155]:
multi_model = multi_model.fit(X_train, y_train)

In [156]:
y_pred = multi_model.predict(X_test)

In [157]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

         EI       0.71      0.80      0.75       320
         IE       0.76      0.83      0.79       327
          N       0.88      0.79      0.83       629

avg / total       0.81      0.80      0.80      1276



In [207]:
xgb_ = runXgboost(X_train, y_train)

[0]	train-mlogloss:1.0866	eval-mlogloss:1.08717
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.07475	eval-mlogloss:1.07588
[2]	train-mlogloss:1.06314	eval-mlogloss:1.06475
[3]	train-mlogloss:1.05173	eval-mlogloss:1.05381
[4]	train-mlogloss:1.04051	eval-mlogloss:1.04306
[5]	train-mlogloss:1.02948	eval-mlogloss:1.03249
[6]	train-mlogloss:1.01863	eval-mlogloss:1.0221
[7]	train-mlogloss:1.00793	eval-mlogloss:1.01196
[8]	train-mlogloss:0.997432	eval-mlogloss:1.00197
[9]	train-mlogloss:0.987105	eval-mlogloss:0.992072
[10]	train-mlogloss:0.976944	eval-mlogloss:0.982387
[11]	train-mlogloss:0.966944	eval-mlogloss:0.972869
[12]	train-mlogloss:0.957211	eval-mlogloss:0.963732
[13]	train-mlogloss:0.947489	eval-mlogloss:0.954574
[14]	train-mlogloss:0.938051	eval-mlogloss:0.945721
[15]	train-mlogloss:0.928773	eval-mlogloss:0.937189
[16]	train-mlogloss:0.919513	eval-mlogloss:0.9

[155]	train-mlogloss:0.319179	eval-mlogloss:0.374961
[156]	train-mlogloss:0.317204	eval-mlogloss:0.373047
[157]	train-mlogloss:0.315214	eval-mlogloss:0.371333
[158]	train-mlogloss:0.313252	eval-mlogloss:0.369513
[159]	train-mlogloss:0.311345	eval-mlogloss:0.36775
[160]	train-mlogloss:0.309439	eval-mlogloss:0.366052
[161]	train-mlogloss:0.307546	eval-mlogloss:0.364424
[162]	train-mlogloss:0.305687	eval-mlogloss:0.362698
[163]	train-mlogloss:0.303893	eval-mlogloss:0.361278
[164]	train-mlogloss:0.302071	eval-mlogloss:0.359572
[165]	train-mlogloss:0.300228	eval-mlogloss:0.357988
[166]	train-mlogloss:0.298411	eval-mlogloss:0.35638
[167]	train-mlogloss:0.296695	eval-mlogloss:0.354967
[168]	train-mlogloss:0.295013	eval-mlogloss:0.35345
[169]	train-mlogloss:0.29325	eval-mlogloss:0.351917
[170]	train-mlogloss:0.291489	eval-mlogloss:0.35041
[171]	train-mlogloss:0.289842	eval-mlogloss:0.348973
[172]	train-mlogloss:0.288133	eval-mlogloss:0.347457
[173]	train-mlogloss:0.286512	eval-mlogloss:0.34601

[311]	train-mlogloss:0.137776	eval-mlogloss:0.23101
[312]	train-mlogloss:0.137135	eval-mlogloss:0.230488
[313]	train-mlogloss:0.136487	eval-mlogloss:0.230005
[314]	train-mlogloss:0.135866	eval-mlogloss:0.229474
[315]	train-mlogloss:0.135245	eval-mlogloss:0.228946
[316]	train-mlogloss:0.134599	eval-mlogloss:0.228544
[317]	train-mlogloss:0.133995	eval-mlogloss:0.22815
[318]	train-mlogloss:0.133372	eval-mlogloss:0.227652
[319]	train-mlogloss:0.13277	eval-mlogloss:0.227098
[320]	train-mlogloss:0.132157	eval-mlogloss:0.22673
[321]	train-mlogloss:0.13156	eval-mlogloss:0.226278
[322]	train-mlogloss:0.130962	eval-mlogloss:0.225818
[323]	train-mlogloss:0.130384	eval-mlogloss:0.225366
[324]	train-mlogloss:0.129802	eval-mlogloss:0.225071
[325]	train-mlogloss:0.129203	eval-mlogloss:0.224615
[326]	train-mlogloss:0.128627	eval-mlogloss:0.224227
[327]	train-mlogloss:0.128033	eval-mlogloss:0.223854
[328]	train-mlogloss:0.127439	eval-mlogloss:0.223473
[329]	train-mlogloss:0.126891	eval-mlogloss:0.22307

[467]	train-mlogloss:0.071385	eval-mlogloss:0.18571
[468]	train-mlogloss:0.071143	eval-mlogloss:0.185507
[469]	train-mlogloss:0.070858	eval-mlogloss:0.185292
[470]	train-mlogloss:0.070595	eval-mlogloss:0.185063
[471]	train-mlogloss:0.070328	eval-mlogloss:0.184875
[472]	train-mlogloss:0.070077	eval-mlogloss:0.184658
[473]	train-mlogloss:0.069773	eval-mlogloss:0.184489
[474]	train-mlogloss:0.069506	eval-mlogloss:0.184338
[475]	train-mlogloss:0.069246	eval-mlogloss:0.184226
[476]	train-mlogloss:0.068963	eval-mlogloss:0.184148
[477]	train-mlogloss:0.068742	eval-mlogloss:0.18396
[478]	train-mlogloss:0.068458	eval-mlogloss:0.183793
[479]	train-mlogloss:0.068198	eval-mlogloss:0.183603
[480]	train-mlogloss:0.067924	eval-mlogloss:0.18345
[481]	train-mlogloss:0.067692	eval-mlogloss:0.183292
[482]	train-mlogloss:0.067419	eval-mlogloss:0.183208
[483]	train-mlogloss:0.06717	eval-mlogloss:0.183041
[484]	train-mlogloss:0.066905	eval-mlogloss:0.182885
[485]	train-mlogloss:0.066669	eval-mlogloss:0.1827

[623]	train-mlogloss:0.041582	eval-mlogloss:0.16997
[624]	train-mlogloss:0.041431	eval-mlogloss:0.169908
[625]	train-mlogloss:0.041311	eval-mlogloss:0.169827
[626]	train-mlogloss:0.041192	eval-mlogloss:0.169708
[627]	train-mlogloss:0.041063	eval-mlogloss:0.169674
[628]	train-mlogloss:0.040931	eval-mlogloss:0.169665
[629]	train-mlogloss:0.040784	eval-mlogloss:0.169517
[630]	train-mlogloss:0.040665	eval-mlogloss:0.169459
[631]	train-mlogloss:0.040555	eval-mlogloss:0.169381
[632]	train-mlogloss:0.040435	eval-mlogloss:0.169296
[633]	train-mlogloss:0.040293	eval-mlogloss:0.169171
[634]	train-mlogloss:0.040158	eval-mlogloss:0.169104
[635]	train-mlogloss:0.040042	eval-mlogloss:0.169092
[636]	train-mlogloss:0.039911	eval-mlogloss:0.169012
[637]	train-mlogloss:0.039771	eval-mlogloss:0.168955
[638]	train-mlogloss:0.039651	eval-mlogloss:0.168858
[639]	train-mlogloss:0.039537	eval-mlogloss:0.168736
[640]	train-mlogloss:0.039409	eval-mlogloss:0.168722
[641]	train-mlogloss:0.039289	eval-mlogloss:0.1

[779]	train-mlogloss:0.026366	eval-mlogloss:0.162666
[780]	train-mlogloss:0.026288	eval-mlogloss:0.162611
[781]	train-mlogloss:0.026227	eval-mlogloss:0.162553
[782]	train-mlogloss:0.026157	eval-mlogloss:0.162524
[783]	train-mlogloss:0.026092	eval-mlogloss:0.162447
[784]	train-mlogloss:0.026018	eval-mlogloss:0.16244
[785]	train-mlogloss:0.025953	eval-mlogloss:0.162388
[786]	train-mlogloss:0.025889	eval-mlogloss:0.162361
[787]	train-mlogloss:0.025819	eval-mlogloss:0.162278
[788]	train-mlogloss:0.025758	eval-mlogloss:0.162312
[789]	train-mlogloss:0.02569	eval-mlogloss:0.162311
[790]	train-mlogloss:0.025636	eval-mlogloss:0.162281
[791]	train-mlogloss:0.025559	eval-mlogloss:0.162265
[792]	train-mlogloss:0.025504	eval-mlogloss:0.16215
[793]	train-mlogloss:0.025432	eval-mlogloss:0.162136
[794]	train-mlogloss:0.025371	eval-mlogloss:0.162052
[795]	train-mlogloss:0.025306	eval-mlogloss:0.162033
[796]	train-mlogloss:0.025248	eval-mlogloss:0.162002
[797]	train-mlogloss:0.025178	eval-mlogloss:0.161

[935]	train-mlogloss:0.018451	eval-mlogloss:0.157284
[936]	train-mlogloss:0.018414	eval-mlogloss:0.157246
[937]	train-mlogloss:0.018381	eval-mlogloss:0.157187
[938]	train-mlogloss:0.018343	eval-mlogloss:0.157162
[939]	train-mlogloss:0.018308	eval-mlogloss:0.157132
[940]	train-mlogloss:0.018272	eval-mlogloss:0.157137
[941]	train-mlogloss:0.018234	eval-mlogloss:0.157147
[942]	train-mlogloss:0.018202	eval-mlogloss:0.157134
[943]	train-mlogloss:0.018163	eval-mlogloss:0.157025
[944]	train-mlogloss:0.018125	eval-mlogloss:0.156991
[945]	train-mlogloss:0.018083	eval-mlogloss:0.15694
[946]	train-mlogloss:0.018051	eval-mlogloss:0.156925
[947]	train-mlogloss:0.018018	eval-mlogloss:0.156897
[948]	train-mlogloss:0.017979	eval-mlogloss:0.156834
[949]	train-mlogloss:0.017942	eval-mlogloss:0.156758
[950]	train-mlogloss:0.017903	eval-mlogloss:0.156751
[951]	train-mlogloss:0.017872	eval-mlogloss:0.156708
[952]	train-mlogloss:0.017833	eval-mlogloss:0.156654
[953]	train-mlogloss:0.017801	eval-mlogloss:0.1

[1089]	train-mlogloss:0.014009	eval-mlogloss:0.154397
[1090]	train-mlogloss:0.013989	eval-mlogloss:0.154358
[1091]	train-mlogloss:0.013965	eval-mlogloss:0.154398
[1092]	train-mlogloss:0.013944	eval-mlogloss:0.15441
[1093]	train-mlogloss:0.013922	eval-mlogloss:0.154384
[1094]	train-mlogloss:0.013899	eval-mlogloss:0.154331
[1095]	train-mlogloss:0.013877	eval-mlogloss:0.15432
[1096]	train-mlogloss:0.013855	eval-mlogloss:0.154312
[1097]	train-mlogloss:0.013832	eval-mlogloss:0.154284
[1098]	train-mlogloss:0.013813	eval-mlogloss:0.154309
[1099]	train-mlogloss:0.013789	eval-mlogloss:0.154314
[1100]	train-mlogloss:0.013766	eval-mlogloss:0.154323
[1101]	train-mlogloss:0.013746	eval-mlogloss:0.154296
[1102]	train-mlogloss:0.013724	eval-mlogloss:0.154319
[1103]	train-mlogloss:0.013705	eval-mlogloss:0.154342
[1104]	train-mlogloss:0.013682	eval-mlogloss:0.154294
[1105]	train-mlogloss:0.013662	eval-mlogloss:0.154301
[1106]	train-mlogloss:0.01364	eval-mlogloss:0.154277
[1107]	train-mlogloss:0.013617	

In [196]:
def predict_xgb(model, x, y, cnt_vect=None):
    test_y = get_encoded_target(y, label_enc)
    test_X = xgb_preprocess(x, test_y, cnt_vect)
    y_pred_proba = model.predict(test_X)
    
    y_pred = [np.argmax(line) for line in y_pred_proba]
    y_true = label_enc.inverse_transform(test_y)
    y_pred = label_enc.inverse_transform(y_pred)
    print(classification_report(y_true, y_pred))

In [208]:
predict_xgb(xgb_, X_test, y_test)

             precision    recall  f1-score   support

         EI       0.95      0.97      0.96       320
         IE       0.95      0.94      0.94       327
          N       0.98      0.98      0.98       629

avg / total       0.96      0.96      0.96      1276



  if diff:
  if diff:


In [186]:
y_pred = label_enc.inverse_transform(y_pred)

  if diff:


In [199]:
new_df["target"] = df.target

In [205]:
xgb_.best_iteration

998

In [212]:
cnt_tmp = CountVectorizer(analyzer="char").fit(df.sequence)

In [213]:
cnt_tmp.vocabulary_

{'c': 1, 'a': 0, 'g': 3, 't': 7, 'n': 4, 'd': 2, 'r': 5, 's': 6}

In [214]:
df.sequence[0]

'CCAGCTGCATCACAGGAGGCCAGCGAGCAGGTCTGTTCCAAGGGCCTTCGAGCCAGTCTG'