# Initialization

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

import string

# Prepare Data
## Load CSV

In [2]:
df_raw = pd.read_csv('trg.csv')
df_raw.head(10)

Unnamed: 0,id,class,abstract
0,1,B,the 4 202 353 bp genome of the alkaliphilic ba...
1,2,A,the complete 1751377-bp sequence of the genome...
2,3,E,in 1992 we started assembling an ordered libra...
3,4,E,the aim of this study is to measure human mito...
4,5,B,the amino acid sequence of the spirulina maxim...
5,6,B,the genus xanthomonas is a diverse and economi...
6,7,B,the complete nucleotide sequence of the genome...
7,8,B,the complete genome sequence of caulobacter cr...
8,9,V,the complete dna sequence of the a2 strain of ...
9,10,B,the complete genomic sequence of corynebacteri...


In [3]:
df_raw.describe(include = 'all')   #Describe

Unnamed: 0,id,class,abstract
count,4000.0,4000,4000
unique,,4,2686
top,,E,the national institutes of health mammalian ge...
freq,,2144,42
mean,2000.5,,
std,1154.844867,,
min,1.0,,
25%,1000.75,,
50%,2000.5,,
75%,3000.25,,


In [4]:
df_raw.groupby('class').describe(include = ['O'])   # Describe object type group by class

Unnamed: 0_level_0,abstract,abstract,abstract,abstract
Unnamed: 0_level_1,count,unique,top,freq
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,128,45,the complete 1751377-bp sequence of the genome...,17
B,1602,726,an approach for genome analysis based on seque...,30
E,2144,1800,the national institutes of health mammalian ge...,42
V,126,116,the complete 172282 base pairs nucleotide sequ...,4


In [5]:
print(f"{df_raw['class'].unique()}")    #Show unique values

['B' 'A' 'E' 'V']


## Preprocessing text

In [6]:
def text_process(arg_string):
    """
    Remove punctuation
    Return a list of words with lower cases.
    """
    return [word for word in ''.join([char for char in arg_string if char not in string.punctuation]).split() if word.lower()]
    pass


In [7]:
df_raw['abstract'].head(10).apply(text_process)

0    [the, 4, 202, 353, bp, genome, of, the, alkali...
1    [the, complete, 1751377bp, sequence, of, the, ...
2    [in, 1992, we, started, assembling, an, ordere...
3    [the, aim, of, this, study, is, to, measure, h...
4    [the, amino, acid, sequence, of, the, spirulin...
5    [the, genus, xanthomonas, is, a, diverse, and,...
6    [the, complete, nucleotide, sequence, of, the,...
7    [the, complete, genome, sequence, of, caulobac...
8    [the, complete, dna, sequence, of, the, a2, st...
9    [the, complete, genomic, sequence, of, coryneb...
Name: abstract, dtype: object

## Vectorization 
use **sklearn.feature.text.CountVectorizer**

In [8]:
bag_of_word_transformer = CountVectorizer(analyzer=text_process).fit(df_raw['abstract'])

In [9]:
len(bag_of_word_transformer.vocabulary_)

31424

We have got the transformer, try it on a sample

In [10]:
sample = df_raw['abstract'][0]
sample

'the 4 202 353 bp genome of the alkaliphilic bacterium bacillus halodurans c-125 contains 4066 predicted protein coding sequences cdss 2141 527 of which have functional assignments 1182 29 of which are conserved cdss with unknown function and 743 18 3 of which have no match to any protein database among the total cdss 88 match sequences of proteins found only in bacillus subtilis and 667 are widely conserved in comparison with the proteins of various organisms including bsubtilis the b halodurans genome contains 112 transposase genes indicating that transposases have played an important evolutionary role in horizontal gene transfer and also in internal genetic rearrangement in the genome strain c-125 lacks some of the necessary genes for competence such as coms srfa and rapc supporting the fact that competence has not been demonstrated experimentally in c-125 there is no paralog of tupa encoding teichuronopeptide which contributes to alkaliphily in the c-125 genome and an ortholog of t

In [11]:
sample_vector = bag_of_word_transformer.transform([sample])

In [12]:
print(sample_vector)

  (0, 103)	1
  (0, 228)	1
  (0, 261)	1
  (0, 316)	1
  (0, 962)	1
  (0, 1223)	1
  (0, 1299)	1
  (0, 1776)	1
  (0, 1907)	1
  (0, 2167)	1
  (0, 2476)	1
  (0, 2513)	1
  (0, 3002)	1
  (0, 3444)	1
  (0, 3618)	1
  (0, 3890)	1
  (0, 4125)	1
  (0, 4645)	1
  (0, 5137)	1
  (0, 5139)	1
  (0, 5140)	1
  (0, 5346)	1
  (0, 5466)	1
  (0, 5543)	3
  (0, 5595)	5
  :	:
  (0, 27737)	1
  (0, 27822)	1
  (0, 28034)	1
  (0, 28222)	1
  (0, 28439)	1
  (0, 28479)	1
  (0, 28501)	1
  (0, 28607)	1
  (0, 28966)	1
  (0, 29139)	3
  (0, 29141)	12
  (0, 29155)	1
  (0, 29204)	1
  (0, 29413)	5
  (0, 29479)	1
  (0, 29598)	1
  (0, 29676)	1
  (0, 29677)	1
  (0, 29994)	2
  (0, 30295)	1
  (0, 30307)	1
  (0, 30564)	1
  (0, 30996)	5
  (0, 31025)	1
  (0, 31055)	2


In [13]:
bag_of_word_transformer.get_feature_names()[29141]

'the'

In [14]:
sample_vector.shape

(1, 31424)

Transform everything

In [15]:
vectorized_abstracts = bag_of_word_transformer.transform(df_raw['abstract'])

In [16]:
print(f'Shape: {vectorized_abstracts.shape}')
print(f'# Non-Zero Occurence: {vectorized_abstracts.nnz}')

Shape: (4000, 31424)
# Non-Zero Occurence: 433933


## Training

In [17]:
X_train, X_val, y_train, y_val = train_test_split(vectorized_abstracts, df_raw['class'], test_size = 0.2)

In [18]:
mnb_model = MultinomialNB().fit(X_train, y_train)

In [22]:
predictions = mnb_model.predict(X_val)

In [20]:
print(classification_report(y_val, predictions)) 

              precision    recall  f1-score   support

           A       1.00      0.67      0.80        24
           B       0.96      0.97      0.96       317
           E       0.92      0.99      0.95       433
           V       1.00      0.08      0.14        26

    accuracy                           0.94       800
   macro avg       0.97      0.67      0.71       800
weighted avg       0.94      0.94      0.93       800



## Test on test set

In [25]:
df_test = pd.read_csv('tst.csv')

In [26]:
df_test

Unnamed: 0,id,abstract
0,1,in a previous work all three components of com...
1,2,we compared morphology of two geographically c...
2,3,factor xiii mr 320000 is a blood coagulation f...
3,4,we report the characterisation of a human gene...
4,5,fat tissue plays a critical role in the regula...
...,...,...
995,996,the molecular chaperonins such as groel are no...
996,997,the cdna sequence of the flavoprotein subunit ...
997,998,the higher plant arabidopsis thaliana arabidop...
998,999,the hyperthermophilic euryarchaeon pyrococcus ...


In [27]:
x_test_vectorized = bag_of_word_transformer.transform(df_test['abstract'])

In [30]:
predictions_on_test = mnb_model.predict(x_test_vectorized)

In [43]:
pd.DataFrame({
    'id':range(1,1001),
    'class':predictions_on_test
}).to_csv('submission.csv', index=False)

In [44]:
!kaggle competitions submit -c naivebayes-21 -f submission.csv -m "blyat"

100%|██████████████████████████████████████| 5.76k/5.76k [00:04<00:00, 1.41kB/s]
Successfully submitted to Naive Bayes Abstract Classification