## Initializations, Class Definitions and Type Declarations

In [None]:
from math import log, exp, sqrt, pow

import numpy as np
from scipy.stats import sem, t

from DataFrame import DataFrame
from CountVectorizer import CountVectorizer
from MultinomialNB import MultinomialNB,get_accuracy, cross_val_score

# Base random seed; the repeated cross-validation derives one seed per repetition from it.
rs = 42

## Read Data
The data file (`trg.csv`) is expected to be in the root directory, next to this notebook.

In [None]:
# Project-local DataFrame wrapper (not pandas).
df = DataFrame()
# read_csv is called for its effect on `df`: the return value is discarded and
# later cells read the loaded rows through `df` itself.
df.read_csv('trg.csv')

In [None]:
# Preview the loaded rows to sanity-check the parse (id, class label, abstract text in the output below).
df.head()

array([['1', 'B',
        '"the 4 202 353 bp genome of the alkaliphilic bacterium bacillus halodurans c-125 contains 4066 predicted protein coding sequences cdss 2141 527 of which have functional assignments 1182 29 of which are conserved cdss with unknown function and 743 18 3 of which have no match to any protein database among the total cdss 88 match sequences of proteins found only in bacillus subtilis and 667 are widely conserved in comparison with the proteins of various organisms including bsubtilis the b halodurans genome contains 112 transposase genes indicating that transposases have played an important evolutionary role in horizontal gene transfer and also in internal genetic rearrangement in the genome strain c-125 lacks some of the necessary genes for competence such as coms srfa and rapc supporting the fact that competence has not been demonstrated experimentally in c-125 there is no paralog of tupa encoding teichuronopeptide which contributes to alkaliphily in the c-125 

## Train Test Split

In [None]:
# Column 2 is used as the model input and column 1 as the target throughout.
# Full-dataset views first (these feed the cross-validation cells) ...
X_raw, y = df.get_col(2), df.get_col(1)

# ... then a single hold-out split, from which the same two columns are pulled.
train, val = df.train_test_split()
X_train_raw, X_val_raw = train.get_col(2), val.get_col(2)
y_train, y_val = train.get_col(1), val.get_col(1)

  return self.data[rows]


## Vectorization

Vectorization uses a list of stopwords. The list is read from a plain-text file, `stopwords.txt`, which is expected to be in the root directory.



In [None]:
# Stopwords are read as an array of strings from stopwords.txt;
# max_features=1000 matches the "Max Features = 1000" experiments below.
vectorizer = CountVectorizer(
    stop_words=np.genfromtxt('stopwords.txt', dtype=str),
    max_features=1000,
)

In [None]:
# Fit the vectorizer on every document (training and validation rows alike).
# NOTE(review): fitting on X_raw lets validation text influence the fitted
# vectorizer that the hold-out evaluation relies on — confirm this is intended.
vectorizer.fit(X_raw)
# With tf_idf=True, transform returns a pair, unpacked here as (matrix, idf);
# presumably the second item holds the IDF weights — verify in CountVectorizer.
vec_X_train, idf_train = vectorizer.transform(X_train_raw, tf_idf=True)
# Full-dataset matrix and idf, consumed by the repeated cross-validation cells.
vec_X, idf = vectorizer.transform(X_raw, tf_idf=True)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  D = np.array(D, dtype=np.float)


In [None]:
# Inspect the vectorized training matrix (the output below shows integer entries stored with dtype=object).
vec_X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [3, 0, 1, ..., 0, 0, 0],
       [0, 3, 1, ..., 0, 0, 0],
       ...,
       [0, 2, 2, ..., 0, 0, 0],
       [0, 1, 6, ..., 0, 0, 0],
       [12, 3, 2, ..., 0, 0, 0]], dtype=object)

## MultinomialNB 

### Max Features = 1000, no TF_IDF

In [None]:
# Baseline classifier for this section: fitted on the training split with no
# idf argument, i.e. the "no TF_IDF" configuration named in the heading.
model = MultinomialNB()
model.fit(vec_X_train, y_train)

In [None]:
# Predict on the same rows the model was fitted on (training-set predictions).
predictions_on_train = model.predict(vec_X_train)

In [None]:
# Training accuracy — measured on the data the model was fitted on, so treat it as optimistic.
get_accuracy(predictions_on_train, y_train, verbose=True)

Correct: 2833	Accuracy: 88.5312%


0.8853125

In [None]:
# Vectorize the validation texts with the already-fitted vectorizer.
# tf_idf is not requested here and the result is bound to a single name;
# presumably only the matrix is returned in that case — verify in CountVectorizer.
vec_X_val = vectorizer.transform(X_val_raw)

In [None]:
# Predict labels for the held-out validation split with the baseline model.
predictions_on_val = model.predict(vec_X_val)

In [None]:
# Hold-out accuracy of the baseline model on the validation split.
get_accuracy(predictions_on_val, y_val, verbose=True)

Correct: 692	Accuracy: 86.5000%


0.865

#### Repeated Cross Validation
Cross-validation is repeated 16 times; before each repetition the dataset is randomly shuffled. See the `MultinomialNB` module for the implementation of the cross-validation method.

In [None]:
# Repeated cross-validation on the full vectorized dataset (no idf passed).
rep = 16
n_rows = len(df.data)
results = np.zeros(rep)
for run in range(rep):
    # Each repetition gets its own deterministic seed: rs, 2*rs, 3*rs, ...
    order = np.arange(n_rows)
    np.random.default_rng(rs * (run + 1)).shuffle(order)
    # Apply one permutation to both features and labels so rows stay aligned.
    fold_scores = cross_val_score(vec_X[order], y[order])
    results[run] = np.mean(fold_scores)
results

array([0.87425, 0.873  , 0.87475, 0.87025, 0.872  , 0.87325, 0.872  ,
       0.8685 , 0.87175, 0.8705 , 0.87275, 0.8735 , 0.87   , 0.87175,
       0.867  , 0.87225])

In [None]:
# Summarize the repetitions. np.std uses its default ddof=0 here (population standard deviation).
print(f'mean: {np.mean(results)}\tstd: {np.std(results)}')

mean: 0.87171875	std: 0.0019919270663103566


### Use TF_IDF, Max Features = 1000

In [None]:
# TF-IDF variant of the classifier, fitted on the training split.
model_tf_idf = MultinomialNB()
# Use the idf obtained from the training documents (`idf_train`), not the one
# from the full dataset (`idf`): the full-dataset weights also reflect the
# validation rows, which would leak held-out information into this model.
model_tf_idf.fit(vec_X_train, y_train, idf = idf_train)

In [None]:
# Predict on the training rows with the TF-IDF model.
predictions_tf_idf = model_tf_idf.predict(vec_X_train)

In [None]:
# Training accuracy of the TF-IDF model (verbose is not requested, so only the value is displayed).
get_accuracy(predictions_tf_idf, y_train)

0.89625

#### Repeated Cross Validation

In [None]:
# Repeated cross-validation on the full vectorized dataset, passing the idf
# weights to cross_val_score.
rep = 16
results_tf_idf = np.zeros(rep)
for i in range(rep):
    index_list = np.arange(len(df.data))
    # Seed every repetition from `rs` (rs, 2*rs, ...). An unseeded generator
    # produced different shuffles on every run, so the reported numbers could
    # not be reproduced; these seeds also match the count-only experiment, so
    # both configurations are scored on the same shuffles.
    np.random.default_rng(rs*(i+1)).shuffle(index_list)
    # Index features and labels with the same permutation to keep rows aligned.
    vec_X_shuffled = vec_X[index_list]
    vec_Y_shuffled = y[index_list]
    results_tf_idf[i]=np.mean(cross_val_score(vec_X_shuffled, vec_Y_shuffled, idf))
results_tf_idf

array([0.8875 , 0.8845 , 0.8845 , 0.88575, 0.884  , 0.887  , 0.88725,
       0.88675, 0.885  , 0.88675, 0.8845 , 0.885  , 0.88725, 0.88625,
       0.888  , 0.88525])

In [None]:
# Summarize the TF-IDF repetitions. np.std uses its default ddof=0 here (population standard deviation).
print(f'mean: {np.mean(results_tf_idf)}\tstd: {np.std(results_tf_idf)}')

mean: 0.8859531250000001	std: 0.0012475562048961737


## Hypothesis testing with SciPy

In [None]:
# Two-sample t-test on the repeated-CV accuracies: count-only (`results`)
# versus TF-IDF (`results_tf_idf`).
#
# Every statistic is computed from the result arrays themselves. Pasted-in
# means and standard deviations go stale as soon as an experiment is re-run,
# which silently invalidates the t statistic and the p-value.
mean1, mean2 = np.mean(results), np.mean(results_tf_idf)
n1, n2 = len(results), len(results_tf_idf)

# Standard error of each mean (scipy.stats.sem uses the sample std, ddof=1).
se1, se2 = sem(results), sem(results_tf_idf)

# Standard error of the difference between the two means.
sed = sqrt(se1**2.0 + se2**2.0)

t_stat = (mean1 - mean2) / sed

# Degrees of freedom for two samples of sizes n1 and n2. Deliberately NOT
# named `df`: that name holds the loaded dataset, which the cross-validation
# cells read via `len(df.data)`.
dof = n1 + n2 - 2

alpha = 0.05
# Two-sided critical value, consistent with the two-sided p-value below.
cv = t.ppf(1.0 - alpha / 2.0, dof)

# Two-sided p-value; the survival function avoids the precision loss of
# computing 1 - cdf for very small tail probabilities.
p = 2.0 * t.sf(abs(t_stat), dof)

The resulting p-value:

In [None]:
# Display the two-sided p-value computed in the cell above.
p

0.016157454136917826