### CCTS 40500: FINAL
#### Abdallah Aboelela

In [139]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Just to see how these would do
from sklearn.ensemble import RandomForestClassifier

In [146]:
# setup
# Single seed shared by NumPy's global RNG and the KFold splitters below.
seed = 7
# The module is imported as `np`; the bare name `numpy` is not defined here.
np.random.seed(seed)
# Input files, one per class; read from the `data/` directory when loading.
files = ['hiv-db-LTNP.seq', 'hiv-db-SP.seq', 'hiv-db-zRP.seq']

In [147]:
# load dataset
# Class label for each input file. A file not listed here keeps its file name
# as the label, which is what the original replace-based relabelling did.
type_labels = {
    'hiv-db-LTNP.seq': 'LTNP',
    'hiv-db-zRP.seq': 'RP',
    'hiv-db-SP.seq': 'SP',
}

# Read every file first and concatenate once: calling pd.concat inside the
# loop re-copies all previously loaded rows on each iteration.
frames = []
for fname in files:
    df = pd.read_csv('data/' + fname, header=None)
    df['type'] = type_labels.get(fname, fname)
    frames.append(df)

# ignore_index=True gives each row a unique label instead of repeating
# 0..n-1 once per file.
combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Fails loudly if a file did not parse to exactly one data column.
combined.columns = ['seq', 'type']
combined['len'] = combined['seq'].str.len()

In [142]:
# Expand every sequence into one column per position (0 .. longest - 1).
# Sequences shorter than the longest one are padded with NaN.
#
# The table is built in a single step instead of inserting one column per
# position: that avoids one full pass over the data per column, and it no
# longer depends on a helper function defined further down the notebook.
max_len = int(combined['len'].max())
rows = [list(seq) + [np.nan] * (max_len - len(seq)) for seq in combined['seq']]
positions = pd.DataFrame(rows, columns=range(max_len))

# Insert by position (plain array) so that no index alignment takes place;
# 'type' stays the first column, followed by the position columns.
positions.insert(0, 'type', combined['type'].to_numpy())

# 'seq' and 'len' are intentionally not carried over.
combined = positions

In [143]:
# Integer-encode the nucleotide letters, then turn the NaN padding into the
# extra code 4. Values that match none of these (e.g. the class labels in
# 'type') are left untouched.
nucleotide_codes = {'A': 0, 'T': 1, 'G': 2, 'C': 3}
combined = combined.replace(nucleotide_codes).replace(np.nan, 4)

#### Attempt using random forest

In [144]:
# Random-forest reference score: 10-fold cross-validated accuracy.
X = combined.drop(columns='type')
y = combined['type']

# Pin the forest's own RNG as well as the fold shuffling; without
# random_state the score depends on whatever state NumPy's global RNG is in.
clf = RandomForestClassifier(n_estimators=100, random_state=seed)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(clf, X, y, cv=kfold)

# Mean and standard deviation of the per-fold accuracy, in percent.
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 74.89% (10.96%)


#### Attempt at Neural Net

In [145]:
# Neural-network attempt, scored with the same 10-fold split settings as the
# random forest. NOTE: `baseline_model` is defined in the Appendix at the
# bottom of the notebook, so that cell has to be executed before this one.
X = combined.drop(columns='type')
y = combined['type']

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
clf = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)

# error_score='raise' surfaces a failing fold instead of recording NaN.
results = cross_val_score(clf, X, y, cv=kfold, error_score='raise')

# Mean and standard deviation of the per-fold accuracy, in percent.
print("Baseline: %.2f%% (%.2f%%)" % (100 * results.mean(), 100 * results.std()))

Baseline: 47.42% (13.76%)


## Appendix

In [137]:
def get_position(x, i):
    """Return the element of ``x`` at index ``i``, or NaN if there is none.

    Parameters
    ----------
    x : sequence (in this notebook, a nucleotide string)
        Value to index into.
    i : int
        Position to read. Negative values follow normal Python indexing.

    Returns
    -------
    ``x[i]`` when the lookup succeeds; ``np.nan`` when ``i`` is out of range
    or ``x`` cannot be indexed at all (for example a missing value stored as
    a float).
    """
    try:
        return x[i]
    # Only "no such position" (LookupError covers IndexError/KeyError) and
    # "not indexable" (TypeError) mean a missing value. Any other exception
    # is a real bug and is allowed to propagate instead of becoming NaN.
    except (LookupError, TypeError):
        return np.nan

In [138]:
# define baseline model
def baseline_model(input_dim=9840, hidden_units=8, n_classes=3):
    """Build and compile a single-hidden-layer softmax classifier.

    Parameters
    ----------
    input_dim : int, default 9840
        Number of input features. It must equal the number of feature columns
        passed to ``fit``. The default is presumably the longest sequence
        length in this dataset; confirm if the data changes.
    hidden_units : int, default 8
        Width of the ReLU hidden layer.
    n_classes : int, default 3
        Number of softmax outputs, one per class label.

    Returns
    -------
    A compiled ``Sequential`` model using categorical cross-entropy loss, the
    Adam optimizer and accuracy as the reported metric.

    Calling the function with no arguments builds the same 9840-8-3 network
    as before, so ``KerasClassifier(build_fn=baseline_model, ...)`` keeps
    working unchanged.
    """
    # create model
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=input_dim, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model