In [1]:
import csv
import numpy as np
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the sequences, one per line of the input file.
# Only the line terminator is stripped (instead of blindly dropping the last
# character), so a final line without a trailing newline is kept intact.
with open('data/sequences.txt', 'r') as f:
    sequences = [line.rstrip('\n') for line in f]

In [3]:
# Split data into training and test sets.
# Each line of the label file is "<protein name>,<label>", and line i is
# paired with sequences[i]. Proteins whose label field is empty make up the
# test set; all others go to the training set with their integer label.
sequences_train = list()
sequences_test = list()
proteins_test = list()
y_train = list()
with open('data/graph_labels.txt', 'r') as f:
    for i, line in enumerate(f):
        t = line.split(',')
        # Strip surrounding whitespace rather than slicing off the last
        # character: on a final line without a trailing newline the slice
        # would eat the last digit of the label (or make a one-digit label
        # look empty and misroute the protein to the test set).
        label = t[1].strip()
        if len(label) == 0:
            proteins_test.append(t[0])
            sequences_test.append(sequences[i])
        else:
            sequences_train.append(sequences[i])
            y_train.append(int(label))

len(sequences_train), len(y_train), len(sequences_test), len(proteins_test)

(4888, 4888, 1223, 1223)

In [4]:
# Length of the longest sequence in the training set and in the test set.
max(len(seq) for seq in sequences_train), max(len(seq) for seq in sequences_test)

(989, 910)

In [5]:
# Map sequences to TF-IDF vectors built from their character 1- to 3-grams.
# The vectorizer is fitted on the training sequences only; the test sequences
# are then transformed with that same fitted vectorizer.
vec = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')
X_train = vec.fit_transform(sequences_train)
X_test = vec.transform(sequences_test)

X_train.shape, X_test.shape

((4888, 8466), (1223, 8466))

In [6]:
# Fit a logistic regression classifier (liblinear solver) on the training
# features, then compute per-class probabilities for the test features.
clf = LogisticRegression(solver='liblinear').fit(X_train, y_train)
y_pred_proba = clf.predict_proba(X_test)

In [7]:
# Number of classes, assuming labels are the integers 0..max(y_train).
n_class = max(y_train) + 1
n_class

18

In [8]:
# Shape check on the probabilities predicted for X_test; presumably
# (number of test sequences, number of classes) — confirm against the output.
y_pred_proba.shape

(1223, 18)

In [None]:
# Write predictions to a file: a header row ("name", "class0", "class1", ...)
# followed by one row per test protein holding its name and its predicted
# class probabilities.
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write end up with blank rows in the file.
with open('sample_submission.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    # Take the number of class columns from the prediction matrix instead of
    # hard-coding it, so the header always matches the width of the rows.
    header = ['name'] + ['class' + str(i) for i in range(y_pred_proba.shape[1])]
    writer.writerow(header)
    # proteins_test and the rows of y_pred_proba are in the same order.
    for protein, probabilities in zip(proteins_test, y_pred_proba):
        writer.writerow([protein] + probabilities.tolist())
