In [37]:
import numpy
import math
import csv
from sklearn import linear_model, preprocessing

In [38]:
def import_features(filepath, n_features=264):
    """Read a comma-separated file of numeric features into a list of rows.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    n_features : int or None, optional
        Number of leading columns to keep from every row. Defaults to 264,
        the width that used to be hard-coded here. Pass ``None`` to keep
        every column of each row.

    Returns
    -------
    list of list of float
        One inner list per CSV row, in file order.

    Raises
    ------
    IndexError
        If a row has fewer than ``n_features`` columns.
    ValueError
        If a kept cell cannot be converted with ``float()``.
    """
    data = []
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation recommends for file objects passed to csv.reader.
    with open(filepath, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for line_no, row in enumerate(reader, start=1):
            if n_features is None:
                cells = row
            else:
                # Slicing alone would silently accept a short row, so check
                # the width explicitly and keep the original exception type.
                if len(row) < n_features:
                    raise IndexError(
                        'row %d of %s has %d columns, expected at least %d'
                        % (line_no, filepath, len(row), n_features)
                    )
                cells = row[:n_features]
            data.append([float(cell) for cell in cells])
    return data

In [39]:
def import_y(filepath):
    """Load the values stored in a whitespace-separated text file.

    Parameters
    ----------
    filepath : str
        Path of the text file, read with ``numpy.loadtxt``.

    Returns
    -------
    numpy.ndarray
        The parsed values, always with at least one dimension.
    """
    # ndmin=1: a file holding a single value would otherwise come back as a
    # 0-d array, on which len() and integer indexing both fail.
    return numpy.loadtxt(filepath, ndmin=1)

In [40]:
def scale_features(Xtrain, Xtest):
    """Transform both feature sets with a StandardScaler fitted on Xtrain.

    The scaler is fitted on ``Xtrain`` only and then applied to both inputs,
    so ``Xtest`` is transformed with the training-set statistics.

    Returns the pair ``(transformed Xtrain, transformed Xtest)``.
    """
    standardizer = preprocessing.StandardScaler()
    standardizer.fit(Xtrain)
    train_scaled = standardizer.transform(Xtrain)
    test_scaled = standardizer.transform(Xtest)
    return train_scaled, test_scaled

In [41]:
def export_data_accuracy(y, filepath='pred_label.csv'):
    """Write predicted labels to a CSV file, one sample per row.

    The file starts with the header ``Sample_id,Sample_label``; sample ids
    are 1-based positions in ``y``.

    Parameters
    ----------
    y : iterable
        Predicted labels in sample order. Each is written via ``str()``.
    filepath : str, optional
        Destination path. Defaults to ``'pred_label.csv'``, the name that
        used to be hard-coded. Any existing file is overwritten.
    """
    # newline='' stops the text layer from translating the '\r\n' that
    # csv.writer emits, which would otherwise add blank rows on Windows.
    with open(filepath, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['Sample_id', 'Sample_label'])
        for sample_id, label in enumerate(y, start=1):
            writer.writerow([str(sample_id), str(label)])

In [42]:
def export_data_log_loss(y, filepath='pred_label.csv'):
    """Write per-class values for every sample to a CSV file.

    The header is ``Sample_id,Class_1,...,Class_N`` where N is the width of
    the first row of ``y`` (10 when ``y`` is empty, matching the header that
    used to be hard-coded). Sample ids are 1-based positions in ``y``.

    Parameters
    ----------
    y : iterable of sequences
        One sequence of per-class values per sample, in sample order. Each
        value is written via ``str()``.
    filepath : str, optional
        Destination path. Defaults to ``'pred_label.csv'``, the name that
        used to be hard-coded. Any existing file is overwritten.
    """
    rows = list(y)
    # Derive the header width from the data so header and rows always agree.
    n_classes = len(rows[0]) if rows else 10
    header = ['Sample_id'] + ['Class_%d' % c for c in range(1, n_classes + 1)]

    # newline='' stops the text layer from translating the '\r\n' that
    # csv.writer emits, which would otherwise add blank rows on Windows.
    with open(filepath, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(header)
        for sample_id, values in enumerate(rows, start=1):
            writer.writerow([str(sample_id)] + [str(v) for v in values])

In [43]:
def get_classifier(x, y):
    """Fit a new default-configured LogisticRegression on (x, y) and return it."""
    model = linear_model.LogisticRegression()
    # fit() returns the estimator itself, so the fitted model is returned.
    return model.fit(x, y)

In [44]:
def classify(reg, x):
    """Return ``reg.predict`` for the single sample ``x``.

    ``x`` is wrapped in a one-element list before the call, so the model
    receives a batch containing exactly this sample.
    """
    single_sample_batch = [x]
    return reg.predict(single_sample_batch)

In [45]:
def calculate_p(reg, x):
    """Return ``reg.predict_proba`` for the single sample ``x``.

    ``x`` is wrapped in a one-element list before the call, so the model
    receives a batch containing exactly this sample.
    """
    single_sample_batch = [x]
    return reg.predict_proba(single_sample_batch)

In [46]:
def filter_data(y, genre):
    """Turn labels into a 0/1 list: 1 where the label equals ``genre``, else 0.

    The result has one entry per element of ``y``, in the same order.
    """
    return [1 if label == genre else 0 for label in y]

In [47]:
nr_genres = 10

def accuracy_test():
    """Train one binary classifier per genre and export hard label predictions.

    Reads 'train_data.csv', 'train_labels.csv' and 'test_data.csv', fits one
    classifier per genre (labels 1..nr_genres, each against all others),
    predicts a label for every test sample and writes the result with
    export_data_accuracy.
    """
    features = import_features('train_data.csv')
    y = import_y('train_labels.csv')
    test_set = import_features('test_data.csv')

    # Scale BEFORE training: the classifiers have to be fitted in the same
    # feature space they later predict in. Fitting on raw features and
    # predicting on standardised ones makes the learned weights meaningless.
    features, test_set = scale_features(features, test_set)

    classifiers = []
    for i in range(1, nr_genres+1):
        filtered = filter_data(y, i)
        classifiers.append(get_classifier(features, filtered))

    y_pred = []
    for sample in test_set:
        # Always compare positive-class probabilities. Mixing hard 0
        # predictions with probabilities made every sample that no model
        # accepted fall back to genre 1 instead of the most likely genre.
        scores = [calculate_p(clf, sample)[0][1] for clf in classifiers]
        # Genres are 1-based, list positions are 0-based.
        y_pred.append(scores.index(max(scores)) + 1)

    export_data_accuracy(y_pred)

In [48]:
def log_loss_test():
    """Train one binary classifier per genre and export per-genre probabilities.

    Reads 'train_data.csv', 'train_labels.csv' and 'test_data.csv', fits one
    classifier per genre (labels 1..nr_genres, each against all others) and
    writes, for every test sample, each classifier's positive-class
    probability with export_data_log_loss.
    """
    features = import_features('train_data.csv')
    y = import_y('train_labels.csv')
    test_set = import_features('test_data.csv')

    # Scale BEFORE training: the classifiers have to be fitted in the same
    # feature space they later predict in. Fitting on raw features and
    # predicting on standardised ones makes the learned weights meaningless.
    features, test_set = scale_features(features, test_set)

    classifiers = []
    for i in range(1, nr_genres+1):
        filtered = filter_data(y, i)
        classifiers.append(get_classifier(features, filtered))

    y_pred = []
    for sample in test_set:
        # NOTE(review): these come from independent binary models, so a row
        # is not guaranteed to sum to 1 - confirm whether the scorer expects
        # normalised probabilities.
        y_pred.append([calculate_p(clf, sample)[0][1] for clf in classifiers])

    export_data_log_loss(y_pred)

In [50]:
# Run the probability-export pipeline end to end; the result is written by
# export_data_log_loss.
log_loss_test()