In [11]:
#!//usr/bin/python3.6

import numpy as np
from sklearn import svm
from sklearn import preprocessing

# Cut point used by splitData(): for every user, the first `splitNum` entries
# go to the training split and the remaining entries go to validation.
splitNum = 405
# Stride used by splitDataMod(): within each user, positions whose index is
# divisible by `splitMod` go to validation, all other positions to training.
splitMod = 10
# Horizontal rule printed between sections of the notebook output.
separator = '=' * 30

In [12]:
# Per-sample arrays keyed by sample id. loadData() fills it and
# interpolateData() replaces each entry in place.
idToData = {}
# Scaler object created by fitScaler(); it stays None until fitScaler() runs,
# and normalize() calls scaler.transform(), so fitScaler() must run first.
scaler = None


def loadTrainLabels(path):
    """Read the label file and return its first two columns.

    The file is parsed as comma-separated integers after skipping one header
    row.

    Args:
        path: Location of the CSV file.

    Returns:
        A tuple ``(ids, labels)`` holding column 0 and column 1 of the file.
    """
    # ndmin=2 keeps the result two-dimensional even when the file holds a
    # single data row; without it loadtxt returns a 1-D array and the column
    # slicing below raises IndexError.
    pairs = np.loadtxt(path, delimiter=',', skiprows=1, dtype=int, ndmin=2)

    return pairs[:, 0], pairs[:, 1]

def sortData():
    """Group the positions of the label array by user label.

    Reads the module-level ``allTrainLabel`` array. Row ``u`` (for u in
    1..20) of the result holds the positions in ``allTrainLabel`` whose label
    equals ``u``; row 0 is never written and stays all zeros. Rows have a
    fixed width of 450, so every label must occur exactly 450 times or the
    row assignment fails.

    Returns:
        Integer array of shape (21, 450).
    """
    positionsByUser = np.zeros((21, 450), dtype=int)

    for user in range(1, 21):
        positionsByUser[user, :] = np.flatnonzero(allTrainLabel == user)

    return positionsByUser

def splitData():
    """Split each user's positions at a fixed cut point.

    Reads the module-level ``indexesOfUser`` and ``splitNum``. For users
    1..20, the first ``splitNum`` entries of the user's row go to training
    and the rest go to validation; the per-user pieces are concatenated in
    user order.

    Returns:
        A tuple ``(trainIndex, validationIndex)`` of 1-D arrays.
    """
    rows = [indexesOfUser[user, :] for user in range(1, 21)]

    # Head of every row -> training, tail of every row -> validation.
    trainIndex = np.concatenate(tuple(row[:splitNum] for row in rows), axis=0)
    validationIndex = np.concatenate(tuple(row[splitNum:] for row in rows), axis=0)
    return trainIndex, validationIndex

def splitDataMod():
    """Split each user's positions by taking every ``splitMod``-th entry out.

    Reads the module-level ``indexesOfUser`` and ``splitMod``. For users
    1..20, entries whose position within the user's row is divisible by
    ``splitMod`` go to validation and all other entries go to training; the
    per-user pieces are concatenated in user order.

    Returns:
        A tuple ``(trainIndex, validationIndex)`` of 1-D arrays.
    """
    trainParts = []
    validationParts = []

    for user in range(1, 21):
        row = indexesOfUser[user, :]
        # True at positions 0, splitMod, 2 * splitMod, ... of this row.
        heldOut = (np.arange(0, row.size) % splitMod) == 0

        trainParts.append(row[~heldOut])
        validationParts.append(row[heldOut])

    trainIndex = np.concatenate(tuple(trainParts), axis=0)
    validationIndex = np.concatenate(tuple(validationParts), axis=0)
    return trainIndex, validationIndex


def loadData(path, ids):
    """Load one CSV file per id into the module-level ``idToData`` dict.

    The file for an id is ``path + str(id) + ".csv"``, so ``path`` must
    already end with a path separator (or whatever prefix the file names
    share). Each file is parsed as comma-separated float64 values. A progress
    message is printed after every 1000 files.

    Args:
        path: Prefix prepended to ``"<id>.csv"``.
        ids: Iterable of sample ids to load.
    """
    for count, sampleId in enumerate(ids, start=1):
        # ndmin=2 keeps a file with a single row two-dimensional; otherwise
        # loadtxt returns a 1-D array and the per-column indexing done in
        # interpolateData() fails.
        idToData[sampleId] = np.loadtxt(
            path + str(sampleId) + ".csv",
            delimiter=',',
            skiprows=0,
            dtype=np.float64,
            ndmin=2,
        )

        if count % 1000 == 0:
            print("loaded", count, "data")
            print("=" * 20, "\n")

    print("loaded data")

            

def interpolateData(ids):
    """Resample every listed sample to exactly 150 rows, in place.

    For each id, the stored array's rows are placed on an evenly spaced grid
    from 1 to 150 and its first three columns are linearly interpolated at
    the integer positions 1..150. The resulting (150, 3) array replaces the
    entry in the module-level ``idToData`` dict.

    Args:
        ids: Iterable of sample ids already present in ``idToData``.
    """
    # The target grid is the same for every sample, so build it once.
    targetGrid = np.arange(1, 151, 1)

    for sampleId in ids:
        original = idToData[sampleId]
        sourceGrid = np.linspace(1, 150, original.shape[0])

        resampledColumns = [
            np.interp(targetGrid, sourceGrid, original[:, column])
            for column in range(0, 3)
        ]
        idToData[sampleId] = np.column_stack(resampledColumns)

    print("interpolized data")
            
            
def getShapedData(ids):
    """Build a 2-D matrix with one flattened sample per row.

    Each id's array from the module-level ``idToData`` dict is flattened in
    row-major (C) order, and the flattened vectors are stacked in the order
    of ``ids``.

    Args:
        ids: Iterable of sample ids present in ``idToData``.

    Returns:
        Array whose row ``i`` is the flattened sample for ``ids[i]``.
    """
    flattened = [np.ravel(idToData[sampleId], order='C') for sampleId in ids]
    return np.stack(flattened, axis=0)
    
    
            
def fitScaler(shapedData):
    """Create a min-max scaler, publish it globally and fit it.

    A new ``preprocessing.MinMaxScaler`` with feature range (0, 1) is stored
    in the module-level ``scaler`` variable and then fitted on
    ``shapedData``.

    Args:
        shapedData: Data passed to the scaler's ``fit`` method.
    """
    global scaler

    minMaxScaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    # Assign the global before fitting, so the global is updated even if
    # fit() raises.
    scaler = minMaxScaler
    minMaxScaler.fit(shapedData)

    print("Fitted the scaler")


def normalize(shapedData):
    """Apply the module-level ``scaler`` to ``shapedData``.

    ``scaler`` must already have been set (fitScaler() does this); while it
    is still None the ``transform`` call fails.

    Args:
        shapedData: Data passed to ``scaler.transform``.

    Returns:
        Whatever ``scaler.transform(shapedData)`` returns.
    """
    scaled = scaler.transform(shapedData)
    print("Normalized data")

    return scaled


def computeProbabilityMatrix():
    """Build per-user bin-frequency tables for each of the 150 rows.

    NOTE(review): this function reads ``numBins``, ``idsOfUser`` and
    ``pointToBin``, none of which are defined in the visible part of the
    notebook, and nothing visible calls it. It raises NameError unless those
    names are provided elsewhere; confirm whether it is still needed.

    For every user 1..20 and every row index 0..149, each of the user's
    samples (looked up in ``idToData``) is mapped to a bin by
    ``pointToBin(sample[row, :])``. Entry ``[row, bin, user]`` of the result
    is the fraction of that user's samples that landed in ``bin``. User
    index 0 is never written and stays zero.

    Returns:
        Float array of shape (150, numBins ** 3, 21).
    """
    totalBins = numBins ** 3
    probMatrix = np.zeros((150, totalBins, 21))

    for user in range(1, 21):
        userIds = idsOfUser[user]
        # Reused across rows; fully overwritten before each counting pass.
        binOfSample = np.zeros(userIds.size, dtype=int)

        for row in range(150):
            for position in range(userIds.size):
                sample = idToData[userIds[position]]
                binOfSample[position] = pointToBin(sample[row, :])

            for binIndex in range(totalBins):
                hits = np.sum(binOfSample == binIndex)
                probMatrix[row, binIndex, user] = hits / (binOfSample.size)

    print("computed probability matrix")
    return probMatrix





# --- Labels and train/validation split --------------------------------------
# Read the (id, label) columns and group row positions by user label.
allTrainId, allTrainLabel = loadTrainLabels("../../data/train_labels.csv")
indexesOfUser = sortData()

# print(allTrainId); print(separator)
# print(allTrainLabel); print(separator)
# print(indexesOfUser); print(separator)

# Per user, positions divisible by splitMod are held out for validation.
trainIndex, validationIndex = splitDataMod()

trainId = allTrainId[trainIndex]
validationId = allTrainId[validationIndex]

trainLabel = allTrainLabel[trainIndex]
validationLabel = allTrainLabel[validationIndex]
# Sanity check: sizes of the two splits.
print(trainIndex.size)
print(validationIndex.size)
# print(separator)
# print(trainIndex)
# print(validationIndex)


# Sample id used below as a spot check of the array shape before and after
# interpolation.
printId = 15065

# --- Load and resample --------------------------------------------------------
# Every training file is loaded into idToData; both splits read from it.
loadData("../../data/train/", allTrainId)
# print(idToData[10003])  # debug
print("shape of ", printId, "is ", idToData[printId].shape)  # shape as loaded
print(separator)

# Replaces each idToData entry with a (150, 3) resampled array.
interpolateData(allTrainId)
# print(idToData[10003])  # debug
print("shape of ", printId, "is ", idToData[printId].shape)  # shape after resampling
# print("data of ", printId, "is ", idToData[printId])
print(separator)

# --- Feature matrices ---------------------------------------------------------
# One flattened sample per row.
shapedTrainData = getShapedData(trainId)
shapedValidationData = getShapedData(validationId)
print(shapedTrainData.shape)
print(shapedValidationData.shape)

# The scaler is fitted on the training split only and then applied to both
# splits.
fitScaler(shapedTrainData)
shapedTrainData = normalize(shapedTrainData)
shapedValidationData = normalize(shapedValidationData)


print(separator)
print("done")


8100
900
loaded 1000 data

loaded 2000 data

loaded 3000 data

loaded 4000 data

loaded 5000 data

loaded 6000 data

loaded 7000 data

loaded 8000 data

loaded 9000 data

loaded data
shape of  15065 is  (149, 3)
interpolized data
shape of  15065 is  (150, 3)
(8100, 450)
(900, 450)
Fitted the scaler
Normalized data
Normalized data
done


In [None]:
# Debug helper: show the normalized feature row of sample `printId`.
# `printId` is only present in `trainId` when it landed in the training split,
# so guard the lookup instead of indexing an empty selection (which raises
# IndexError).
matches = np.flatnonzero(trainId == printId)
if matches.size == 0:
    print("id", printId, "is not in the training split")
else:
    aux = matches[0]
    print("aux =", aux, "trainId[aux] =", trainId[aux])
    print("data of ", printId, "is ", shapedTrainData[aux])
print(separator)

In [13]:
# Validation: fit an RBF-kernel SVM on the training split and print its
# accuracy on the validation split.

print("starting do validation")

def doValidationRBF(C, g):
    """Fit an RBF-kernel SVC on the training split and score it on validation.

    Reads the module-level ``shapedTrainData``, ``trainLabel``,
    ``shapedValidationData``, ``validationLabel`` and ``splitMod``. Prints
    the accuracy together with the hyper-parameters used.

    Args:
        C: Value passed as ``C`` to ``svm.SVC``.
        g: Value passed as ``gamma`` to ``svm.SVC``.

    Returns:
        The fraction of validation samples whose predicted label equals the
        true label. Returning it (instead of only printing it) lets a
        hyper-parameter sweep compare runs programmatically.
    """
    svm_model = svm.SVC(C=C, kernel='rbf', gamma=g)
    svm_model.fit(shapedTrainData, trainLabel)
    predictedLabels = svm_model.predict(shapedValidationData)

    accuracy = (predictedLabels == validationLabel).sum() / validationLabel.size
    print("accuracy of validation = ", accuracy, " where splitMod =", splitMod)
    print("C = %s, kernel = %s, gamma = %s" % (C, "rbf", g))

    return accuracy

# Earlier hyper-parameter sweeps, kept commented out for reference. Each
# variant defines candidate C and gamma values for the loop further down.
# Cs = np.array([1e-10, 1e-9, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0, 1, 10])
# Cs = np.array([1e-8, 1e-7, 1e-6, 1e-3, 1e-1, 1, 10][::-1])
# Cs = np.array([1e-8, 1e-7, 1e-6, 1e-3, 1e-1, 1, 10])
# Gammas = ['scale', 'auto', 1e-5, 1e-1, 10]

# Cs = np.logspace(-2, 10, 4)
# Cs = [1.0, 10]
# Gammas = np.logspace(-6, 3, 10)
# Gammas = ['scale', 'auto'] + list(Gammas)

# Grid loop over every (C, gamma) pair, also disabled:
# print("Cs =", Cs)
# print("Gammas =", Gammas)
# for c in Cs:
#     for g in Gammas:
#         doValidationRBF(c, g)

# Single validation run with C=10 and gamma='scale'.
doValidationRBF(10, 'scale')
    
print("done")


starting do validation
accuracy of validation =  0.9377777777777778  where splitMod = 10
C = 10, kernel = rbf, gamma = scale
done


In [14]:
# for predicting on test data:
# Test ids are the ids in 10001..24000 that do not occur in allTrainId.
# Indexing the mask with allTrainId requires every training id to be <= 24000.
mask = np.ones(24000 + 1, dtype=bool)
mask[allTrainId] = False
mask = mask[10001 : 24000 + 1]
testId = np.arange(10001, 24000 + 1)[mask]

print(testId)



# Same preparation as for the training data; the test samples are added to the
# shared idToData dict.
loadData("../../data/test/", testId)

interpolateData(testId)
shapedTestData = getShapedData(testId)

# NOTE(review): this prints the shape of the *training* matrix;
# shapedTestData.shape may have been intended - confirm.
print(shapedTrainData.shape)

# Uses the scaler that was fitted on the training split.
shapedTestData = normalize(shapedTestData)



# svm_model = svm.SVC(svm_C, svm_kernel)
# NOTE(review): the model is fitted on the training split only; the validation
# samples are not used here - confirm this is intended.
svm_model = svm.SVC(C=10, kernel='rbf', gamma='scale')
svm_model.fit(shapedTrainData, trainLabel)
predictedLabels = svm_model.predict(shapedTestData)

print("predicted the labels as:")
print(predictedLabels)

# Write two comma-separated columns (test id, predicted label) under an
# "id,class" header line; comments='' keeps the header free of a prefix.
np.savetxt("result.csv", np.stack((testId, predictedLabels)).T, fmt="%s", delimiter=',', header="id,class", comments='')
print("done with writing")

[10001 10002 10004 ... 23992 23998 24000]
loaded 1000 data

loaded 2000 data

loaded 3000 data

loaded 4000 data

loaded 5000 data

loaded data
interpolized data
(8100, 450)
Normalized data
predicted the labels as:
[ 3 10  5 ... 10  5  1]
done with writing


In [110]:
# for testing stuff

import numpy as np

# Element-wise comparison yields a boolean array.
a = np.arange(1, 5)
print(np.mod(a, 2) == 0)

# Indexing with a boolean array keeps the entries where the mask is True.
b = np.arange(1, 5)
print(b[np.array([True, True, False, False])])

# Five evenly spaced values from 1 to 10, both ends included.
print(np.linspace(1, 10, num=5))

# Element-wise negation of a boolean array.
print(np.logical_not(np.array([True, False, True, False, False, False])))

[False  True False  True]
[1 2]
[ 1.    3.25  5.5   7.75 10.  ]
[False  True False  True  True  True]
