## Dataset generating part

In [1]:
import numpy as np
from scipy.sparse import csr_matrix, csc_matrix, coo_matrix
import itertools
from sklearn import preprocessing
NUMBER_OF_PARAMETERS = 100000

In [2]:
file = 'data/HW04_data/train.dat.txt'
file_test = 'data/HW04_data/test.dat.txt'

In [3]:
# dataset = np.zeros((800,100001))
# label_data = []
# with open(file, 'r') as csv:   
#     i = 0
#     for row in csv:
#         row_list = row.split('\t')
#         label_data.append(row_list[0])
# #         trainset[i].append(row_list[1]);
#         for item in row_list[1].split():
#             dataset[i][int(item)] = 1
#         i = i + 1

In [4]:
# prepare train data
dataset = list()
labels = list()
pos_set = list()
pos_labels = list()
neg_set = list()
neg_labels = list()


with open(file, 'r') as csv:   
    for row in csv:
        row_list = row.split('\t')
        dataset.append(row_list[1])
        labels.append(row_list[0])
        if int(row_list[0]) == 1:
            pos_set.append(row_list[1])
            pos_labels.append(row_list[0])
        else:
            neg_set.append(row_list[1])
            neg_labels.append(row_list[0])
        
#         dataset[i].append(row_list[1]);

In [5]:
# prepare test data
test_real = list()
with open(file_test, 'r') as csv_test:   
    for row in csv_test:
        row_test_list = row.split('\t')
        test_real.append(row_test_list[0])

print len(test_real)


350


In [6]:
def build_csc(lists):
    i = 0
    param_lists = []
    row_lists = []
    value_lists = []
    for list in lists:
        list = list.strip()
        params = [int(n) for n in list.split(' ')] # list with string to list of nums, index is +1
        row = [i] * len(params)
        value = [True] * len(params)
        param_lists.append(params)
        row_lists.append(row)
        value_lists.append(value)
        i += 1
    coo = create_coo(param_lists, row_lists, value_lists, i)
    return csc_matrix(coo)
def create_coo(param_lists, row_lists, value_lists, num_rows):
    # in create COO
    flattened_params = np.array(list(itertools.chain.from_iterable(param_lists)))
    flattened_rows = np.array(list(itertools.chain.from_iterable(row_lists)))
    flattened_values = np.array(list(itertools.chain.from_iterable(value_lists)))
    sparse_coo = coo_matrix((flattened_values, (flattened_rows, flattened_params)), #three 1D lists
                            shape=(num_rows, NUMBER_OF_PARAMETERS+1), #size of matrix, +1 bc of indexing,
                            dtype=np.bool)  # creates a boolean compressed sparse row matrix

    return sparse_coo

In [7]:
dataset_csc = build_csc(dataset)
pos_csc = build_csc(pos_set)
neg_csc = build_csc(neg_set)
test_real_csc = build_csc(test_real)

In [8]:
dataset_csr = csc_matrix.tocsr(dataset_csc)
# print dataset_csr
print dataset_csr.shape
pos_csr = csc_matrix.tocsr(pos_csc)
print pos_csr.shape
neg_csr = csc_matrix.tocsr(neg_csc)
print neg_csr.shape
test_real_csr = csc_matrix(test_real_csc)
print test_real_csr.shape

(800, 100001)
(78, 100001)
(722, 100001)
(350, 100001)


## feature selection and Dimention reduction

In [9]:
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix
total_params =100001


In [73]:
for k in range(78,80):
    svd = TruncatedSVD(n_components=k, n_iter=7, random_state=42)
    svd.fit(pos_csc)
    print k, svd.explained_variance_ratio_.sum()

78 1.0
79 1.0


In [9]:
print pos_csr.shape

(78, 100001)


In [80]:
for k in range(750,900):
    svd = TruncatedSVD(n_components=k, n_iter=7, random_state=42)
    svd.fit(dataset_csr)
    print k, svd.explained_variance_ratio_.sum()

750 0.957765491379


KeyboardInterrupt: 

In [11]:
## DR on full set 
svd = TruncatedSVD(n_components=750, n_iter=7, random_state=42)
Reduced_data = svd.fit_transform(dataset_csr)
print 'done!'
print Reduced_data.shape

In [23]:
## DR only on pos data 

svd_pos = TruncatedSVD(algorithm='randomized', n_components=20, n_iter=7, random_state=42)
svd_pos_mod = svd_pos.fit(pos_csr, pos_labels)
X_svdOnPos = svd_pos_mod.transform(dataset_csr)
print 'done!'
print X_svdOnPos.shape

done!
(800, 20)


In [11]:
svd_pos_mod.explained_variance_ratio_.sum()

0.51778034104955351

In [10]:
# # split to train and test:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(Reduced_data, labels, test_size=0.3, random_state=0)

# print X_train.shape 



newlab = np.asarray(labels)
print type(newlab)

intlabels = map(int, newlab)


<type 'numpy.ndarray'>


In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn import svm
Nfold = 5
random_state = np.random.RandomState(0)
cv = StratifiedKFold(n_splits=Nfold, shuffle=True)
# X = Reduced_data
# X = dataset_csr
X = X_svdOnPos
y = newlab
f1_average = 0
for train_index, test_index in cv.split(X, y):
#     print train_index, test_index
    X_train, X_test = X[train_index], X[test_index]
#     print X_train.shape
    y_train, y_test = y[train_index], y[test_index]
#     print len(y_train)
    y_predict = svm.SVC(kernel='linear', C=1, probability=True, random_state=random_state).fit(X_train, y_train).predict(X_test)
    f1_average = f1_average + f1_score(y_test, y_predict, average=None)
print f1_average/Nfold   


[ 0.98490233  0.84241726]


In [38]:
from sklearn.model_selection import cross_val_score
svm_model = svm.SVC(kernel='linear', C=1000)
cross_val_score(svm_model, X_svdOnPos, intlabels, cv=6, scoring='f1').mean()

0.86207729468599037

In [124]:
from sklearn.neighbors import KNeighborsClassifier
# for i in range(1,100):
neigh = KNeighborsClassifier(n_neighbors=3)
print cross_val_score(neigh, X_svdOnPos, intlabels, cv=6, scoring='f1').mean()

0.0


### for testing 

In [119]:
# svm_model = svm.SVC(kernel='linear', C=1000)
# for i in range(15, 30):
    
#     svd_pos = TruncatedSVD(algorithm='randomized', n_components=i, n_iter=7, random_state=42)
#     svd_pos_mod = svd_pos.fit(pos_csr, pos_labels)
#     X_svdOnPos = svd_pos_mod.transform(dataset_csr)
#     print i, cross_val_score(svm_model, X_svdOnPos, intlabels, cv=10, scoring='f1').mean()
# print "done"


svd_pos = TruncatedSVD(algorithm='randomized', n_components=750, n_iter=7, random_state=42)
svd_pos_mod = svd_pos.fit(dataset_csr, labels)
X_svdOnPos = svd_pos_mod.transform(dataset_csr)
print 'done!'
print X_svdOnPos.shape

done!
(800, 750)


In [24]:
from sklearn.neural_network import MLPClassifier
# for i in range(5,20):
    
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(19,5), random_state=1)
print cross_val_score(clf, X_svdOnPos, intlabels, cv=10, scoring='f1').mean()


0.900784313725


In [170]:
from sklearn.neighbors import KNeighborsClassifier
# for i in range(1,100):
neigh = KNeighborsClassifier(n_neighbors=5)
print cross_val_score(neigh, X_svdOnPos, intlabels, cv=10, scoring='f1').mean()

0.0


In [47]:
#25, 9 = 0.684 but 0.77 on test
# for i in range(1, 200, 1):
    
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                 hidden_layer_sizes=(19, 5), random_state=1)
print i, cross_val_score(clf, X, intlabels, cv=10, scoring='f1').mean()


 75 0.679199346405


In [20]:
### chi2 with 359, NN with 19, 5 get 0.80000
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X = X_svdOnPos
y = newlab
for i in range(1):
    
    ch2_model = SelectKBest(chi2, k=747).fit(dataset_csr, y)
    X_chi2 = ch2_model.transform(dataset_csr)
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                 hidden_layer_sizes=(19, 5), random_state=1)
#     y_predict = clf.fit(X, y).predict(test_real_OnPos)
    print i, cross_val_score(clf, X_chi2, intlabels, cv=10, scoring='f1').mean()
# ch2_model.transform(x)


0 0.684577138548


(800, 250)


0 0.684377128719



### Test on real test data 


In [11]:
# ## DR only based on full data 
svd_pos = TruncatedSVD(algorithm='randomized', n_components=800, n_iter=7, random_state=42)
svd_pos_mod = svd_pos.fit(dataset_csr, labels)
X_svdOnPos = svd_pos_mod.transform(dataset_csr)
test_real_OnPos = svd_pos_mod.transform(test_real_csr)
print 'done!'
print X_svdOnPos.shape
print test_real_OnPos.shape

done!
(800, 800)
(350, 800)


In [25]:
## DR only on pos data 
## good on validation but bad clp result 
svd_pos = TruncatedSVD(algorithm='randomized', n_components=20, n_iter=7, random_state=42)
svd_pos_mod = svd_pos.fit(pos_csr, pos_labels)
X_svdOnPos = svd_pos_mod.transform(dataset_csr)
test_real_OnPos = svd_pos_mod.transform(test_real_csr)
print 'done!'
print X_svdOnPos.shape
print test_real_OnPos.shape

done!
(800, 20)
(350, 20)


In [12]:
newlab = np.asarray(labels)  
print type(newlab)
intlabels = map(int, newlab)

<type 'numpy.ndarray'>


In [13]:
# best with svd 800, NN 19, 5, and got 0.80000
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
Nfold = 5
random_state = np.random.RandomState(0)
cv = StratifiedKFold(n_splits=Nfold, shuffle=True)
# X = Reduced_data
# X = dataset_csr
X = X_svdOnPos
y = newlab
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(19, 5), random_state=1)
y_predict = clf.fit(X, y).predict(test_real_OnPos)

# y_predict = svm.SVC(kernel='linear', C=100).fit(X, y).predict(test_real_OnPos)

# svm_model = svm.SVC(kernel='linear', C=100)
print cross_val_score(clf, X, intlabels, cv=10, scoring='f1').mean()
print y_predict



0.679199346405
['0' '0' '1' '0' '0' '0' '0' '1' '0' '0' '0' '1' '0' '1' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '1' '1' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '1' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0'
 '0' '1' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '1' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0'

In [14]:
# print result to text file
text_file = open("data/HW04_data/output.dat.txt", "w")


for i in y_predict:
    
    text_file.write(i+'\n')
text_file.close()