In [24]:
import numpy as np
from matplotlib import pyplot as plt
from numpy import genfromtxt
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

from scipy.sparse import csr_matrix
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn import svm

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split

%matplotlib inline

In [25]:
def load_data(filename, skiprows = 1, delimiter = ' '):
    """
    Load a delimited numeric text file and return it as a numpy ndarray.

    Inputs:
        filename: path of the file to read, given as a string.
        skiprows: number of leading lines to skip before parsing
            (default 1, e.g. to step over one header line).
        delimiter: string that separates the values on each line
            (default ' ', the single space this function always used).

    Outputs:
        Data contained in the file, returned as a numpy ndarray.
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=delimiter)

# Getting training data

In [26]:
# load_data already returns an ndarray, so no extra np.array(...) copy is needed.
training_data = load_data("training_data.txt", skiprows = 1)

In [27]:
# Column 0 holds the labels; every remaining column is a feature.
y_train, X_train = training_data[:, 0], training_data[:, 1:]


In [28]:
# load_data already returns an ndarray, so no extra np.array(...) copy is needed.
# The test file is used whole as the feature matrix (no column is split off).
test_data = load_data("test_data.txt")
X_test = test_data


In [7]:
# NOTE(review): `log_reg` is not defined in any visible cell (the recorded
# output of this cell is a NameError) and `X_test_slim` is only created in the
# Lasso cell further down; both must exist before this cell can run.
predicted_labels = np.array(log_reg.predict(X_test_slim)).astype(int)
# Map the label -1 to 0 before writing the predictions out.
predicted_labels[predicted_labels==-1] = 0

# 1-based ids sized from the predictions instead of a hard-coded 10000 rows.
list_nums = list(range(1, len(predicted_labels) + 1))
output_vector = np.column_stack((list_nums, predicted_labels))
# comments='' keeps np.savetxt from prefixing the header line with "# ".
np.savetxt("predicted_labels.txt", output_vector, fmt='%i', delimiter=',', header="Id,Prediction", comments='')
output_vector

NameError: name 'log_reg' is not defined

# Running Lasso to Extract Features

In [64]:
# Running Lasso to Extract Features

# Build the regression targets (label 0 -> -1) in a NEW array. A plain
# `y_regression_train = y_train` is only an alias, so the masked assignment
# that used to follow rewrote y_train (and the training_data view behind it)
# in place, making later cells depend on whether this cell had been run.
y_regression_train = np.where(y_train == 0, -1, y_train)

# L1 regularisation strength for the Lasso fit.
streng = 0.0002
lasso = Lasso(alpha = streng)
lasso.fit(X_train, y_regression_train)
# Sign of the regression output, compared with the -1/+1 targets in a later cell.
y_pred = np.sign(lasso.predict(X_train))

# np.nonzero returns a 1-tuple for the 1-D coefficient vector; element 0 is the
# array of column indices whose coefficient is non-zero.
non_zero_inds = np.nonzero(lasso.coef_)
X_slim = X_train[:, non_zero_inds[0]]

#Changing test set
X_test_slim = X_test[:, non_zero_inds[0]]

In [65]:
# Show (rows, columns) of the matrix kept after Lasso column selection.
X_slim.shape

(20000, 885)

In [66]:
# Boolean mask of the training rows where the signed Lasso output
# disagrees with the regression target; display those rows.
indices = np.not_equal(y_pred, y_regression_train)
X_train[indices]

array([[ 3.,  0.,  0., ...,  0.,  0.,  0.],
       [ 2.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  1., ...,  0.,  0.,  0.],
       [ 1.,  0.,  3., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.]])

In [75]:
#Splitting Data
# 90/10 shuffled hold-out split of the reduced feature matrix. random_state is
# fixed so that re-running the notebook reproduces the same split (and errors).
X_training, X_testing, y_training, y_testing = train_test_split(X_slim, y_train, test_size=0.1, shuffle = True, random_state = 0)

In [76]:
# Making matrix sparse
# CSR versions of the full training matrix and of the column-reduced one ...
X_sparse, X_slim_sparse = csr_matrix(X_train), csr_matrix(X_slim)
# ... and of the two halves of the hold-out split.
X_train_sparse, X_test_sparse = csr_matrix(X_training), csr_matrix(X_testing)

In [77]:
# Linear-kernel support vector classifier with regularisation parameter C = 0.4.
clf = svm.SVC(kernel='linear', C=0.4)

In [81]:
# Fit on every training row with the full feature set (X_sparse wraps X_train).
# NOTE(review): the evaluation cells after this one predict on X_train_sparse /
# X_test_sparse, which are built from the X_slim split -- confirm the feature
# widths match. The hold-out rows are also part of this fit, so the "test"
# error computed from them is not an out-of-sample estimate.
clf.fit(X_sparse, y_train)

SVC(C=0.4, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [86]:
# Error rate of the current classifier on the training part of the split.
# NOTE(review): binary_clf_error is defined in a cell further down the
# notebook; on a fresh kernel that cell has to run before this one.
train_predictions = clf.predict(X_train_sparse)
binary_clf_error(train_predictions, y_training)

0.12

In [80]:
# Error rate on the held-out part of the split. Predict once and reuse the
# result instead of running clf.predict on the same matrix a second time.
test_predictions = clf.predict(X_test_sparse)
binary_clf_error(test_predictions, y_testing)



0.14749999999999999

In [52]:
# Inspect the label vector.
y_train

array([ 1.,  1.,  1., ...,  1.,  1.,  0.])

In [50]:
# Running SVM on X_slim with different validation sets
# Collects the train / hold-out error of a linear SVM (C=0.3) over five
# independent 90/10 splits of the reduced feature matrix.
train_list = []
test_list = []
for i in range(5):
    # random_state=i gives five different splits that are the same on every run.
    X_training, X_testing, y_training, y_testing = train_test_split(X_slim, y_train, test_size=0.1, shuffle = True, random_state = i)

    X_train_sparse = csr_matrix(X_training)
    X_test_sparse = csr_matrix(X_testing)

    clf = svm.SVC(C=0.3, kernel = 'linear')
    clf.fit(X_train_sparse, y_training)

    train_predictions = clf.predict(X_train_sparse)
    train_error = binary_clf_error(train_predictions, y_training)

    # Predict once and reuse it; the original ran clf.predict twice here.
    test_predictions = clf.predict(X_test_sparse)
    test_error = binary_clf_error(test_predictions, y_testing)

    print (train_error, test_error)
    train_list.append(train_error)
    test_list.append(test_error)


0.124222222222 0.138
0.123444444444 0.138
0.122444444444 0.1505
0.122666666667 0.148
0.123944444444 0.1425


In [52]:
# One row per split: column 0 = training error, column 1 = hold-out error.
np.column_stack((train_list, test_list))

array([[ 0.12422222,  0.138     ],
       [ 0.12344444,  0.138     ],
       [ 0.12244444,  0.1505    ],
       [ 0.12266667,  0.148     ],
       [ 0.12394444,  0.1425    ]])

In [53]:
# Average hold-out error over the splits collected in test_list.
np.mean(test_list)

0.1434

In [8]:
def binary_clf_error(y_pred, y_correct):
    '''
    Compute the misclassification rate of a set of predictions.

    Inputs:
        y_pred: predicted labels (numpy array or any array-like, e.g. a list).
        y_correct: true labels, same shape as y_pred.

    Outputs:
        Number of positions where y_pred differs from y_correct, divided by
        len(y_pred).

    Raises:
        ValueError: if the two inputs do not have the same shape.
    '''
    # Convert first so plain Python lists are compared element-wise too
    # (list != list is a single bool, which has no .sum()).
    y_pred = np.asarray(y_pred)
    y_correct = np.asarray(y_correct)
    # Refuse mismatched inputs rather than letting numpy broadcast or collapse
    # the comparison into a misleading result.
    if y_pred.shape != y_correct.shape:
        raise ValueError(
            "y_pred and y_correct must have the same shape, got %s and %s"
            % (y_pred.shape, y_correct.shape))
    binarytrain_number = (y_pred != y_correct).sum()
    return binarytrain_number/len(y_pred)

In [48]:
# Sweep the SVM regularisation parameter C over 0.1 ... 1.0 (10 values) on the
# current train / hold-out split and record both error rates for each value.
training_error = []
validation_error  = []
c_values = np.linspace(0.1, 1, num=10)
for c_value in c_values:
    clf = svm.SVC(C=c_value, kernel = 'linear')
    clf.fit(X_train_sparse, y_training)

    train_predictions = clf.predict(X_train_sparse)
    train_error = binary_clf_error(train_predictions, y_training)
    # Predict once and reuse it; the original ran clf.predict twice here.
    test_predictions = clf.predict(X_test_sparse)
    test_error = binary_clf_error(test_predictions, y_testing)

    print(train_error, test_error)
    training_error.append(train_error)
    validation_error.append(test_error)




0.123833333333 0.146
0.122444444444 0.148
0.122277777778 0.147
0.121888888889 0.15
0.122222222222 0.1505
0.121944444444 0.1505
0.121888888889 0.152
0.121833333333 0.152
0.122 0.152
0.122166666667 0.1515


In [49]:
# One row per C value: columns are C, training error, validation error.
np.column_stack((c_values, training_error, validation_error))

array([[ 0.1       ,  0.12383333,  0.146     ],
       [ 0.2       ,  0.12244444,  0.148     ],
       [ 0.3       ,  0.12227778,  0.147     ],
       [ 0.4       ,  0.12188889,  0.15      ],
       [ 0.5       ,  0.12222222,  0.1505    ],
       [ 0.6       ,  0.12194444,  0.1505    ],
       [ 0.7       ,  0.12188889,  0.152     ],
       [ 0.8       ,  0.12183333,  0.152     ],
       [ 0.9       ,  0.122     ,  0.152     ],
       [ 1.        ,  0.12216667,  0.1515    ]])

In [20]:
# Leftover interactive help lookup (IPython `?` syntax); produces no result used elsewhere.
np.linspace?

In [82]:
# Predicting on Test.
# CSR copy of the full-width test matrix.
# NOTE(review): this wraps X_test, not the column-reduced X_test_slim -- confirm
# that the `clf` used for prediction was fitted on the full feature set.
Xtest_sparse = csr_matrix(X_test)

In [83]:
# Predict labels for the test file with whatever model `clf` is currently bound to.
# NOTE(review): `clf` is rebound in several cells of this notebook; verify which
# fit is live when this runs.
realtest_predictions = clf.predict(Xtest_sparse)

In [84]:
# Cast the predicted labels to integers so they can be written with fmt='%i'.
realtest_predictions = realtest_predictions.astype(int)

In [85]:
# Map the label -1 to 0 (in place) before writing the predictions out.
realtest_predictions[realtest_predictions==-1] = 0

In [87]:
# Build the two-column (Id, Prediction) table and write it out as CSV.
# 1-based ids sized from the predictions instead of a hard-coded 10000 rows,
# so the column lengths always agree.
list_nums = list(range(1, len(realtest_predictions) + 1))
output_vector = np.column_stack((list_nums, realtest_predictions))
# comments='' keeps np.savetxt from prefixing the header line with "# ".
np.savetxt("predicted_labels_svm.txt", output_vector, fmt='%i', delimiter=',', header="Id,Prediction", comments='')
output_vector

array([[    1,     1],
       [    2,     1],
       [    3,     0],
       ..., 
       [ 9998,     0],
       [ 9999,     1],
       [10000,     0]])

In [32]:
# Inspect the predicted label vector.
realtest_predictions

array([1, 1, 0, ..., 0, 1, 0])