# Task 1

In [22]:
import pandas as pd
import numpy as np

names = ['f{0}'.format(i) for i in range(15)]

dtype = {'f1': np.str }

train_data = pd.read_csv('train.data',
                         header = len(names) + 1,
                         names = names + ['label'],
                         skipinitialspace = True,
                         na_values = ['?'],
                         dtype = dtype)
test_data = pd.read_csv('test.data',
                        header = len(names),
                        names = names[:len(names)],
                        skipinitialspace = True,
                        na_values = ['?'],
                        dtype = dtype)
test_labels = pd.read_csv('test.lab', header = 1, names = ['label'], skipinitialspace = True).label
train_labels = train_data.label

train_data.drop('label', axis = 1, inplace = True)

train_data.f1 = train_data.f1.astype(np.float)
train_data.f1 = test_data.f1.astype(np.float)

if train_data.isnull().any().any():
    print('Data contains missing values.\n')
else:
    print('Data does not contain missing values.\n')

missing_training_data_size = train_data[pd.isnull(train_data).any(axis=1)].shape[0]

print('{0}% data rows have missing values in training set'.format(missing_training_data_size / train_data.shape[0]))

missing_test_data_size = test_data[pd.isnull(test_data).any(axis=1)].shape[0]

print('{0}% data rows have missing values in test set'.format(missing_test_data_size / test_data.shape[0]))

class_one_examples_number = train_labels[train_labels == '+'].shape[0]
class_two_examples_number = train_labels[train_labels == '-'].shape[0]

print('Ratio between class "+" samples and class "-" samples in the training set is {0}'.format(class_one_examples_number / class_two_examples_number))

Data contains missing values.

0.33246753246753247% data rows have missing values in training set
0.058823529411764705% data rows have missing values in test set
Ratio between class "+" samples and class "-" samples in the training set is 0.6960352422907489


$f_0$ has 2 possible values 'b' and 'a'.

$f_1$ has values between 1517 and 8025.

$f_2$ has values between 0 and 25.085000000000001.

$f_3$ has 3 possible values 'y', 'l' and 'u'.

$f_4$ has 3 possible values 'p', 'g' and 'gg'.

$f_5$ has 14 possible values 'cc', 'ff', 'c', 'i', 'q', 'w', 'm', 'd', 'e', 'aa', 'j', 'x', 'k' and 'r'.

$f_6$ has 9 possible values 'v', 'ff', 'h', 'bb', 'j', 'z', 'o', 'n' and 'dd'.

$f_7$ has values between 0 and 20.

$f_8$ has 2 values 't' and 'f'.

$f_9$ has 2 values 't' and 'f'

$f_{10}$ has values between 0 and 20.

$f_{11}$ has 2 values 't' and 'f'

$f_{12}$ has 2 values 'g', 's' and 'p'

$f_{13}$ has values between 0 and 0.00020000000000000001.

$f_{14}$ has values between 0 and 100000.

Data has 385 samples. Features $f_1, f_2, f_7, f_{10}, f_{13}, f_{14}$ are continous. Features $f_0, f_3, f_4, f_5, f_6, f_8, f_9, f_{11}, f_{12}$ are categorical. Problem has two classes. We can say that classes is more or less balanced because the ratio 0.69 is more or less close to 50 % that means that we have the half samples for class 1 and the same amount for class 2.

# Problem 2

In [2]:
import sklearn as skl
from sklearn import preprocessing
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

DataFrameImputer().fit(train_data)
train_data_nona = DataFrameImputer().fit_transform(train_data)
test_data_nona = DataFrameImputer().fit_transform(test_data)

All the missing values were transformed with the most frequent for object fields and mean for numeric fields

# Problem 3

In [30]:
from sklearn.preprocessing import OneHotEncoder

le = preprocessing.LabelEncoder()

categorical_features = [0, 3, 4, 5, 6, 8, 9, 11, 12]

for feature in categorical_features:
    train_data_nona[names[feature]] = le.fit_transform(train_data_nona[names[feature]])
    test_data_nona[names[feature]] = le.fit_transform(test_data_nona[names[feature]])

train_labels = le.fit_transform(train_labels)
test_labels = le.fit_transform(test_labels)

enc = OneHotEncoder(categorical_features = categorical_features, sparse = False)
enc.fit(train_data_nona)
transformed_train = enc.transform(train_data_nona)
transformed_test = enc.transform(test_data_nona)

First I have changed all the lablings for categorical data and the applied One Hot Encoding to it.

# Problem 4

In [4]:
from sklearn import tree
from sklearn import cross_validation

def estimate_accuracy(clf, train_data, train_labels):
    scores = cross_validation.cross_val_score(clf, train_data, train_labels, cv = 5)

    print('Accuracy: {0} % (+/- {1})'.format(scores.mean(), scores.std() * 2))
    
    
clf = tree.DecisionTreeClassifier()
estimate_accuracy(clf, transformed_train, train_labels)

clf = tree.DecisionTreeClassifier(criterion='entropy')
estimate_accuracy(clf, transformed_train, train_labels)

clf = tree.DecisionTreeClassifier(max_features = 1)
estimate_accuracy(clf, transformed_train, train_labels)

clf = tree.DecisionTreeClassifier(max_depth = 1)
estimate_accuracy(clf, transformed_train, train_labels)

Accuracy: 0.7717992533782007 % (+/- 0.15416044072907314)
Accuracy: 0.8130606235869393 % (+/- 0.15164466122114262)
Accuracy: 0.724774348458559 % (+/- 0.100326783323681)
Accuracy: 0.8545209176788123 % (+/- 0.2746649865019716)


The best accuracy was reached when the depth of the tree was 1. But in that configuration standard deviation was very large. Also, it is interesting that  information gain works better on the given dataset. Obviously, decreasing amount of features to use reduces the accuracy.

# Problem 5

In [5]:
from sklearn import grid_search

tuned_parameters = [{'criterion': ['entropy', 'gini'],
                     'max_features': [None, 1, 10, 20, 30, 46, 'auto', 'sqrt', 'log2'],
                     'max_depth': [1, 10, 100, 1000],
                     'presort' : [True, False]
                    }]

clf = grid_search.GridSearchCV(tree.DecisionTreeClassifier(), tuned_parameters, cv = 5)
timing = %timeit -o clf.fit(transformed_train, train_labels)
params, mean, scores = clf.grid_scores_[0]
print('The best score is {1} (+- {2}) with parameters {0} .'.format(params, mean, scores.std() * 2))
print('Grid search took about {0} s.'.format(timing.best))

1 loops, best of 3: 1.57 s per loop
The best score is 0.8545454545454545 (+- 0.2746649865019716) with parameters {'max_depth': 1, 'max_features': None, 'presort': True, 'criterion': 'entropy'} .
Grid search took about 1.5666889720014296 s.


# Problem 6

In [6]:
from sklearn import svm

clf = svm.SVC()
estimate_accuracy(clf, transformed_train, train_labels)

X_normalized = preprocessing.normalize(transformed_train, norm = 'l2')
clf = svm.SVC()
estimate_accuracy(clf, X_normalized, train_labels)

Accuracy: 0.5717081164449586 % (+/- 0.049514617088601505)
Accuracy: 0.6129344339870656 % (+/- 0.021633371232247595)


We can see, that normalization improves svm classification score.

# Problem 7

In [7]:
tuned_parameters = [{'C': [0.1, 0.2, 0.5],
                     'kernel': ['rbf', 'linear', 'sigmoid']
                    },
                    {'C': [0.1, 0.2, 0.5],
                     'kernel': ['poly'],
                     'degree': [1, 2, 3, 4, 5]
                    }]

clf = grid_search.GridSearchCV(svm.SVC(), tuned_parameters, cv = 5)
clf.fit(X_normalized, train_labels)
params, mean, scores = clf.grid_scores_[0]
print('The best score is {1} (+- {2}) with parameters {0} .'.format(params, mean, scores.std() * 2))

The best score is 0.5896103896103896 (+- 0.00561921852408285) with parameters {'C': 0.1, 'kernel': 'rbf'} .


# Problem 8

In [8]:
from sklearn import linear_model
from sklearn import neighbors

svm_tuned_parameters = [{'C': [0.1, 0.2, 0.5],
                         'kernel': ['rbf', 'linear', 'sigmoid']
                        },
                        {'C': [0.1, 0.2, 0.5],
                         'kernel': ['poly'],
                         'degree': [1, 2, 3, 4, 5]
                        }]

dt_tuned_parameters = [{'criterion': ['entropy', 'gini'],
                         'max_features': [None, 1, 10, 20, 30, 46, 'auto', 'sqrt', 'log2'],
                         'max_depth': [1, 10, 100, 1000],
                         'presort' : [True, False]
                        }]

lr_tuned_parameters = [{'penalty': ['l1', 'l2'],
                        'C': [0.01, 0.1, 1, 0.5]
                        }]

knn_tuned_parameters = [{'n_neighbors': [1, 5, 30],
                         'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                         'p': [1, 2, 15]
                         }]

dt_clf = tree.DecisionTreeClassifier()
clf = grid_search.GridSearchCV(dt_clf, dt_tuned_parameters, cv = 5)
timing = %timeit -o clf.fit(X_normalized, train_labels)
params, mean, scores = clf.grid_scores_[0]
print('The best score Decision Tree is {1} (+- {2}) with parameters {0} .'.format(params, mean, scores.std() * 2))
print('Grid search Decision Tree took about {0} s.\n'.format(timing.best))

svm_clf = svm.SVC()
clf = grid_search.GridSearchCV(svm_clf, svm_tuned_parameters, cv = 5)
timing = %timeit -o clf.fit(X_normalized, train_labels)
params, mean, scores = clf.grid_scores_[0]
print('The best score SVM is {1} (+- {2}) with parameters {0} .'.format(params, mean, scores.std() * 2))
print('Grid search SVM took about {0} s.\n'.format(timing.best))

lr_clf = linear_model.LogisticRegression()
clf = grid_search.GridSearchCV(lr_clf, lr_tuned_parameters, cv = 5)
timing = %timeit -o clf.fit(transformed_train, train_labels)
params, mean, scores = clf.grid_scores_[0]
print('The best score Logistic Regression is {1} (+- {2}) with parameters {0} .'.format(params, mean, scores.std() * 2))
print('Grid search Logistic Regression took about {0} s.\n'.format(timing.best))

knn_clf = neighbors.KNeighborsClassifier()
clf = grid_search.GridSearchCV(knn_clf, knn_tuned_parameters, cv = 5)
timing = %timeit -o clf.fit(X_normalized, train_labels)
params, mean, scores = clf.grid_scores_[0]
print('The best score KNN is {1} (+- {2}) with parameters {0} .'.format(params, mean, scores.std() * 2))
print('Grid search KNN took about {0} s.\n'.format(timing.best))

1 loops, best of 3: 1.65 s per loop
The best score Decision Tree is 0.8571428571428571 (+- 0.26878092291129063) with parameters {'max_depth': 1, 'max_features': None, 'presort': True, 'criterion': 'entropy'} .
Grid search Decision Tree took about 1.6524755480349995 s.

1 loops, best of 3: 902 ms per loop
The best score SVM is 0.5896103896103896 (+- 0.00561921852408285) with parameters {'C': 0.1, 'kernel': 'rbf'} .
Grid search SVM took about 0.9015726610086858 s.

10 loops, best of 3: 120 ms per loop
The best score Logistic Regression is 0.7168831168831169 (+- 0.16004900824270843) with parameters {'C': 0.01, 'penalty': 'l1'} .
Grid search Logistic Regression took about 0.12002614900120534 s.

1 loops, best of 3: 920 ms per loop
The best score KNN is 0.6779220779220779 (+- 0.14702194851204545) with parameters {'algorithm': 'ball_tree', 'p': 1, 'n_neighbors': 1} .
Grid search KNN took about 0.9204375360277481 s.



# Problem 9

In [9]:
X_normalized_test = preprocessing.normalize(transformed_test, norm = 'l2')

dt_clf = tree.DecisionTreeClassifier(max_depth = 1, criterion = 'entropy', max_features = None, presort = True)
dt_clf.fit(X_normalized, train_labels)

answers = dt_clf.predict(X_normalized_test)

# Problem 10

In [36]:
dt_clf.score(X_normalized_test, test_labels[0:272])

0.70588235294117652

# Problem 11