# Task 1

In [1]:
import pandas as pd
import numpy as np

names = ['f{0}'.format(i) for i in range(15)]

dtype = {'f1': np.str }

train_data = pd.read_csv('train.data',
                         header = len(names) + 1,
                         names = names + ['label'],
                         skipinitialspace = True,
                         na_values = ['?'],
                         dtype = dtype)
test_data = pd.read_csv('test.data',
                        header = len(names),
                        names = names[:len(names)],
                        skipinitialspace = True,
                        na_values = ['?'],
                        dtype = dtype)
test_labels = pd.read_csv('test.lab', header = 1, names = ['label'], skipinitialspace = True).label
train_labels = train_data.label

train_data.drop('label', axis = 1, inplace = True)

train_data.f1 = train_data.f1.astype(np.float)
train_data.f1 = test_data.f1.astype(np.float)

if train_data.isnull().any().any():
    print('Data contains missing values.\n')
else:
    print('Data does not contain missing values.\n')

missing_training_data_size = train_data[pd.isnull(train_data).any(axis=1)].shape[0]

print('{0}% data rows have missing values in training set'.format(missing_training_data_size / train_data.shape[0]))

missing_test_data_size = test_data[pd.isnull(test_data).any(axis=1)].shape[0]

print('{0}% data rows have missing values in test set'.format(missing_test_data_size / test_data.shape[0]))

class_one_examples_number = train_labels[train_labels == '+'].shape[0]
class_two_examples_number = train_labels[train_labels == '-'].shape[0]

print('Ratio between class "+" samples and class "-" samples in the training set is {0}'.format(class_one_examples_number / class_two_examples_number))

Data contains missing values.

0.33246753246753247% data rows have missing values in training set
0.058823529411764705% data rows have missing values in test set
Ratio between class "+" samples and class "-" samples in the training set is 0.6960352422907489


$f_0$ has 2 possible values 'b' and 'a'.

$f_1$ has values between 1517 and 8025.

$f_2$ has values between 0 and 25.085000000000001.

$f_3$ has 3 possible values 'y', 'l' and 'u'.

$f_4$ has 3 possible values 'p', 'g' and 'gg'.

$f_5$ has 14 possible values 'cc', 'ff', 'c', 'i', 'q', 'w', 'm', 'd', 'e', 'aa', 'j', 'x', 'k' and 'r'.

$f_6$ has 9 possible values 'v', 'ff', 'h', 'bb', 'j', 'z', 'o', 'n' and 'dd'.

$f_7$ has values between 0 and 20.

$f_8$ has 2 values 't' and 'f'.

$f_9$ has 2 values 't' and 'f'

$f_{10}$ has values between 0 and 20.

$f_{11}$ has 2 values 't' and 'f'

$f_{12}$ has 2 values 'g', 's' and 'p'

$f_{13}$ has values between 0 and 0.00020000000000000001.

$f_{14}$ has values between 0 and 100000.

Data has 385 samples. Features $f_1, f_2, f_7, f_{10}, f_{13}, f_{14}$ are continous. Features $f_0, f_3, f_4, f_5, f_6, f_8, f_9, f_{11}, f_{12}$ are categorical. Problem has two classes. We can say that classes is more or less balanced because the ratio 0.69 is more or less close to 50 % that means that we have the half samples for class 1 and the same amount for class 2.

# Problem 2

In [2]:
import sklearn as skl
from sklearn import preprocessing
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

train_data_nona = DataFrameImputer().fit_transform(train_data)
test_data_nona = DataFrameImputer().fit_transform(test_data)

All the missing values were transformed with the most frequent for object fields and mean for numeric fields

# Problem 3

In [13]:
from sklearn.preprocessing import OneHotEncoder

le = preprocessing.LabelEncoder()

categorical_features = [0, 3, 4, 5, 6, 8, 9, 11, 12]

for feature in categorical_features:
    train_data_nona[names[feature]] = le.fit_transform(train_data_nona[names[feature]])
    test_data_nona[names[feature]] = le.fit_transform(test_data_nona[names[feature]])

train_labels = le.fit_transform(train_labels)
test_labels = le.fit_transform(train_labels)

enc = OneHotEncoder(categorical_features = categorical_features, sparse = False)
transformed_train = enc.fit_transform(train_data_nona)
transformed_test = enc.fit_transform(test_data_nona)

First I have changed all the lablings for categorical data and the applied One Hot Encoding to it.

# Problem 4

In [44]:
from sklearn import tree
from sklearn import cross_validation

def estimate_accuracy(clf, train_data, train_labels):
    scores = cross_validation.cross_val_score(clf, train_data, train_labels, cv = 5)

    print('Accuracy: {0} % (+/- {1})'.format(scores.mean(), scores.std() * 2))
    
    return scores.mean()
    
    
clf = tree.DecisionTreeClassifier()
estimate_accuracy(clf, transformed_train, train_labels)

clf = tree.DecisionTreeClassifier(criterion='entropy')
estimate_accuracy(clf, transformed_train, train_labels)

clf = tree.DecisionTreeClassifier(max_features = 1)
estimate_accuracy(clf, transformed_train, train_labels)

clf = tree.DecisionTreeClassifier(max_depth = 1)
estimate_accuracy(clf, transformed_train, train_labels)

Accuracy: 0.774059274059274 % (+/- 0.12810621205593303)
Accuracy: 0.8105657500394342 % (+/- 0.12210216687220216)
Accuracy: 0.7091189512242144 % (+/- 0.1419345020803041)
Accuracy: 0.8545209176788123 % (+/- 0.2746649865019716)


0.85452091767881233

The best accuracy was reached when the depth of the tree was 1. But in that configuration standard deviation was very large. Also, it is interesting that  information gain works better on the given dataset.

# Problem 5