# Reto 2

## Miguel Ortiz y Andrea Castiella


In [7]:
import numpy  as np
import pandas as pd
import OurUtils as our
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

In [8]:
# Load one table per digit class (the CSV files carry no header row) and
# scale image to range [0,1] right away by dividing every value by 255.
FullSet_0 = pd.read_csv('./Reto2-Datasets/1000_cero.csv', header=None) / 255.0
FullSet_3 = pd.read_csv('./Reto2-Datasets/1000_tres.csv', header=None) / 255.0
FullSet_6 = pd.read_csv('./Reto2-Datasets/1000_seis.csv', header=None) / 255.0
FullSet_9 = pd.read_csv('./Reto2-Datasets/1000_nueve.csv', header=None) / 255.0

## Feature Engineering
- Feature multiplication
- Feature division

In [9]:
# Multiplies the value of two given characteristics
def feat_mult(features, feat_name='', feat1='feat_2', feat2='feat_5'):
    """Return a one-column DataFrame with the product of two feature columns.

    Args:
        features: Table indexed by column name (e.g. a pandas DataFrame)
            that contains both ``feat1`` and ``feat2``.
        feat_name: Name given to the resulting column.
        feat1: Name of the first factor column.
        feat2: Name of the second factor column. The default follows the
            underscore naming used by the other feature columns
            (``feat_5``); the previous hyphenated default could not be
            looked up alongside them.

    Returns:
        A DataFrame whose single column ``feat_name`` holds the element-wise
        product ``features[feat1] * features[feat2]``.

    Raises:
        KeyError: If ``feat1`` or ``feat2`` is not a column of ``features``.
    """
    product = features[feat1] * features[feat2]
    return pd.DataFrame({feat_name: product})

# Divides the value of two given characteristics
def feat_div(features, feat_name='', feat1='feat_2', feat2='feat_4'):
    """Return a one-column DataFrame with the ratio of two feature columns.

    Args:
        features: Table indexed by column name (e.g. a pandas DataFrame)
            that contains both ``feat1`` and ``feat2``.
        feat_name: Name given to the resulting column.
        feat1: Name of the numerator column.
        feat2: Name of the denominator column. The default follows the
            underscore naming used by the other feature columns
            (``feat_4``); the previous hyphenated default could not be
            looked up alongside them.

    Returns:
        A DataFrame whose single column ``feat_name`` holds
        ``features[feat1] / (features[feat2] + 0.00001)``.

    Raises:
        KeyError: If ``feat1`` or ``feat2`` is not a column of ``features``.
    """
    # The small constant keeps a zero-valued denominator from producing a
    # division by zero.
    ratio = features[feat1] / (features[feat2] + 0.00001)
    return pd.DataFrame({feat_name: ratio})


## Feature extraction
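Digit classes in this challenge: 0, 3, 6 and 9 (one CSV file per digit).

The class label of each sample is assigned by `our.join_features_labels`.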


In [11]:
# Merge the four per-digit tables into a single dataset with a 'label' column.
# NOTE(review): the output recorded for this cell is
# "TypeError: join_features_labels() takes 2 positional arguments but 4 were given",
# so this call does not match the helper's current signature -- confirm
# against OurUtils before relying on anything below.
FullSet = our.join_features_labels(FullSet_0, FullSet_3, FullSet_6, FullSet_9)
seed = 1234  # random_state shared by the splitter and the SVM further down
theta = 0.5  # not referenced in the visible cells -- TODO confirm it is still needed
# Compute the base features from everything except the label column.
X_full = our.feat_extraction( FullSet.drop('label', axis=1))
# Append engineered columns one at a time; each is derived from columns
# already present in X_full.
area = feat_mult(X_full, 'area', feat1='feat_7', feat2='feat_8')
X_full = pd.concat([X_full, area],axis=1)
mult_25 = feat_mult(X_full, 'mult_25', feat1='feat_2', feat2='feat_5')
X_full = pd.concat([X_full, mult_25],axis=1)
div_24 = feat_div(X_full, 'div_24', feat1='feat_2', feat2='feat_4')
X_full = pd.concat([X_full, div_24],axis=1)
#X_full.head()
# Keep the labels as a one-column DataFrame (double brackets).
Y_full = FullSet[['label']]


TypeError: join_features_labels() takes 2 positional arguments but 4 were given

## Split dataset
Train split: 80% of the samples (3200 if each of the four CSV files holds 1000 rows)

Validation split: 20% of the samples (800 under the same assumption)

In [None]:
# Hold out a stratified validation subset so that both splits keep the
# class proportions of the full dataset.
valid_size = 0.2
splitter = StratifiedShuffleSplit(n_splits=1, test_size=valid_size, random_state=seed)
split_ix = splitter.split(X_full, Y_full)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_ix, test_ix = next(split_ix)
# split() returns positional row indices, so rows are selected with .iloc;
# label-based .loc only gives the same rows when the index happens to be
# the default 0..n-1 range.
X_train = X_full.iloc[train_ix].reset_index(drop=True)
Y_train = Y_full.iloc[train_ix].reset_index(drop=True)
X_valid = X_full.iloc[test_ix].reset_index(drop=True)
Y_valid = Y_full.iloc[test_ix].reset_index(drop=True)


In [None]:
# Feature subset kept after comparing several combinations.
feats = ['feat_9', 'div_24']
train_selected = X_train[feats]
# Learn the min-max scaling from the training split, then apply it to
# that same split.
scaler = MinMaxScaler()
scaler.fit(train_selected)
X = scaler.transform(train_selected)
# Flatten the one-column label frame into a 1-D numpy array.
Y = Y_train.values.ravel()


In [None]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn.svm import SVC

# Base binary classifier wrapped by both multiclass strategies; it can be
# any other estimator. `degree` is not passed because scikit-learn only
# uses it for the 'poly' kernel and ignores it for 'rbf'.
base_clf = SVC(kernel='rbf', gamma=1, random_state=seed)

# Train OvO - SVM
OvO_clf = OneVsOneClassifier(base_clf)
OvO_clf.fit(X, Y)

# Train OvR - SVM
OvR_clf = OneVsRestClassifier(base_clf)
OvR_clf.fit(X, Y)

# Check: report how many binary classifiers each strategy trained.
strlog = "OvO produces %d classifiers" % (len(OvO_clf.estimators_))
print(strlog)
strlog = "OvR produces %d classifiers" % (len(OvR_clf.estimators_))
print(strlog)


In [None]:
# Feature selection and rescaling to [0,1].
# The scaler was fitted on X_train[feats], so the validation data must be
# restricted to the same columns before it is transformed; passing the
# whole X_valid would hand the scaler columns it was never fitted on.
X_pred = scaler.transform(X_valid[feats])
Y_true = Y_valid.values.ravel()

# Predict with both multiclass strategies.
Y_pred_OvO = OvO_clf.predict(X_pred)
Y_pred_OvR = OvR_clf.predict(X_pred)


In [None]:
# Performance metrics

# Total number of validation samples; fails are reported as total minus hits.
n_valid = Y_true.shape[0]

# The trace (sum of the diagonal) of a confusion matrix counts the samples
# whose predicted class equals the true class.
conf_mat_OvO = confusion_matrix(Y_true, Y_pred_OvO)
conf_mat_OvR = confusion_matrix(Y_true, Y_pred_OvR)
hits_OvO = conf_mat_OvO.trace()
hits_OvR = conf_mat_OvR.trace()

# Print out the One-vs-One results.
print("\nOvO confusion matrix:\n")
print(conf_mat_OvO)
print("\n")
print("OvO Hits  = %d" % hits_OvO)
print("OvO Fails = %d" % (n_valid - hits_OvO))

# Print out the One-vs-Rest results.
print("\nOvR confusion matrix:\n")
print(conf_mat_OvR)
print("\n")
print("OvR Hits  = %d" % hits_OvR)
print("OvR Fails = %d" % (n_valid - hits_OvR))