# Reto 3

## Miguel Ortiz y Andrea Castiella


In [1]:
import numpy  as np
import pandas as pd
import OurUtils as our
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the training features and labels; both CSVs have a header row.
read_options = dict(header=0, low_memory=False)
X_full = pd.read_csv('Reto 3-Dataset/reto3_trainX.csv', **read_options)
Y_full = pd.read_csv('Reto 3-Dataset/reto3_trainY.csv', **read_options)

# Preview the first rows of the features, then of the labels.
for frame in (X_full, Y_full):
    print(frame.head())

   altitud  azimut  inclinacion  DH_agua  DV_agua  DH_camino  sombra_9am  \
0     3351     354           19      450       95       1064         185   
1     2995      84            7      481       30       5154         231   
2     2884      72           25      210      -45        466         239   
3     3139      76           11      301       58       1368         234   
4     3018     312           17       30       10       1719         172   

   sombra_12pm  sombra_3pm  DH_fuego  ...  t31  t32  t33  t34  t35  t36  t37  \
0          203         153       711  ...    0    0    0    0    0    0    0   
1          227         129      5488  ...    0    0    0    0    0    0    0   
2          183          60      2123  ...    0    0    0    0    0    0    0   
3          220         117      3282  ...    0    0    1    0    0    0    0   
4          225         193      1961  ...    0    0    0    0    0    0    0   

   t38  t39  t40  
0    0    1    0  
1    0    0    0  
2    

## Split dataset
Train split: 80%

Validation split: 20%

In [3]:
# Hold out part of the labelled data for validation.
# The split helper comes from the project module OurUtils; presumably it
# stratifies by class, going by its name -- TODO confirm in OurUtils.
seed = 1234        # fixed seed so the split is reproducible
valid_size = 0.2   # fraction of samples reserved for validation
split_parts = our.single_stratified_split(
    X_full, Y_full, test_size=valid_size, random_state=seed
)
X_train, Y_train, X_valid, Y_valid = split_parts

## Feature selection

In [4]:
# Fit the min-max scaler on the training split only, then apply it.
# The fitted scaler is reused later on validation and challenge data.
scaler = MinMaxScaler().fit(X_train)
X = scaler.transform(X_train)

# Flatten the label frame into a 1-D numpy array.
Y = Y_train.values.ravel()

## PCA

In [5]:
# Project the scaled training data with the project's PCA helper.
# From its use here it returns the projected data and a fitted PCA object.
X_proy, pca = our.our_PCA(X)
num_of_pc = len(pca.singular_values_)

# (title, values) pairs, printed in order.
pca_report = (
    ('\n First %d principal components' %(num_of_pc), pca.components_.T),
    ('\n First %d singular values ' %(num_of_pc), pca.singular_values_.T),
    ('\n Explained Variance Ratio', pca.explained_variance_ratio_.T),
    ('\n Accumulated Explained Variance Ratio',
     np.cumsum(pca.explained_variance_ratio_.T)),
)
for strTitle, values in pca_report:
    print(strTitle)
    print(values)

(92961, 9)

 First 9 principal components
[[ 1.14232061e-02 -1.66662092e-01 -1.53569339e-01 -1.34148739e-01
   4.33027047e-03  4.39481469e-02  2.00748534e-01  6.86155830e-02
  -4.13646495e-02]
 [-5.98923150e-02  1.07265622e-01  2.97571677e-01 -8.00264585e-01
  -1.52812681e-01 -1.81846760e-01 -5.47562516e-02 -4.60894248e-02
   1.56966747e-03]
 [-3.23421744e-02  4.91748061e-02  1.12116218e-01  6.47103895e-02
  -1.71554670e-02 -1.11661847e-01  1.30713961e-02 -1.89385804e-02
  -1.09111019e-01]
 [-2.33723778e-02 -7.79749920e-02  1.07434631e-02 -2.04892310e-02
  -6.73199664e-02  1.58267194e-02  1.12955914e-01 -3.18731603e-02
   7.83717572e-02]
 [-1.91150211e-02 -7.65252329e-03  3.39197427e-02  3.92915976e-03
  -3.50735633e-02 -2.45541476e-02  3.29858512e-02 -2.47711139e-02
   3.78967154e-03]
 [ 1.25352760e-01 -2.08090141e-01 -7.55076272e-02 -1.52650390e-01
  -8.72560007e-02 -8.95778512e-02 -8.72104223e-02  3.58418734e-02
   2.55775643e-04]
 [ 2.62534617e-02 -4.81002300e-02 -1.09479136e-01  1

## Multiclass. OvO vs OvR

In [None]:
# Multiclass
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# RBF-kernel SVM used as the binary base estimator for both strategies.
# `degree` is intentionally not set: scikit-learn only uses it for the
# 'poly' kernel and ignores it for 'rbf', so passing it was misleading.
base_clf = SVC(kernel='rbf', gamma=1, random_state=seed)

# Train on the PCA projection directly instead of rebinding `X`; overwriting
# `X` would make a re-run of the PCA cell operate on already-projected data.

# Train OvO (one binary classifier per pair of classes)
OvO_clf = OneVsOneClassifier(base_clf)
OvO_clf.fit(X_proy, Y)

# Train OvR (one binary classifier per class)
OvR_clf = OneVsRestClassifier(base_clf)
OvR_clf.fit(X_proy, Y)

# Check how many binary classifiers each strategy built
strlog = "OvO produces %d classifiers" %(len(OvO_clf.estimators_))
print(strlog)
strlog = "OvR produces %d classifiers" %(len(OvR_clf.estimators_))
print(strlog)


## Validation

In [None]:
# Apply the already-fitted scaler and PCA to the validation features;
# nothing is re-fitted on validation data.
X_pred = pca.transform(scaler.transform(X_valid))
Y_true = Y_valid.values.ravel()

# Predict with both multiclass strategies.
Y_pred_OvO, Y_pred_OvR = (
    classifier.predict(X_pred) for classifier in (OvO_clf, OvR_clf)
)


## Confusion Matrix

To check the performance of our model, we use the confusion matrix as a metric. This gives us a general idea of how good the model is and lets us compare it across the multiple feature combinations we have available.

In [None]:
# Performance metrics

from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Confusion matrix per strategy; its trace (the diagonal sum) is the number
# of correctly classified validation samples.
conf_mat_OvO = confusion_matrix(Y_true, Y_pred_OvO)
conf_mat_OvR = confusion_matrix(Y_true, Y_pred_OvR)
hits_OvO = np.trace(conf_mat_OvO)
hits_OvR = np.trace(conf_mat_OvR)
n_valid = Y_true.shape[0]

# Print out: header, matrix, blank separator, hits and fails for each strategy
metric_report = (
    ("\nOvO confusion matrix:\n", conf_mat_OvO,
     "OvO Hits  = %d"%(hits_OvO), "OvO Fails = %d"%(n_valid-hits_OvO)),
    ("\nOvR confusion matrix:\n", conf_mat_OvR,
     "OvR Hits  = %d"%(hits_OvR), "OvR Fails = %d"%(n_valid-hits_OvR)),
)
for header, matrix, hits_line, fails_line in metric_report:
    print(header)
    print(matrix)
    print("\n")
    print(hits_line)
    print(fails_line)


## Competition data

In [None]:
# Load the competition features and rescale them with the fitted scaler.
challenge_data = scaler.transform(
    pd.read_csv('Reto 3-Dataset/reto3_testX.csv', header=0)
)
# Project onto the principal components fitted on the training split.
challenge_features = pca.transform(challenge_data)

# Inference with both multiclass strategies.
Y_chal_OvO, Y_chal_OvR = (
    classifier.predict(challenge_features) for classifier in (OvO_clf, OvR_clf)
)

## Save results

In [None]:
# Export is currently disabled. Uncomment the line below to write the OvO
# challenge predictions to 'Reto3_Ypred.csv' as integers.
#np.savetxt('Reto3_Ypred.csv', Y_chal_OvO, fmt='%i', delimiter=',')