# Classification 

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

%matplotlib inline

## Carregamento das amostras

In [2]:
# Load the semicolon-separated feature table; the first CSV column is used
# as the DataFrame index rather than as a feature.
df_dados = pd.read_csv(
    '../data/features.csv',
    sep=';',
    index_col=0,
)
# Preview the first rows to sanity-check the parsed columns.
df_dados.head()

Unnamed: 0,nu_CPFCNPJ,qtdAditivosPorCPFCNPJ,qtdContratos,qtdContratos_ganhos,vl_TotalContrato,valor_total,valor_total_pregao,valor_total_convite,valor_total_tomada,participacoes_pregao,participacoes_tomada,participacoes_convite,participacoes_total,label
0,10362263000102,94,12,12,819940.0,1186185.05,328424.1,217864.25,639896.7,4,5,4,13,1
3,10365996000192,354,63,63,2206238.34,239217.7,216737.7,11080.0,11400.0,8,1,1,10,1
6,10408838000172,228,24,24,2199888.36,2773393.55,784044.84,1570827.41,418521.3,6,3,29,38,1
9,10414278000169,898,92,92,3671676.51,1579741.4,124532.0,1399209.0,56000.4,4,2,61,67,1
12,10445253000122,404,49,49,30955873.37,1777174.21,629167.64,1094890.94,53115.63,4,2,15,21,1


In [3]:
# Build the feature matrix and the target vector.
#
# `nu_CPFCNPJ` (identifier) and `label` (target) are required columns and are
# dropped strictly, so a missing one still raises a KeyError.
#
# `label_pred` and `confianca` are outputs that this notebook appends to
# `df_dados` and then saves back to the very CSV it was loaded from. On a re-run
# they would already be present in the loaded frame and would otherwise leak
# the previous predictions into the features. They are dropped with
# errors='ignore' so the first run, where they do not exist yet, still works.
samples = (
    df_dados
    .drop(['nu_CPFCNPJ', 'label'], axis=1)
    .drop(['label_pred', 'confianca'], axis=1, errors='ignore')
    .values
)
labels = df_dados.label.values
print(samples.shape, labels.shape)

(132, 12) (132,)


## Classificação

In [19]:
# Fixed seed for the SVM. With probability=True, SVC calibrates its class
# probabilities through an internal, randomly shuffled cross-validation, so
# without a seed `predict_proba` can differ from one run to the next.
RANDOM_STATE = 42

# Two-step pipeline: standardise the features, then fit the SVM. Keeping the
# scaler inside the pipeline means it is re-fit on each training fold during
# the grid search.
estimators = [
    ('std', StandardScaler()),
    ('svm', SVC(probability=True, random_state=RANDOM_STATE)),
]
pipe = Pipeline(estimators)

# Hyper-parameter grid; keys follow the `<step name>__<parameter>` convention.
parameters = dict(svm__C=[1, 10], svm__kernel=['linear', 'rbf'])

# 10-fold cross-validated grid search, with candidates ranked by F1 score.
clf = GridSearchCV(pipe, parameters, scoring='f1', verbose=10, cv=10)
clf.fit(samples, labels)
print(clf.best_params_, clf.best_score_)

Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.952381, total=   0.0s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=1.000000, total=   0.0s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.900000, total=   0.0s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.857143, total=   0.0s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.947368, total=   0.0s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.947368, total=   0.0s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.2s finished


In [20]:
# Read the winning hyper-parameters from the fitted grid search instead of
# hard-coding them, so they always match what `clf` actually selected.
best_c = clf.best_params_['svm__C']
best_kernel = clf.best_params_['svm__kernel']

In [21]:
# Predict a label for every row with the grid search's refitted best model.
# These are the same samples the model was fitted on, so agreement with
# `label` here is not an out-of-sample estimate.
predicted_labels = clf.predict(samples)
df_dados['label_pred'] = predicted_labels
df_dados.head()

Unnamed: 0,nu_CPFCNPJ,qtdAditivosPorCPFCNPJ,qtdContratos,qtdContratos_ganhos,vl_TotalContrato,valor_total,valor_total_pregao,valor_total_convite,valor_total_tomada,participacoes_pregao,participacoes_tomada,participacoes_convite,participacoes_total,label,label_pred
0,10362263000102,94,12,12,819940.0,1186185.05,328424.1,217864.25,639896.7,4,5,4,13,1,1
3,10365996000192,354,63,63,2206238.34,239217.7,216737.7,11080.0,11400.0,8,1,1,10,1,1
6,10408838000172,228,24,24,2199888.36,2773393.55,784044.84,1570827.41,418521.3,6,3,29,38,1,1
9,10414278000169,898,92,92,3671676.51,1579741.4,124532.0,1399209.0,56000.4,4,2,61,67,1,1
12,10445253000122,404,49,49,30955873.37,1777174.21,629167.64,1094890.94,53115.63,4,2,15,21,1,1


In [23]:
# Per-class probability estimates for every row (one column per class).
probs = clf.predict_proba(samples)

# Store the first probability column, rounded to two decimals.
# NOTE(review): column 0 is the probability of the first entry of
# `clf.classes_` (presumably label 0 -- confirm). In the displayed rows,
# samples predicted as 1 therefore get values close to 0; verify whether
# "confianca" was meant to be the confidence in the predicted class instead.
first_class_prob = probs[:, 0]
df_dados['confianca'] = first_class_prob.round(2)
df_dados.head()

Unnamed: 0,nu_CPFCNPJ,qtdAditivosPorCPFCNPJ,qtdContratos,qtdContratos_ganhos,vl_TotalContrato,valor_total,valor_total_pregao,valor_total_convite,valor_total_tomada,participacoes_pregao,participacoes_tomada,participacoes_convite,participacoes_total,label,label_pred,confianca
0,10362263000102,94,12,12,819940.0,1186185.05,328424.1,217864.25,639896.7,4,5,4,13,1,1,0.25
3,10365996000192,354,63,63,2206238.34,239217.7,216737.7,11080.0,11400.0,8,1,1,10,1,1,0.37
6,10408838000172,228,24,24,2199888.36,2773393.55,784044.84,1570827.41,418521.3,6,3,29,38,1,1,0.0
9,10414278000169,898,92,92,3671676.51,1579741.4,124532.0,1399209.0,56000.4,4,2,61,67,1,1,0.0
12,10445253000122,404,49,49,30955873.37,1777174.21,629167.64,1094890.94,53115.63,4,2,15,21,1,1,0.06


In [24]:
# Persist the frame, now including `label_pred` and `confianca`.
# NOTE(review): this is the same path the notebook loads its input from, so
# the input file is overwritten and will contain the prediction columns the
# next time it is read.
df_dados.to_csv('../data/features.csv', sep=';')