In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.kernel_approximation import Nystroem
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.model_selection import StratifiedKFold, KFold,RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC,LinearSVC

## Support Vector Machines (lineal y polinómico)

In [None]:
# Load the training table and the table of rows listed in outliers.csv
# (those rows are filtered out of `df` in a later cell).
df = pd.read_csv("../data/train_data_final.csv")
outliers = pd.read_csv("outliers.csv")

# A bare `df.shape` in the middle of a cell is evaluated and thrown away: only
# the last expression of a cell is displayed. Print it so it is actually shown
# and keep `df.columns` as the cell's displayed value.
print(df.shape)
df.columns

In [None]:
# Preview the training rows matched by the outlier table: the outliers' column
# 'Unnamed: 0.1' is joined against df's 'Unnamed: 0' (default inner join).
pd.merge(outliers, df, left_on='Unnamed: 0.1', right_on='Unnamed: 0')

In [None]:
# Keep only the rows whose 'Unnamed: 0' value does not appear in the
# outliers' 'Unnamed: 0.1' column.
is_outlier = df['Unnamed: 0'].isin(outliers['Unnamed: 0.1'])
df = df.loc[~is_outlier]

In [None]:
# Columns treated as categorical; they are removed from the frame altogether.
categorical_variables = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]

df.drop(columns=categorical_variables, inplace=True)

# Group the remaining columns by the letter their name contains. This is a
# substring test (not a prefix test), so a column may land in several groups.
payment_variables = [name for name in df.columns if 'P' in name]
spending_variables = [name for name in df.columns if 'S' in name]
# 'ID' is excluded so that 'customer_ID' is not picked up by the 'D' test.
delinquency_variables = [
    name for name in df.columns if 'D' in name and 'ID' not in name
]
balance_variables = [name for name in df.columns if 'B' in name]
risk_variables = [name for name in df.columns if 'R' in name]

In [None]:
# The customer identifier is not used as a feature; remove it from the frame.
df.drop(columns=['customer_ID'], inplace=True)

In [None]:
# Columns removed from the frame before modelling. A single list now drives
# both the DataFrame drop and the reduced delinquency list, so the two can no
# longer drift apart.
columns_to_drop = [
    'S_2', 'D_42', 'D_49', 'D_53', 'D_73', 'D_76', 'R_9', 'D_82', 'B_29',
    'D_87', 'D_88', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39',
    'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142',
]
df = df.drop(columns=columns_to_drop)

# Delinquency variables that are still present after the drop.
dropped = set(columns_to_drop)
delinquency_variables_redux = [
    name for name in delinquency_variables if name not in dropped
]

# Missing delinquency values become 0.
# The previous `df.loc[:, cols].fillna(0, inplace=True)` filled a temporary copy
# returned by `.loc`, so `df` was never modified; assigning the result back
# makes the fill effective.
df[delinquency_variables_redux] = df[delinquency_variables_redux].fillna(0)

# Every remaining missing value is replaced by its column mean.
# Done in a single call on the frame instead of the chained
# `df[col].fillna(..., inplace=True)`, which pandas deprecates and which has no
# effect under Copy-on-Write.
df = df.fillna(df.mean(numeric_only=True))

#### Armado del train_test

In [None]:
# Features used for training, collected group by group in this order:
# payment, balance, risk, spending. Only names still present in `df` are kept.
# The delinquency group (`delinquency_variables_redux`) is deliberately left out,
# as it was commented out in the earlier version of this cell.
train_variables = [
    name
    for group in (payment_variables, balance_variables, risk_variables, spending_variables)
    for name in group
    if name in df.columns
]

# Full dataset; for a quick experiment slice the frame first,
# e.g. df.iloc[0:20000, :].
X = df[train_variables]
y = df.target
features = X.columns

# Fixed seed so the split (and every metric computed from it) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Standardise using statistics learned from the training split only.
# The test split must be transformed with those same statistics: calling
# `fit_transform` on it would re-fit the scaler on test data, so train and
# test would end up on different scales.
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

X.info()

### Hiperparámetros para el SVM (kernels RBF y polinómico)

In [None]:
# First search that was tried (kept for reference):
# parameters = {'kernel':('linear','rbf','poly'),'C':[1,1e+7],'degree':[3,8],'tol':[1,1e-7],'max_iter':[1,100000]}

# Narrowed-down grid that is actually searched.
parameters = {
    'kernel': ('rbf', 'poly'),
    'C': [1, 1e+7],
    'degree': [8],
    'tol': [1e-7],
    'max_iter': [50000, 100000],
}

svc_gridsearch = SVC(random_state=0)

# Exhaustive search over `parameters` with 2-fold cross-validation,
# using all available cores and verbose progress output.
grid_search_svc = GridSearchCV(
    estimator=svc_gridsearch,
    param_grid=parameters,
    cv=2,
    n_jobs=-1,
    verbose=4,
)

In [None]:
# List the parameter names exposed by the grid-search object.
search_params = grid_search_svc.get_params()
search_params.keys()

## Entrenamiento de los modelos por CV

In [None]:
# Run the cross-validated search on the standardised training split.
# (A timing magic was previously left here, commented out: #%time)
grid_search_svc.fit(X=X_train_norm, y=y_train)

#### Conseguimos los nuevos hiperparametros

In [None]:
# Best hyperparameter combination found by the grid search.
# Result obtained in a previous run:
#   {'C': 1, 'degree': 8, 'kernel': 'rbf', 'max_iter': 50000, 'tol': 1e-07}
# That configuration converged; with kernel='linear' it did not.
params_opt_linear_svm = grid_search_svc.best_params_

params_opt_linear_svm


In [None]:
# Alternative that was tried but takes very long to train:
# svc = SVC(C=1,tol=1e-7,kernel= 'rbf',max_iter=50000,random_state=0)

# Classifier actually used: degree-3 polynomial kernel.
svc = SVC(
    C=1,
    kernel='poly',
    degree=3,
    tol=1e-7,
    max_iter=100000,
    random_state=0,
)


## Aproximación de kernel por Nystroem reduciendo a 10 componentes (no ayudó, demora mucho en el dataset completo)

In [None]:
# Nystroem approximation of a degree-3 polynomial kernel, projecting the
# standardised training data onto 10 components.
# `Nystroem` is imported in the notebook's top import cell rather than here, so
# all dependencies are declared in one place.
feature_map_nystroem = Nystroem(
    kernel='poly',
    degree=3,
    gamma=.2,
    n_components=10,
    random_state=0,
)

# The map is fitted on the training split only; the fitted object is kept so
# other data can be projected with `feature_map_nystroem.transform(...)`.
data_transformed = feature_map_nystroem.fit_transform(X_train_norm)

## Entrenamos de vuelta con svm y los nuevos parametros

In [None]:


# Train the classifier on the Nystroem-transformed training features.
svc.fit(X=data_transformed, y=y_train)

#### Muestra de los resultados

In [None]:
# `svc` was fitted on the Nystroem features, so the test data has to go through
# the same fitted feature map before prediction. Feeding `X_test_norm` directly
# gives the classifier inputs with a different number of features than it was
# trained on.
X_test_transformed = feature_map_nystroem.transform(X_test_norm)

y_pred = svc.predict(data_transformed)
y_pred_test = svc.predict(X_test_transformed)

# ROC AUC is a ranking metric: it needs continuous scores, not hard 0/1 labels.
y_score = svc.decision_function(data_transformed)

# Metrics on the training split
print("precision = ", precision_score(y_train, y_pred))
print("recall = ", recall_score(y_train, y_pred))
print("auc = ", roc_auc_score(y_train, y_score))
print("f1 score = ", f1_score(y_train, y_pred))


confusion_matrix(y_train, y_pred)

In [None]:
# Metrics on the test split.
# The classifier was fitted on Nystroem features, so the standardised test data
# is projected with the already-fitted feature map before scoring.
X_test_features = feature_map_nystroem.transform(X_test_norm)
y_pred_test = svc.predict(X_test_features)

# ROC AUC is a ranking metric: it needs continuous scores, not hard 0/1 labels.
y_score_test = svc.decision_function(X_test_features)

print("precision = ", precision_score(y_test, y_pred_test))
print("recall = ", recall_score(y_test, y_pred_test))
print("auc = ", roc_auc_score(y_test, y_score_test))
print("f1 score = ", f1_score(y_test, y_pred_test))

# Kept as the last expression so the notebook displays it; when it was the
# first statement of the cell its value was computed and silently discarded.
confusion_matrix(y_test, y_pred_test)