In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import PolynomialFeatures as plf
from sklearn.metrics import(confusion_matrix,
                            precision_score,
                            recall_score,
                            f1_score,
                            accuracy_score)

In [2]:
#importar datos 
file= 'HR_comma_sep.csv'
datos=pd.read_csv(file,
                       header=0,
                       sep=',',
                       parse_dates=False,
                       skip_blank_lines=True)
datos.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
%matplotlib inline

In [4]:
#tratar variables cualitativas
datos['salary']=datos.salary.map({'low':1,'medium':2,'high':3})

#variable "sales=area"
rh=datos.iloc[:,0:8]
temp=pd.get_dummies(datos['sales'])
temp1=datos.iloc[:,9:10]
datos=rh.join(temp).join(temp1)    
datos

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,salary
0,0.38,0.53,2,157,3,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
1,0.80,0.86,5,262,6,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2
2,0.11,0.88,7,272,4,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2
3,0.72,0.87,5,223,5,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
4,0.37,0.52,2,159,3,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
5,0.41,0.50,2,153,3,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
6,0.10,0.77,6,247,4,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
7,0.92,0.85,5,259,5,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
8,0.89,1.00,5,224,5,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
9,0.42,0.53,2,142,3,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1


In [5]:
def support_vector_m(Data_set,pol_degree,c,g):
    datos=Data_set
    #separar "x,y"
    m,n=np.shape(datos)
    Y=datos.iloc[:,6:7]
    X=datos.ix[:, datos.columns != 'left']
    #normalizar "x" 
    X=X.apply(lambda x: (x - np.mean(x)) / 2*(np.std(x))) # para SVM se recomienda esta "soft normalization"
    
    #separar datos de entrenamiento, prueba y validación
    X_train, X_other, Y_train, Y_other = train_test_split(X, Y, test_size=0.4, random_state=0)
    X_cross, X_test, Y_cross, Y_test = train_test_split(X_other, Y_other, test_size=0.5, random_state=5)
    
    #entrenar modelos con training data set
    
    svc = svm.SVC(kernel='linear',C=c).fit(X_train,Y_train) # "C" is the penalization for error term
    svc_poly = svm.SVC(kernel='poly',degree=pol_degree,C=c,gamma=g).fit(X_train,Y_train) 
    svc_rbf = svm.SVC(kernel='rbf',C=c,gamma=g).fit(X_train,Y_train) #gamma es el coeficiente del kernel/gamma es el ancho del guassiano                                                  
    
    # hacer predicciones de cross-validation data set
    
    Yg_linear=svc.predict(X_cross)
    Yg_poly=svc_poly.predict(X_cross)
    Yg_rbf=svc_rbf.predict(X_cross)
    
    cm=confusion_matrix(Y_cross,Yg_linear)
    
    print('\t Grado polinomio %1.3f' %pol_degree)
    print('\t C: %1.3f' %c)
    print('\t gamma: %1.3f' %g)
    print('\t Linear-cross validation: %1.3f')
    print ('\t Accuracy: %1.3f' %accuracy_score(Y_cross,Yg_linear))
    print ('\t Precision: %1.3f' %precision_score(Y_cross,Yg_linear))
    print ('\t Recall: %1.3f' %recall_score(Y_cross,Yg_linear))
    print ('\t F1: %1.3f' %f1_score(Y_cross,Yg_linear))
    
    print('\t Grado polinomio %1.3f' %pol_degree)
    print('\t C: %1.3f' %c)
    print('\t gamma: %1.3f' %g)
    print('\t Polinomial-cross validation: %1.3f')
    print ('\t Accuracy: %1.3f' %accuracy_score(Y_cross,Yg_poly))
    print ('\t Precision: %1.3f' %precision_score(Y_cross,Yg_poly))
    print ('\t Recall: %1.3f' %recall_score(Y_cross,Yg_poly))
    print ('\t F1: %1.3f' %f1_score(Y_cross,Yg_poly))
    
    print('\t Grado polinomio %1.3f' %pol_degree)
    print('\t C: %1.3f' %c)
    print('\t gamma: %1.3f' %g)
    print('\t Radial basis-cross validation: %1.3f')
    print ('\t Accuracy: %1.3f' %accuracy_score(Y_cross,Yg_rbf))
    print ('\t Precision: %1.3f' %precision_score(Y_cross,Yg_rbf))
    print ('\t Recall: %1.3f' %recall_score(Y_cross,Yg_rbf))
    print ('\t F1: %1.3f' %f1_score(Y_cross,Yg_rbf))
                                                       
    
    # hacer predicciones de test data set
    
    Yg_linear2=svc.predict(X_test)
    Yg_poly2=svc_poly.predict(X_test)
    Yg_rbf2=svc_rbf.predict(X_test)
    
    print('\t Grado polinomio %1.3f' %pol_degree)
    print('\t C: %1.3f' %c)
    print('\t gamma: %1.3f' %g)
    print('\t Linear-Test: %1.3f')
    print ('\t Accuracy: %1.3f' %accuracy_score(Y_test,Yg_linear2))
    print ('\t Precision: %1.3f' %precision_score(Y_test,Yg_linear2))
    print ('\t Recall: %1.3f' %recall_score(Y_test,Yg_linear2))
    print ('\t F1: %1.3f' %f1_score(Y_test,Yg_linear2))
    
    print('\t Grado polinomio %1.3f' %pol_degree)
    print('\t C: %1.3f' %c)
    print('\t gamma: %1.3f' %g)
    print('\t Polinomial-Test: %1.3f')
    print ('\t Accuracy: %1.3f' %accuracy_score(Y_test,Yg_poly2))
    print ('\t Precision: %1.3f' %precision_score(Y_test,Yg_poly2))
    print ('\t Recall: %1.3f' %recall_score(Y_test,Yg_poly2))
    print ('\t F1: %1.3f' %f1_score(Y_test,Yg_poly2))
    
    print('\t Grado polinomio %1.3f' %pol_degree)
    print('\t C: %1.3f' %c)
    print('\t gamma: %1.3f' %g)
    print('\t Radial basis-Test: %1.3f')
    print ('\t Accuracy: %1.3f' %accuracy_score(Y_test,Yg_rbf2))
    print ('\t Precision: %1.3f' %precision_score(Y_test,Yg_rbf2))
    print ('\t Recall: %1.3f' %recall_score(Y_test,Yg_rbf2))
    print ('\t F1: %1.3f' %f1_score(Y_test,Yg_rbf2))
    
    resu_cross=pd.DataFrame(np.zeros((1,3)))
    resu_test=pd.DataFrame(np.zeros((1,3)))
    
    resu_cross[1,0]=recall_score(Y_cross,Yg_linear)
    resu_cross[1,1]=recall_score(Y_cross,Yg_poly)
    resu_cross[1,2]=recall_score(Y_cross,Yg_rbf)
    
    resu_test[1,0]=recall_score(Y_test,Yg_linear2)
    resu_test[1,1]=recall_score(Y_test,Yg_poly2)
    resu_test[1,2]=recall_score(Y_test,Yg_rbf2)
    
    return   resu_cross,resu_test

In [None]:
support_vector_m(datos,1,1,'auto')

In [6]:
def frange(x, y, jump):
  while x <= y:
    yield x
    x *= jump

def frange2(x, y, jump):
  while x >= y:
    yield x
    x /= jump

In [None]:
resultados=np.zeros((30,6))
i=0
for pol_degree in range(1,4):
    for c in frange(.1,10,10):
        for g in frange2(.1,.001,10):
            resultados[i,(0,1,2)],resultados[i,(3,4,5)]=support_vector_m(datos,pol_degree,c,g)
            i=i+1
resultados         

  y_ = column_or_1d(y, warn=True)
