In [1]:
import os
from pathlib import Path
from time import process_time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

class Summary:
    """Lookup helper around a table that describes the dataset variables.

    The wrapped table is queried through its "Variable", "Description",
    "Type" and "Accepts NAs" columns.
    """

    def __init__(self, data, *args):
        # Extra positional arguments are accepted but never used.
        self._summary = data

    def _field_for(self, colname, field):
        """Return the first `field` value among rows whose "Variable" equals `colname`.

        Raises IndexError when no row matches.
        """
        matches = self._summary["Variable"] == colname
        return self._summary.loc[matches, field].values[0]

    def get_descr(self, colname):
        """Return the "Description" entry of the variable named `colname`."""
        return self._field_for(colname, "Description")

    def get_type(self, colname):
        """Return the "Type" entry of the variable named `colname`."""
        return self._field_for(colname, "Type")

    def get_ifna(self, colname):
        """Return the "Accepts NAs" entry of the variable named `colname`."""
        return self._field_for(colname, "Accepts NAs")

    def get_cols_by_type(self, type):
        """Return the "Variable" names whose "Type" equals `type`.

        The result is a pandas Series; wrap it in `list(...)` when a plain
        list is needed, e.g. `list(summary.get_cols_by_type('int'))`.
        """
        of_requested_type = self._summary["Type"] == type
        return self._summary.loc[of_requested_type, "Variable"]

# Folder that holds the project CSV files. The default is the original
# author's local folder; set the PROYECTO_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "PROYECTO_DATA_DIR",
        r"C:/Users/losaa/Downloads/Escritorio/Estudios/PROYECTO III",
    )
)

# Booking records (comma-separated).
nov2014 = pd.read_csv(DATA_DIR / "PAR-2014-Nov2-Nov29.csv", sep=",")
# Variable descriptions (semicolon-separated); only the first 15 rows are read.
summary = Summary(pd.read_csv(DATA_DIR / "VariablesDescr.csv", sep=";", nrows=15))



In [3]:
# Preview the first rows of the bookings frame as loaded from the CSV.
nov2014.head()

Unnamed: 0,bookingdate,arrivaldate,poocountry,origincity,destinationcity,bookingsign,leadtime,paxprofile,lengthofstay,losname,cabinclass,distchannel,pax,numpss,numnss
0,2014-11-16,2014-12-02,OM,LON,PAR,NEW_BOOKING,16,LEISURE,10,STAY,T,RETAIL,1,1,1
1,2014-11-16,2014-12-19,FR,DTT,PAR,NEW_BOOKING,32,LEISURE,-3,RETURN_HOME,T,OTHER,1,4,0
2,2014-11-16,2014-11-19,IL,TLV,PAR,PARTIAL_CANCELLATION,3,LEISURE,5,STAY,T,RETAIL,-1,0,1
3,2014-11-16,2015-01-28,AM,LAX,PAR,NEW_BOOKING,72,LEISURE,-4,SHORT_TRANSFER,T,OTHER,3,2,1
4,2014-11-16,2014-12-18,US,ATL,PAR,PARTIAL_ADDITION,31,LEISURE,-2,DWELLING_TRANSFER,T,OTHER,1,0,5


In [4]:
# Conversion applied to every variable of a given summary "Type".
# Datetimes are handled first, then the categorical ("factor") variables.
converters = {
    "datetime": pd.to_datetime,
    "factor": lambda serie: serie.astype("category"),
}

for tipo, convertir in converters.items():
    for nombre_col in summary.get_cols_by_type(tipo):
        nov2014[nombre_col] = convertir(nov2014[nombre_col])

# Report the resulting dtypes and memory usage.
nov2014.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5166300 entries, 0 to 5166299
Data columns (total 15 columns):
 #   Column           Dtype         
---  ------           -----         
 0   bookingdate      datetime64[ns]
 1   arrivaldate      datetime64[ns]
 2   poocountry       category      
 3   origincity       category      
 4   destinationcity  category      
 5   bookingsign      category      
 6   leadtime         int64         
 7   paxprofile       category      
 8   lengthofstay     int64         
 9   losname          category      
 10  cabinclass       category      
 11  distchannel      category      
 12  pax              int64         
 13  numpss           int64         
 14  numnss           int64         
dtypes: category(8), datetime64[ns](2), int64(5)
memory usage: 325.2 MB


In [5]:
# One boolean indicator array per passenger profile, keyed by the profile label.
data = {}

# Select the column by name instead of by position (7:8).
paxprofile = nov2014["paxprofile"]

# Sorted distinct labels; missing values are dropped so they cannot break the sort.
for cat in sorted(paxprofile.dropna().unique()):
    # Vectorised comparison instead of a per-cell Python lambda (applymap),
    # which is slow on millions of rows and deprecated in recent pandas.
    # Reshaped to a single-column 2-D array, the shape the applymap-based
    # version produced.
    data[cat] = (paxprofile == cat).to_numpy().reshape(-1, 1)

In [6]:
# Preview again: the indicator arrays in `data` have not been attached to the frame yet.
nov2014.head()

Unnamed: 0,bookingdate,arrivaldate,poocountry,origincity,destinationcity,bookingsign,leadtime,paxprofile,lengthofstay,losname,cabinclass,distchannel,pax,numpss,numnss
0,2014-11-16,2014-12-02,OM,LON,PAR,NEW_BOOKING,16,LEISURE,10,STAY,T,RETAIL,1,1,1
1,2014-11-16,2014-12-19,FR,DTT,PAR,NEW_BOOKING,32,LEISURE,-3,RETURN_HOME,T,OTHER,1,4,0
2,2014-11-16,2014-11-19,IL,TLV,PAR,PARTIAL_CANCELLATION,3,LEISURE,5,STAY,T,RETAIL,-1,0,1
3,2014-11-16,2015-01-28,AM,LAX,PAR,NEW_BOOKING,72,LEISURE,-4,SHORT_TRANSFER,T,OTHER,3,2,1
4,2014-11-16,2014-12-18,US,ATL,PAR,PARTIAL_ADDITION,31,LEISURE,-2,DWELLING_TRANSFER,T,OTHER,1,0,5


In [7]:
# Add each passenger-profile indicator to the frame as a column named after its label.
for categoria, indicador in data.items():
    nov2014[categoria] = indicador

In [8]:
# Preview the frame now that the passenger-profile indicator columns have been appended.
nov2014.head()

Unnamed: 0,bookingdate,arrivaldate,poocountry,origincity,destinationcity,bookingsign,leadtime,paxprofile,lengthofstay,losname,cabinclass,distchannel,pax,numpss,numnss,BUSINESS,GROUP,LEISURE,VFR
0,2014-11-16,2014-12-02,OM,LON,PAR,NEW_BOOKING,16,LEISURE,10,STAY,T,RETAIL,1,1,1,False,False,True,False
1,2014-11-16,2014-12-19,FR,DTT,PAR,NEW_BOOKING,32,LEISURE,-3,RETURN_HOME,T,OTHER,1,4,0,False,False,True,False
2,2014-11-16,2014-11-19,IL,TLV,PAR,PARTIAL_CANCELLATION,3,LEISURE,5,STAY,T,RETAIL,-1,0,1,False,False,True,False
3,2014-11-16,2015-01-28,AM,LAX,PAR,NEW_BOOKING,72,LEISURE,-4,SHORT_TRANSFER,T,OTHER,3,2,1,False,False,True,False
4,2014-11-16,2014-12-18,US,ATL,PAR,PARTIAL_ADDITION,31,LEISURE,-2,DWELLING_TRANSFER,T,OTHER,1,0,5,False,False,True,False


In [9]:
# Integer-typed variables listed in the summary table. An explicit copy is
# taken because the following cells append columns to this frame.
numnov2014 = nov2014.loc[:, list(summary.get_cols_by_type("int"))].copy()

# Passenger-profile indicator columns, selected by name (the keys of `data`)
# rather than by position, so the selection survives schema changes in the CSV.
boolean = nov2014.loc[:, list(data)]

In [10]:
# Append every indicator column to the numeric frame under its original name.
for nombre, serie in boolean.items():
    numnov2014[nombre] = serie

In [11]:
# Append the target last: the train/test split takes every column but the
# last one (iloc[:, :-1]) as features and `bookingsign` as the label.
numnov2014['bookingsign'] = nov2014['bookingsign']

In [12]:
# Preview the modelling frame: integer variables, profile indicators and the bookingsign target.
numnov2014.head()

Unnamed: 0,leadtime,lengthofstay,pax,numnss,numpss,BUSINESS,GROUP,LEISURE,VFR,bookingsign
0,16,10,1,1,1,False,False,True,False,NEW_BOOKING
1,32,-3,1,0,4,False,False,True,False,NEW_BOOKING
2,3,5,-1,1,0,False,False,True,False,PARTIAL_CANCELLATION
3,72,-4,3,1,2,False,False,True,False,NEW_BOOKING
4,31,-2,1,5,0,False,False,True,False,PARTIAL_ADDITION


In [13]:
from sklearn.model_selection import train_test_split
# Features are every column except the last one; the label is `bookingsign`.
features = numnov2014.iloc[:, :-1]
target = numnov2014.bookingsign

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=88,
)

# SVM

In [None]:
from sklearn.model_selection import cross_val_score
import math
import pandas as pd
from sklearn import svm

def plot_accuracy(nombres, nn_list_scores, errorInterval, ylim=(0, 0.15)):
    """Draw a bar chart of error rates with error bars and show it.

    Parameters
    ----------
    nombres : sequence
        One label per bar; becomes the "Tipo" index of the plotted frame.
    nn_list_scores : sequence
        Bar heights, plotted as the "Error" column.
    errorInterval : sequence
        Error-bar sizes, forwarded to pandas as `yerr`.
    ylim : tuple, optional
        (lower, upper) limits of the y axis. Defaults to (0, 0.15), the
        limits this function previously hard-coded.
    """
    errores = pd.DataFrame({"Tipo": nombres, "Error": nn_list_scores}).set_index("Tipo")
    # The earlier version passed ylim=(0.0, 0.5) and rot=0 here and then
    # overrode both straight away; only the effective settings are kept.
    ax = errores.plot.bar(yerr=errorInterval, figsize=(9, 6), rot=90)
    ax.set_ylim(*ylim)
    plt.show()

# Cross-validated error (1 - accuracy) and display label of each evaluated model.
cv_scores = []
nombres = []

# Only the linear kernel is evaluated in this cell.
# NOTE(review): the scikit-learn docs warn that SVC fit time grows at least
# quadratically with the number of samples; X_train holds 80% of the ~5.2M
# rows, so this may not finish in practical time. Consider subsampling or
# LinearSVC.
for i in ['linear']:
    for c in [100]:
        print('ENTRO')
        clf = svm.SVC(kernel=i, C=c)
        scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring="accuracy")
        cv_scores.append(1 - scores.mean())
        print(cv_scores)
        nombres.append(f'SVC: kernel {i} con C={c}')
        print(nombres)

# Half-width of a 95% normal-approximation interval for each error rate.
# The sample size is len(Y_train): cross-validation runs on the training
# split (the previous `len(Y)` referenced an undefined name).
errorInterval = [1.96 * math.sqrt(x * (1 - x) / len(Y_train)) for x in cv_scores]
print(errorInterval)
plot_accuracy(nombres, cv_scores, errorInterval)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes

ENTRO


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_folds = np.zeros(n_samples, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprec

In [None]:
# Evaluate the remaining kernels and append their cross-validated error to the
# `cv_scores` / `nombres` lists started in the previous cell. The kernel loop
# is explicit here: this cell used to be a fragment of a loop body with
# inconsistent indentation, relying on `i` and `c` left over from another cell.
for i in ['rbf', 'poly']:
    for c in [100]:
        if i == 'poly':
            # Polynomial kernel: try degrees 2 and 3.
            for d in [2, 3]:
                print(f' Entro con grado:   {d}')
                clf = svm.SVC(kernel=i, C=c, degree=d)
                scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring="accuracy")
                cv_scores.append(1 - scores.mean())
                print(cv_scores)
                nombres.append(f'SVC: kernel {i} de grado {d} con C={c}')
                print(nombres)
        if i == 'rbf':
            # RBF kernel with the 'scale' gamma setting.
            for g in ['scale']:
                print(f'Entramos en la última parte  {c}')
                clf = svm.SVC(kernel=i, C=c, gamma=g)
                scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring="accuracy")
                cv_scores.append(1 - scores.mean())
                print(cv_scores)
                nombres.append(f'SVC: kernel {i} con C={c} y gamma={g}')
                print(nombres)

# Refresh the comparison chart so it includes the kernels evaluated here.
# Half-width of a 95% normal-approximation interval, using the size of the
# training split that cross-validation ran on.
errorInterval = [1.96 * math.sqrt(x * (1 - x) / len(Y_train)) for x in cv_scores]
print(errorInterval)
plot_accuracy(nombres, cv_scores, errorInterval)