In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

def split_label(df, test_size, label, random_state=None):
    """Split a DataFrame into train/test feature frames and label series.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the label column.
    test_size : float or int
        Forwarded unchanged to ``train_test_split`` as ``test_size``.
    label : str
        Name of the column to use as the prediction target.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. Pass an int to get the same split
        on every run; the default keeps the original unseeded behaviour.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y)`` where the ``*_x`` items hold
        every column except ``label`` and the ``*_y`` items hold ``label``.

    Raises
    ------
    KeyError
        If ``label`` is not a column of ``df``.
    """
    # Resolve the feature columns first so a wrong label name fails before any splitting work.
    features = df.columns.drop(label)
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)
    return train[features], train[label], test[features], test[label]

In [2]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import pandas as pd

# Load the Titanic dataset and split it into train/test parts, with 'Survived' as the
# label column and a test_size of 0.2.
titanic = pd.read_csv('titanic_ml.csv')
train_x, train_y, test_x, test_y = split_label(df=titanic, test_size=0.2, label='Survived')

In [3]:
# One-hot encode only the 'Embarked' column; every other column passes through unchanged.
# handle_unknown='ignore' makes an 'Embarked' value that was not present when the encoder
# was fitted encode as all zeros at transform time instead of raising, which can otherwise
# happen when the random train/test split leaves a category out of the training part.
ohe = ColumnTransformer(
    [('embarked_ohe', OneHotEncoder(categories='auto', handle_unknown='ignore'), ['Embarked'])],
    remainder='passthrough',
)

min_max_scaler = MinMaxScaler()
svm = SVC()

In [4]:
# Chain encoding -> scaling -> classifier so the three steps are fitted and applied as one estimator.
pipe = Pipeline(
    steps=[
        ('ohe', ohe),
        ('sca', min_max_scaler),
        ('clf', svm),
    ]
)

In [5]:
# Fit every pipeline step on the training split (fit returns the pipeline itself), then
# display the fitted pipeline's score on the held-out split.
pipe.fit(train_x, train_y).score(test_x, test_y)

0.7692307692307693

In [6]:
# Predicted label for each row of the held-out split, produced by the fitted pipeline.
pipe.predict(X=test_x)

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0], dtype=int64)

In [11]:
from sklearn.cluster import KMeans
import joblib

# Preprocess by hand (one-hot encode 'Embarked', then min-max scale), cluster, and persist
# ONLY the fitted KMeans. The fitted encoder and scaler are not written to disk, so whoever
# loads 'kmeans.pkl' has to reproduce the same preprocessing before calling it.
# NOTE(review): `titanic` still contains the 'Survived' label column, so it is one of the
# clustering features - confirm that is intended.
ohe = ColumnTransformer([('embarked_ohe', OneHotEncoder(categories='auto'), ['Embarked'])], remainder='passthrough')
titanic_1 = ohe.fit_transform(titanic)
min_max_scaler = MinMaxScaler()
titanic_2 = min_max_scaler.fit_transform(titanic_1)

# Fixed seed: KMeans chooses its initial centroids at random, so without one the cluster
# assignments (and therefore the pickled model) can differ from run to run.
clu = KMeans(n_clusters=3, random_state=42)
clu.fit(titanic_2)
joblib.dump(clu, 'kmeans.pkl')

['kmeans.pkl']

In [12]:
# Read back the KMeans model that was dumped to 'kmeans.pkl'.
loaded_clu = joblib.load(filename='kmeans.pkl')

In [13]:
# Same preprocessing + clustering expressed as a single Pipeline, so the fitted encoder,
# scaler and KMeans are persisted together in one file.
# NOTE: this cell rebinds `ohe`, `clu` and `pipe`, which earlier cells bound to other objects.
ohe = ColumnTransformer([('embarked_ohe', OneHotEncoder(categories='auto'), ['Embarked'])], remainder='passthrough')
sca = MinMaxScaler()
# Fixed seed: KMeans chooses its initial centroids at random, so seeding keeps the
# clustering reproducible across runs.
clu = KMeans(n_clusters=3, random_state=42)
pipe = Pipeline([('ohe', ohe), ('sca', sca), ('clu', clu)])

pipe.fit(titanic)
joblib.dump(pipe, 'kmeans_pipeline.pkl')

['kmeans_pipeline.pkl']

In [14]:
# Read back the whole fitted pipeline (encoder, scaler and KMeans) from 'kmeans_pipeline.pkl'.
loaded_pipe = joblib.load(filename='kmeans_pipeline.pkl')

In [15]:
#----- HYPERPARAMETER OPTIMIZATION

In [16]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over every kernel/C combination for an SVC, using GridSearchCV's
# default cross-validation and scoring.
# NOTE(review): the search is fitted on the raw train_x, without the encoding/scaling
# pipeline used for the earlier SVC - confirm that is intended.
svc = SVC()
parameters = dict(
    kernel=['linear', 'rbf'],
    C=[1, 2],
)
clf = GridSearchCV(estimator=svc, param_grid=parameters)
clf.fit(train_x, train_y)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 2], 'kernel': ['linear', 'rbf']})

In [18]:
# Display the parameter combination the grid search selected as best.
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [22]:
# Display the estimator built with the best parameter combination found by the search.
clf.best_estimator_


SVC(C=1, kernel='linear')

In [23]:
# Predicted label for each row of the held-out split, using the best estimator found by the search.
clf.predict(X=test_x)

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0], dtype=int64)