In [1]:
import pandas as pd
import os

SRC_DIR = os.path.abspath(".")         # endereço do script
BASE_DIR = os.path.dirname(SRC_DIR)
DATA_DIR = os.path.join(BASE_DIR, "data")

data_file = os.path.join(DATA_DIR, 'train.csv')
titanic_train = pd.read_csv(data_file)
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
titanic_train.drop(['PassengerId',  'Cabin', 'Ticket', 'Name'], axis = 1,  inplace = True)
titanic_train.sample(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
477,0,3,male,29.0,1,0,7.0458,S
449,1,1,male,52.0,0,0,30.5,S
86,0,3,male,16.0,1,3,34.375,S
108,0,3,male,38.0,0,0,7.8958,S
803,1,3,male,0.42,0,1,8.5167,C
201,0,3,male,,8,2,69.55,S
386,0,3,male,1.0,5,2,46.9,S
15,1,2,female,55.0,0,0,16.0,S
353,0,3,male,25.0,1,0,17.8,S
616,0,3,male,34.0,1,1,14.4,S


In [4]:
titanic_train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
titanic_train.describe(include=['O'])

Unnamed: 0,Sex,Embarked
count,891,889
unique,2,3
top,male,S
freq,577,644


In [6]:
titanic_train.Age.median()

28.0

In [7]:
titanic_train.Embarked.mode()

0    S
dtype: object

In [8]:
titanic_train.fillna({'Age': titanic_train['Age'].median(), 
                               'Embarked': 'S'}, inplace = True)
titanic_train.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

Separando as features e a variável target
***

In [9]:
titanic_train['Pclass'] = titanic_train['Pclass'].astype('object')
target = titanic_train['Survived']
data=titanic_train.drop(columns=["Survived"])

In [10]:
from sklearn.model_selection import train_test_split
data_train, data_test, target_train, target_test = train_test_split(data,target, random_state=123)

In [11]:
from sklearn.compose import make_column_selector as selector

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(data_train)
categorical_columns=categorical_columns_selector(data_train)
display(numerical_columns)
display(categorical_columns)

['Age', 'SibSp', 'Parch', 'Fare']

['Pclass', 'Sex', 'Embarked']

In [12]:
from sklearn.neighbors import KNeighborsClassifier
data_train_num = data_train[numerical_columns]
model = KNeighborsClassifier(n_neighbors=30)

In [13]:
model.fit(data_train_num, target_train)

KNeighborsClassifier(n_neighbors=30)

In [14]:
first_data_values = data_train_num.iloc[:10]
first_predictions=model.predict(first_data_values)
first_predictions

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [15]:
target_first_values = target_train[:10]
(target_first_values==first_predictions).mean()

0.7

Vamos carregar os dados de teste e computar a acurácia nos dados de treinamento

In [16]:
numerical_columns_test = numerical_columns_selector(data_test)
categorical_columns_test=categorical_columns_selector(data_test)
display(numerical_columns_test)
display(categorical_columns_test)

['Age', 'SibSp', 'Parch', 'Fare']

['Pclass', 'Sex', 'Embarked']

In [17]:
data_test_num = data_test[numerical_columns_test]
model.score(data_test_num, target_test)

0.7309417040358744

Usando todas as colunas como irá melhorar meu modelo ?


In [18]:
target = titanic_train['Survived']
data=titanic_train.drop(columns=["Survived"])
from sklearn.model_selection import train_test_split
data_train, data_test, target_train, target_test = train_test_split(data,target, random_state=123)
numerical_columns = numerical_columns_selector(data)
categorical_columns=categorical_columns_selector(data)
display(numerical_columns)
display(categorical_columns)

['Age', 'SibSp', 'Parch', 'Fare']

['Pclass', 'Sex', 'Embarked']

In [19]:
data_train.shape, data_test.shape

((668, 7), (223, 7))

In [20]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_preprocessor = OneHotEncoder(handle_unknown='ignore')
numerical_preprocessor =  StandardScaler()

In [21]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard-scaler', numerical_preprocessor, numerical_columns)
])

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

knn = make_pipeline(preprocessor, KNeighborsClassifier(n_neighbors=30))
logit = make_pipeline(preprocessor, LogisticRegression(max_iter = 500))

In [23]:
from sklearn import set_config
set_config(display = 'diagram')
knn

In [24]:
logit

In [25]:
_ = knn.fit(data_train, target_train)
_ = logit.fit(data_train, target_train)

In [26]:
pred_knn = knn.predict(data_test)
pred_logit = logit.predict(data_test)

In [27]:
display(pred_knn[:10])

array([1, 0, 0, 0, 0, 0, 1, 1, 1, 1])

In [28]:
display(knn.score(data_test, pred_knn))
display(logit.score(data_test, pred_logit))

1.0

1.0

In [29]:
from sklearn.model_selection import cross_validate
cv_results = cross_validate(knn, data, target, cv = 5)
cv_results

{'fit_time': array([0.02462411, 0.02449608, 0.02082825, 0.01962376, 0.0342927 ]),
 'score_time': array([0.03616619, 0.03601646, 0.03301287, 0.03214502, 0.03812003]),
 'test_score': array([0.77094972, 0.81460674, 0.8258427 , 0.78651685, 0.84831461])}

In [30]:
scores = cv_results['test_score']
print(f"The mean cross-validation accuracy is: \n {scores.mean():.3f} +/- {scores.std():.3f}")

The mean cross-validation accuracy is: 
 0.809 +/- 0.028


In [31]:
cv_results2 = cross_validate(logit, data, target, cv = 5)
cv_results2

{'fit_time': array([0.09372044, 0.08020139, 0.04657102, 0.03475571, 0.03472376]),
 'score_time': array([0.01039028, 0.03746343, 0.00886512, 0.01104569, 0.0169208 ]),
 'test_score': array([0.7877095 , 0.78651685, 0.78651685, 0.76966292, 0.83146067])}

In [32]:
scores = cv_results2['test_score']
print(f"The mean cross-validation accuracy is: \n {scores.mean():.3f} +/- {scores.std():.3f}")

The mean cross-validation accuracy is: 
 0.792 +/- 0.021


In [41]:
model = make_pipeline(preprocessor, LogisticRegression())

In [44]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

para_distributions = {
    "logisticregression__C":loguniform(0.001, 10),
}

In [45]:
import numpy as np

model_random_search=RandomizedSearchCV(
    model, param_distributions= para_distributions, 
    n_iter=20, error_score=np.nan, n_jobs=-1, verbose = 1, random_state=1
)

model_random_search.fit(data_train, target_train)
model_random_search.best_params_ 

Fitting 5 folds for each of 20 candidates, totalling 100 fits
