In [1]:
import pandas as pd
import re

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score

In [2]:
# Load the labelled training set, then separate the target column from the features.
df = pd.read_csv('train.csv')
y = df['Survived']
X = df.drop(columns=['Survived'])

In [3]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.1,
    random_state=1234,
)

In [4]:
class PrepProcesor(BaseEstimator, TransformerMixin):
    """Feature-engineering step applied before the column-wise transformers.

    ``fit`` learns the fill value for missing ages. ``transform`` returns a new
    frame in which:

    * ``Age`` has its missing values imputed,
    * ``CabinClass`` holds the letters of ``Cabin`` (``'M'`` when the cabin is missing),
    * ``CabinNumber`` holds the digits of ``Cabin`` as a number (``0`` when there are none),
    * missing ``Embarked`` values are replaced by ``'M'``,
    * ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` are dropped.
    """

    def fit(self, X, y=None):
        """Fit the age imputer on ``X['Age']`` and return ``self``.

        ``y`` is accepted for scikit-learn API compatibility and is not used.
        """
        # Default SimpleImputer settings are used (mean strategy per the scikit-learn docs).
        self.ageImputer = SimpleImputer()
        self.ageImputer.fit(X[['Age']])
        return self

    def transform(self, X, y=None):
        """Return an engineered copy of ``X``; the caller's frame is left untouched.

        ``y`` is accepted for scikit-learn API compatibility and is not used.
        """
        # Work on a copy: assigning columns on the argument itself would silently
        # modify the frame the caller passed to fit()/predict().
        X = X.copy()
        X['Age'] = self.ageImputer.transform(X[['Age']])

        # Missing cabins become the marker 'M'; whitespace is removed before
        # splitting the value into its letter part and its digit part.
        cabin = X['Cabin'].fillna('M').astype(str).str.replace(' ', '', regex=False)
        X['CabinClass'] = cabin.str.replace(r'[^a-zA-Z]', '', regex=True)

        # NOTE(review): multi-cabin entries such as 'C23 C25 C27' yield the
        # concatenated digits (232527); this matches the previous behaviour.
        digits = cabin.str.replace(r'[^0-9]', '', regex=True)
        # Emit a numeric column instead of a mix of digit strings and the int 0.
        X['CabinNumber'] = pd.to_numeric(digits.replace('', '0'))

        X['Embarked'] = X['Embarked'].fillna('M')
        return X.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [5]:
# Stand-alone preprocessor instance (it is not wired into the transformers defined below).
preproc = PrepProcesor()

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with handle_unknown='ignore' set for categories not seen during fit.
numeric_pipeline = Pipeline(steps=[('Scaler', StandardScaler())])
categorical_pipeline = Pipeline(steps=[('OneHot', OneHotEncoder(handle_unknown='ignore'))])
transformer = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'CabinNumber']),
        ('cat', categorical_pipeline, ['Sex', 'Embarked', 'CabinClass']),
    ]
)

In [6]:
# End-to-end pipeline: feature engineering -> column-wise scaling/encoding -> XGBoost classifier.
mlpipe = Pipeline(
    steps=[
        ('InitialPreproc', PrepProcesor()),
        ('Transformer', transformer),
        ('xgb', XGBClassifier()),
    ]
)

In [7]:
# Fit every pipeline step on the training split.
mlpipe.fit(X=X_train, y=y_train)

In [8]:
# Pipeline.fit returns the fitted pipeline itself, so `model` is the same object
# as `mlpipe`. NOTE: this refits the pipeline that was already fitted in the previous cell.
model = mlpipe.fit(X=X_train, y=y_train)

In [9]:
yhat = model.predict(X_test) 

In [10]:
precision_score(y_test, yhat) 

0.8529411764705882

In [11]:
recall_score(y_test, yhat) 

0.8285714285714286

In [12]:
accuracy_score(y_test, yhat) 

0.8777777777777778

In [13]:
import joblib

In [14]:
joblib.dump(model, 'xgbpipe_my_model.joblib') 

['xgbpipe_my_model.joblib']

In [15]:
model = joblib.load('xgbpipe_my_model.joblib')

In [16]:
test = pd.read_csv('test.csv')

In [17]:
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [18]:
yhat = model.predict(test)
yhat

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [19]:
columns = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

passengerid = '123456'
pclass = 1
name = 'John Smith'
sex = 'male'
age = 0
sibsp = 0
parch = 0
ticket = "12345"
fare = 0
cabin = "C52"
embarked = 'S'

def predict_result():
    """Predict survival for the passenger described by the module-level inputs.

    Reads the globals ``passengerid``, ``pclass``, ``name``, ``sex``, ``age``,
    ``sibsp``, ``parch``, ``ticket``, ``fare``, ``cabin`` and ``embarked``,
    builds a one-row frame ordered by the global ``columns`` list, passes it to
    the global ``model`` and prints the verdict. Returns ``None``.
    """
    # One-row frame built from the user input; Pclass is cast to int explicitly.
    passenger = pd.DataFrame(
        {
            'PassengerId': [passengerid],
            'Pclass': [int(pclass)],
            'Name': [name],
            'Sex': [sex],
            'Age': [age],
            'SibSp': [sibsp],
            'Parch': [parch],
            'Ticket': [ticket],
            'Fare': [fare],
            'Cabin': [cabin],
            'Embarked': [embarked],
        },
        columns=columns,
    )

    # Only the first (and only) prediction is inspected; label 1 means "survived".
    survived = model.predict(passenger)[0] == 1
    print('Passenger Survived :)' if survived else 'Passenger did not Survive :(')


#trigger = st.button('Predict', on_click=predict_result)
trigger = predict_result()

Passenger did not Survive :(


# Реализация без контейнера

In [39]:
# Reload the raw training data and separate the target column from the features.
df = pd.read_csv('train.csv')
y = df['Survived']
X = df.drop(columns=['Survived'])

In [40]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [41]:
# Same 90/10 split and seed as the split at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1234,
)

In [42]:
passengerid = '123456'
pclass = 1
name = 'John Smith'
sex = 'male'
age = 0
sibsp = 0
parch = 0
ticket = "12345"
fare = 0
cabin = "C52"
embarked = 'S'

columns = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

In [43]:
# Keep the values in a plain list: np.array() over mixed str/int values coerces
# every field to a string, which leaves all columns (including the numeric ones)
# as object dtype. A list of lists lets pandas infer a dtype per column.
row = [passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked]
X = pd.DataFrame([row], columns=columns)

In [44]:
X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,123456,1,John Smith,male,0,0,0,12345,0,C52,S


# 1. Feature engineering with the custom preprocessor

In [45]:
class PrepProcesor:
    """Stand-alone feature-engineering step (no scikit-learn base classes).

    ``fit`` learns the fill value for missing ages. ``transform`` returns a new
    frame in which:

    * ``Age`` has its missing values imputed,
    * ``CabinClass`` holds the letters of ``Cabin`` (``'M'`` when the cabin is missing),
    * ``CabinNumber`` holds the digits of ``Cabin`` as a number (``0`` when there are none),
    * missing ``Embarked`` values are replaced by ``'M'``,
    * ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` are dropped.
    """

    def fit(self, X, y=None):
        """Fit the age imputer on ``X['Age']`` and return ``self``; ``y`` is unused."""
        # Default SimpleImputer settings are used (mean strategy per the scikit-learn docs).
        self.ageImputer = SimpleImputer()
        self.ageImputer.fit(X[['Age']])
        return self

    def transform(self, X, y=None):
        """Return an engineered copy of ``X``; the caller's frame is left untouched.

        ``y`` is accepted for signature symmetry with ``fit`` and is not used.
        """
        # Work on a copy: assigning columns on the argument itself would silently
        # modify the frame the caller passed in.
        X = X.copy()
        X['Age'] = self.ageImputer.transform(X[['Age']])

        # Missing cabins become the marker 'M'; whitespace is removed before
        # splitting the value into its letter part and its digit part.
        cabin = X['Cabin'].fillna('M').astype(str).str.replace(' ', '', regex=False)
        X['CabinClass'] = cabin.str.replace(r'[^a-zA-Z]', '', regex=True)

        # NOTE(review): multi-cabin entries such as 'C23 C25 C27' yield the
        # concatenated digits (232527); this matches the previous behaviour.
        digits = cabin.str.replace(r'[^0-9]', '', regex=True)
        # Emit a numeric column instead of a mix of digit strings and the int 0.
        X['CabinNumber'] = pd.to_numeric(digits.replace('', '0'))

        X['Embarked'] = X['Embarked'].fillna('M')
        return X.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Example usage: fit on the training split only, then apply the same fitted
# preprocessing to the training split, the test split and the single-row frame.
preproc = PrepProcesor().fit(X_train)
X_train, X_test, X = (preproc.transform(frame) for frame in (X_train, X_test, X))

In [47]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinClass,CabinNumber
267,3,male,25.000000,1,0,7.7750,S,M,0
635,2,female,28.000000,0,0,13.0000,S,M,0
473,2,female,23.000000,0,0,13.7917,C,D,0
207,3,male,26.000000,0,0,18.7875,C,M,0
290,1,female,26.000000,0,0,78.8500,S,M,0
...,...,...,...,...,...,...,...,...,...
204,3,male,18.000000,0,0,8.0500,S,M,0
53,2,female,29.000000,1,0,26.0000,S,M,0
294,3,male,24.000000,0,0,7.8958,S,M,0
723,2,male,50.000000,0,0,13.0000,S,M,0


In [46]:
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinClass,CabinNumber
523,1,female,44.000000,0,1,57.9792,C,B,18
778,3,male,29.601302,0,0,7.7375,Q,M,0
760,3,male,29.601302,0,0,14.5000,S,M,0
496,1,female,54.000000,1,0,78.2667,C,D,20
583,1,male,36.000000,0,0,40.1250,C,A,10
...,...,...,...,...,...,...,...,...,...
72,2,male,21.000000,0,0,73.5000,S,M,0
271,3,male,25.000000,0,0,0.0000,S,M,0
281,3,male,28.000000,0,0,7.8542,S,M,0
230,1,female,35.000000,1,0,83.4750,S,C,83


In [48]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinClass,CabinNumber
0,1,male,0.0,0,0,0,S,C,52


# 2. Scaling the numeric features

In [49]:
from sklearn.preprocessing import StandardScaler

# Numeric features to standardise
numeric_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'CabinNumber']

# Learn the scaling statistics from the training split only
scaler = StandardScaler().fit(X_train[numeric_features])

# Apply the fitted scaler in place to the training split, the test split and
# the single-row frame. Re-running this cell scales the data a second time.
for frame in (X_train, X_test, X):
    frame[numeric_features] = scaler.transform(frame[numeric_features])

In [50]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinClass,CabinNumber
267,0.831027,male,-3.494410e-01,0.455773,-0.469923,-0.504690,S,M,-0.050694
635,-0.359765,female,-1.216092e-01,-0.480315,-0.469923,-0.396790,S,M,-0.050694
473,-0.359765,female,-5.013289e-01,-0.480315,-0.469923,-0.380440,C,D,-0.050694
207,0.831027,male,-2.734971e-01,-0.480315,-0.469923,-0.277273,C,M,-0.050694
290,-1.550556,female,-2.734971e-01,-0.480315,-0.469923,0.963063,S,M,-0.050694
...,...,...,...,...,...,...,...,...,...
204,0.831027,male,-8.810486e-01,-0.480315,-0.469923,-0.499011,S,M,-0.050694
53,-0.359765,female,-4.566527e-02,0.455773,-0.469923,-0.128330,S,M,-0.050694
294,0.831027,male,-4.253850e-01,-0.480315,-0.469923,-0.502195,S,M,-0.050694
723,-0.359765,male,1.549157e+00,-0.480315,-0.469923,-0.396790,S,M,-0.050694


In [51]:
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinClass,CabinNumber
523,-1.550556,female,1.093494e+00,-0.480315,0.748227,0.532065,C,B,-0.050688
778,0.831027,male,-2.698071e-16,-0.480315,-0.469923,-0.505464,Q,M,-0.050694
760,0.831027,male,-2.698071e-16,-0.480315,-0.469923,-0.365813,S,M,-0.050694
496,-1.550556,female,1.852933e+00,0.455773,-0.469923,0.951018,C,D,-0.050687
583,-1.550556,male,4.859423e-01,-0.480315,-0.469923,0.163362,C,A,-0.050691
...,...,...,...,...,...,...,...,...,...
72,-0.359765,male,-6.532168e-01,-0.480315,-0.469923,0.852582,S,M,-0.050694
271,0.831027,male,-3.494410e-01,-0.480315,-0.469923,-0.665249,S,M,-0.050694
281,0.831027,male,-1.216092e-01,-0.480315,-0.469923,-0.503054,S,M,-0.050694
230,-1.550556,female,4.099984e-01,0.455773,-0.469923,1.058573,S,C,-0.050665


In [52]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinClass,CabinNumber
0,-1.550556,male,-2.248039,-0.480315,-0.469923,-0.665249,S,C,-0.050676


In [53]:
# Replace the shuffled row labels left over from the split with a positional
# 0..n-1 index on both splits.
X_train, X_test = (frame.reset_index(drop=True) for frame in (X_train, X_test))

# 3. One-hot encoding the categorical features

In [54]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# X_train, X_test and X must already be defined (preprocessed and scaled).

# Categorical features to one-hot encode
categorical_features = ['Sex', 'Embarked', 'CabinClass']

# Shapes before encoding
print(f"X_train shape before encoding: {X_train.shape}")
print(f"X_test shape before encoding: {X_test.shape}")

# Create the encoder and fit it on the training split only
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train[categorical_features])

# Encode the training split, the test split and the single-row frame
X_train_encoded = encoder.transform(X_train[categorical_features]).toarray()
X_test_encoded = encoder.transform(X_test[categorical_features]).toarray()
X_encoded = encoder.transform(X[categorical_features]).toarray()

# Shapes after encoding
print(f"X_train_encoded shape: {X_train_encoded.shape}")
print(f"X_test_encoded shape: {X_test_encoded.shape}")

# Wrap the encoded arrays in DataFrames. Each one reuses the index of its
# source frame so that the column-wise concat below aligns row for row even
# when the source index is not 0..n-1; with a default RangeIndex, mismatched
# labels would produce extra rows filled with NaN.
encoded_columns = encoder.get_feature_names_out(categorical_features)
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_columns, index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_columns, index=X_test.index)
X_encoded_df = pd.DataFrame(X_encoded, columns=encoded_columns, index=X.index)

# Replace the raw categorical columns with their encoded counterparts
X_train = pd.concat([X_train.drop(categorical_features, axis=1), X_train_encoded_df], axis=1)
X_test = pd.concat([X_test.drop(categorical_features, axis=1), X_test_encoded_df], axis=1)
X = pd.concat([X.drop(categorical_features, axis=1), X_encoded_df], axis=1)

# Shapes after merging
print(f"X_train shape after encoding: {X_train.shape}")
print(f"X_test shape after encoding: {X_test.shape}")

X_train shape before encoding: (801, 9)
X_test shape before encoding: (90, 9)
X_train_encoded shape: (801, 22)
X_test_encoded shape: (90, 22)
X_train shape after encoding: (801, 28)
X_test shape after encoding: (90, 28)


In [55]:
X_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,CabinNumber,Sex_female,Sex_male,Embarked_C,Embarked_M,...,CabinClass_CC,CabinClass_CCC,CabinClass_D,CabinClass_DD,CabinClass_E,CabinClass_F,CabinClass_FG,CabinClass_G,CabinClass_M,CabinClass_T
0,0.831027,-3.494410e-01,0.455773,-0.469923,-0.504690,-0.050694,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.359765,-1.216092e-01,-0.480315,-0.469923,-0.396790,-0.050694,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.359765,-5.013289e-01,-0.480315,-0.469923,-0.380440,-0.050694,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.831027,-2.734971e-01,-0.480315,-0.469923,-0.277273,-0.050694,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-1.550556,-2.734971e-01,-0.480315,-0.469923,0.963063,-0.050694,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796,0.831027,-8.810486e-01,-0.480315,-0.469923,-0.499011,-0.050694,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
797,-0.359765,-4.566527e-02,0.455773,-0.469923,-0.128330,-0.050694,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
798,0.831027,-4.253850e-01,-0.480315,-0.469923,-0.502195,-0.050694,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
799,-0.359765,1.549157e+00,-0.480315,-0.469923,-0.396790,-0.050694,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [56]:
X_test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,CabinNumber,Sex_female,Sex_male,Embarked_C,Embarked_M,...,CabinClass_CC,CabinClass_CCC,CabinClass_D,CabinClass_DD,CabinClass_E,CabinClass_F,CabinClass_FG,CabinClass_G,CabinClass_M,CabinClass_T
0,-1.550556,1.093494e+00,-0.480315,0.748227,0.532065,-0.050688,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.831027,-2.698071e-16,-0.480315,-0.469923,-0.505464,-0.050694,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.831027,-2.698071e-16,-0.480315,-0.469923,-0.365813,-0.050694,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-1.550556,1.852933e+00,0.455773,-0.469923,0.951018,-0.050687,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.550556,4.859423e-01,-0.480315,-0.469923,0.163362,-0.050691,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,-0.359765,-6.532168e-01,-0.480315,-0.469923,0.852582,-0.050694,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
86,0.831027,-3.494410e-01,-0.480315,-0.469923,-0.665249,-0.050694,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
87,0.831027,-1.216092e-01,-0.480315,-0.469923,-0.503054,-0.050694,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
88,-1.550556,4.099984e-01,0.455773,-0.469923,1.058573,-0.050665,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,CabinNumber,Sex_female,Sex_male,Embarked_C,Embarked_M,...,CabinClass_CC,CabinClass_CCC,CabinClass_D,CabinClass_DD,CabinClass_E,CabinClass_F,CabinClass_FG,CabinClass_G,CabinClass_M,CabinClass_T
0,-1.550556,-2.248039,-0.480315,-0.469923,-0.665249,-0.050676,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
X_train.shape[0] == y_train.shape[0]

True

In [61]:
from xgboost import XGBClassifier

# Train a default-configured classifier on the preprocessed training matrix
# (fit returns the estimator itself).
model = XGBClassifier().fit(X_train, y_train)

# Class predictions for the held-out split
predictions = model.predict(X_test)

In [62]:
y_res_train = model.predict(X_train)

In [63]:
precision_score(y_train, y_res_train)

0.9897260273972602

In [64]:
recall_score(y_train, y_res_train)

0.9413680781758957

In [65]:
yhat = model.predict(X_test)

In [67]:
precision_score(y_test, yhat)

0.8529411764705882

In [68]:
recall_score(y_test, yhat)

0.8285714285714286

In [69]:
X_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,CabinNumber,Sex_female,Sex_male,Embarked_C,Embarked_M,...,CabinClass_CC,CabinClass_CCC,CabinClass_D,CabinClass_DD,CabinClass_E,CabinClass_F,CabinClass_FG,CabinClass_G,CabinClass_M,CabinClass_T
0,0.831027,-3.494410e-01,0.455773,-0.469923,-0.504690,-0.050694,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.359765,-1.216092e-01,-0.480315,-0.469923,-0.396790,-0.050694,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.359765,-5.013289e-01,-0.480315,-0.469923,-0.380440,-0.050694,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.831027,-2.734971e-01,-0.480315,-0.469923,-0.277273,-0.050694,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-1.550556,-2.734971e-01,-0.480315,-0.469923,0.963063,-0.050694,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796,0.831027,-8.810486e-01,-0.480315,-0.469923,-0.499011,-0.050694,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
797,-0.359765,-4.566527e-02,0.455773,-0.469923,-0.128330,-0.050694,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
798,0.831027,-4.253850e-01,-0.480315,-0.469923,-0.502195,-0.050694,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
799,-0.359765,1.549157e+00,-0.480315,-0.469923,-0.396790,-0.050694,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [70]:
y_test_enter_str = model.predict(X)

In [71]:
y_test_enter_str

array([0])

In [74]:
def transform(X):
    """Return an engineered copy of ``X`` for inference; ``X`` itself is not modified.

    The returned frame has:

    * ``CabinClass`` - the letters of ``Cabin`` (``'M'`` when the cabin is missing),
    * ``CabinNumber`` - the digits of ``Cabin`` as a number (``0`` when there are none),
    * missing ``Embarked`` values replaced by ``'M'``,
    * ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` dropped.

    No age imputation is performed here.
    """
    # Work on a copy: assigning columns on the argument itself would silently
    # modify the frame the caller passed in.
    X = X.copy()

    # Missing cabins become the marker 'M'; whitespace is removed before
    # splitting the value into its letter part and its digit part.
    cabin = X['Cabin'].fillna('M').astype(str).str.replace(' ', '', regex=False)
    X['CabinClass'] = cabin.str.replace(r'[^a-zA-Z]', '', regex=True)

    # NOTE(review): multi-cabin entries such as 'C23 C25 C27' yield the
    # concatenated digits (232527); this matches the previous behaviour.
    digits = cabin.str.replace(r'[^0-9]', '', regex=True)
    # Emit a numeric column instead of a mix of digit strings and the int 0.
    X['CabinNumber'] = pd.to_numeric(digits.replace('', '0'))

    X['Embarked'] = X['Embarked'].fillna('M')
    return X.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [75]:
# Column order of the raw input frame
columns = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

# Example input - did not survive
# passengerid = '123456'
# pclass = 1
# name = 'John Smith'
# sex = 'male'
# age = 0
# sibsp = 0
# parch = 0
# ticket = "12345"
# fare = 0
# cabin = "C52"
# embarked = 'S'

# Example input - survived
passengerid = '123456'
pclass = 1
name = 'John Smith'
sex = 'female'  
age = 21
sibsp = 7
parch = 1
ticket = "12345"
fare = 125
cabin = "C52"
embarked = 'S'

# Keep the values in a plain list: np.array() over mixed str/int values coerces
# every field to a string, which leaves all columns (including the numeric ones)
# as object dtype. A list of lists lets pandas infer a dtype per column.
row = [passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked]
X = pd.DataFrame([row], columns=columns)

# Apply the same steps used for training, in the same order:
# feature engineering -> scaling with the fitted scaler -> one-hot encoding
# with the fitted encoder.
X = transform(X)
X[numeric_features] = scaler.transform(X[numeric_features])
X_encoded = encoder.transform(X[categorical_features]).toarray()
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_features))
X = pd.concat([X.drop(categorical_features, axis=1), X_encoded_df], axis=1)
pred = model.predict(X)
if pred[0] == 1:
    print('Passenger Survived :)')
else:
    print('Passenger did not Survive :(')

Passenger Survived :)


In [76]:
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,CabinNumber,Sex_female,Sex_male,Embarked_C,Embarked_M,...,CabinClass_CC,CabinClass_CCC,CabinClass_D,CabinClass_DD,CabinClass_E,CabinClass_F,CabinClass_FG,CabinClass_G,CabinClass_M,CabinClass_T
0,-1.550556,-0.653217,6.072298,0.748227,1.916096,-0.050676,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
model.predict(X)

array([1])

In [81]:
import joblib

joblib.dump(model, "my_model_titanic.joblib")

['my_model_titanic.joblib']

In [82]:
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [83]:
joblib.dump(encoder, "encoder.pkl")

['encoder.pkl']