In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.compose import ColumnTransformer



In [21]:
df = pd.read_csv("train.csv")

PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
Destination - The planet the passenger will be debarking to.
Age - The age of the passenger.
VIP - Whether the passenger has paid for special VIP service during the voyage.
RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
Name - The first and last names of the passenger.
Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.
test.csv - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.
sample_submission.csv - A submission file in the correct format.
PassengerId - Id for each passenger in the test set.
Transported - The target. For each passenger, predict either True or False.

In [22]:
df.drop(columns=["PassengerId","Name","Cabin"],inplace=True)

In [23]:
df["Transported"] = df["Transported"].map({True:1, False:0})

In [24]:
df["Transported"]

0       0
1       1
2       0
3       0
4       1
       ..
8688    0
8689    0
8690    1
8691    0
8692    1
Name: Transported, Length: 8693, dtype: int64

In [25]:
X = df.iloc[:,0:11]
y = df["Transported"]

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=0.25)

In [27]:
cat_feat = np.array([coluna for coluna in X_train.columns if X_train[coluna].dtype.name == 'object'])

num_feat = np.array([coluna for coluna in X_train.columns if coluna not in cat_feat])

In [28]:
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer

In [29]:
cat_pipe = Pipeline([("imputer_cat", SimpleImputer(strategy="most_frequent")),("encoder", ce.OneHotEncoder()),
                    ])

num_pipe = Pipeline([("imputer_num", SimpleImputer(strategy="median"))])

In [30]:
transformer = ColumnTransformer([("num_trans", num_pipe, num_feat),
                            ("cat_trans", cat_pipe, cat_feat)])

In [31]:
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)


In [32]:
modelo = xgb.XGBClassifier()

In [34]:
modelo.fit(X_train_transformed, y_train)

In [40]:
modelo.predict(X_test_transformed)

array([1, 0, 0, ..., 1, 0, 0])

In [35]:
y_pred = modelo.predict(X_test_transformed)

In [42]:
ACC = accuracy_score(y_pred, y_test)
ACC

1.0

In [43]:
f1 = f1_score(y_pred, y_test)
f1

1.0