In [119]:
import pandas as pd

In [120]:
# Load the training set (it includes the Survived column used as the target further down)
# and preview the first rows.
train_data = pd.read_csv("data/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [121]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [122]:
# Count the passengers whose Age is missing.
train_data["Age"].isna().sum()

177

In [123]:
# Summary statistics of Age; missing values are excluded from the count and the statistics.
train_data['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [124]:
from sklearn.model_selection import train_test_split

# Model inputs. SibSp and Parch are not used in this run; the wider alternative would be
# ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"].
features = ["Pclass", "Sex", "Age", "Fare"]

X, y = train_data[features], train_data["Survived"]

# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.3,
  random_state=42,
)

In [125]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Per-column preprocessing:
#   - Age:  fill missing values with the column mean
#   - Fare: fill missing values with the column median
#   - Sex:  one-hot encode the category
# Any column not listed here is passed through unchanged (remainder="passthrough").
column_steps = [
  ("age", SimpleImputer(strategy="mean"), ["Age"]),
  ("fare", SimpleImputer(strategy="median"), ["Fare"]),
  ("gender", OneHotEncoder(), ["Sex"]),
]
data_transformer = ColumnTransformer(transformers=column_steps, remainder="passthrough")


In [126]:
from sklearn.model_selection import GridSearchCV

# Preprocess the full labelled set up front so the search below runs on a plain feature matrix.
# NOTE(review): this fits the imputers/encoder on every row before cross-validation, so each
# fold's validation part has already influenced the imputation statistics. Searching over a
# Pipeline (transformer + classifier) would avoid that; confirm whether it matters here.
X_prep = data_transformer.fit_transform(X)

# Two sub-grids: three forest sizes with the default bootstrap setting,
# and two forest sizes with bootstrapping switched off.
param_grid = [
  {"n_estimators": [3, 10, 30]},
  {"bootstrap": [False], "n_estimators": [3, 10]},
]

forest_clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
  estimator=forest_clf,
  param_grid=param_grid,
  scoring="accuracy",
  cv=3,
)

In [127]:
# Run the 3-fold search on the preprocessed matrix.
grid_search.fit(X_prep, y)

# Mean cross-validated accuracy for each parameter combination, then the best of them.
print(grid_search.cv_results_["mean_test_score"], grid_search.best_score_, sep="\n")

[0.78563412 0.79685746 0.7979798  0.77216611 0.78002245]
0.797979797979798


In [128]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline

# Assemble preprocessing and the tuned forest into a single estimator.
# Pipeline keeps references to the objects it is given, and fitting it refits them in place.
# Passing unfitted clones (same hyper-parameters, including random_state) means that fitting
# main_pipeline does not overwrite grid_search.best_estimator_ or the shared data_transformer,
# both of which were fitted on the full labelled set earlier in the notebook.
main_pipeline = Pipeline([
  ("transformer", clone(data_transformer)),
  ("forest_clf", clone(grid_search.best_estimator_)),
])

In [129]:
# Train on the training split and report accuracy on the held-out split.
# Pipeline.fit returns the pipeline itself, so the two calls can be chained.
main_pipeline.fit(X_train, y_train).score(X_test, y_test)

0.8022388059701493

In [130]:
# Load the test set that the submission predictions are generated for.
test_data = pd.read_csv("data/test.csv")

In [131]:
# Preview the first rows of the test set.
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [132]:
# Predict for every row of the test set in a single batch call, using the same feature columns
# the pipeline was trained on.
preds = main_pipeline.predict(test_data[features])

In [133]:
# Build the submission frame by pairing each PassengerId with its prediction.
# `preds` holds the batch predictions for test_data[features] in row order, so there is no need
# to re-filter test_data and call predict() once per passenger (quadratic in the number of rows
# and it yields the same labels).
assert len(preds) == len(test_data), "expected exactly one prediction per test row"

preds_df = pd.DataFrame({
  "PassengerId": test_data["PassengerId"].to_numpy(),
  "Survived": preds,
})

In [134]:
# Sanity check: how many passengers fall into each predicted class.
preds_df["Survived"].value_counts()

0    279
1    139
Name: Survived, dtype: int64

In [135]:
from pathlib import Path

# Write the submission file without the index column.
# The output directory is created first so that a fresh checkout, where result/ may not exist
# yet, does not fail inside to_csv.
submission_path = Path("result/submission.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
preds_df.to_csv(submission_path, index=False)