In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training set from the working directory and preview the first rows.
titanic_data = pd.read_csv(filepath_or_buffer="train.csv")
titanic_data.head(n=5)

In [None]:
# Summary statistics of the raw training frame.
titanic_data.describe()

In [None]:
# Column dtypes and non-null counts of the raw training frame.
titanic_data.info()

In [None]:
# Columns excluded from the exploratory analysis below.
unused_columns = ["PassengerId", "Name", "Ticket", "Cabin"]

# Build a new frame without those columns; the raw frame is left untouched.
titanic_cleaned = titanic_data.drop(columns=unused_columns)
titanic_cleaned.head()

In [None]:
# Summary statistics after dropping the unused columns.
titanic_cleaned.describe()

In [None]:
# Number of missing values in each remaining column.
titanic_cleaned.isna().sum()

In [None]:
# Pearson correlation between the numeric columns only.
# titanic_cleaned still contains string columns ("Sex", "Embarked"); recent
# pandas versions raise on those instead of silently skipping them, so the
# frame is restricted to numeric dtypes before computing the correlation.
titanic_cleaned.select_dtypes(include="number").corr(method="pearson")

In [None]:
# Count of passengers per survival outcome.
# The result is bound to its own name: assigning it to `plt` would shadow the
# matplotlib.pyplot alias imported at the top of the notebook. The plotted
# variable is passed by keyword because current seaborn no longer accepts it
# positionally.
survival_grid = sns.catplot(x="Survived", data=titanic_data, kind="count")

In [None]:
# Histogram (raw counts) of passenger age, ignoring rows with unknown age.
# sns.distplot is deprecated/removed; histplot without a KDE is the direct
# replacement for distplot(..., kde=False).
age_ax = sns.histplot(titanic_cleaned["Age"].dropna(), bins=15)
age_ax.set(xlabel="Age", ylabel="Passengers", title="Age distribution");

In [None]:
# Age distribution per sex, drawn on one shared axis.
# sns.distplot is deprecated/removed; its default output (normalised histogram
# plus KDE curve) corresponds to histplot(..., kde=True, stat="density").
ages_female = titanic_cleaned.loc[titanic_cleaned["Sex"] == "female", "Age"].dropna()
ages_male = titanic_cleaned.loc[titanic_cleaned["Sex"] == "male", "Age"].dropna()

sex_age_ax = sns.histplot(ages_female, bins=7, kde=True, stat="density", label="female")
sns.histplot(ages_male, bins=7, kde=True, stat="density", label="male", ax=sex_age_ax)

# Labels make the two overlaid distributions distinguishable.
sex_age_ax.set(xlabel="Age", ylabel="Density", title="Age distribution by sex")
sex_age_ax.legend();

In [None]:
# Age distribution of survivors only, split by sex, on one shared axis.
# sns.distplot is deprecated/removed; its default output (normalised histogram
# plus KDE curve) corresponds to histplot(..., kde=True, stat="density").
is_survivor = titanic_cleaned["Survived"] == 1
survivor_ages_female = titanic_cleaned.loc[
    (titanic_cleaned["Sex"] == "female") & is_survivor, "Age"
].dropna()
survivor_ages_male = titanic_cleaned.loc[
    (titanic_cleaned["Sex"] == "male") & is_survivor, "Age"
].dropna()

survivor_age_ax = sns.histplot(
    survivor_ages_female, bins=7, kde=True, stat="density", label="female"
)
sns.histplot(
    survivor_ages_male, bins=7, kde=True, stat="density", label="male", ax=survivor_age_ax
)

# Labels make the two overlaid distributions distinguishable.
survivor_age_ax.set(xlabel="Age", ylabel="Density", title="Age distribution of survivors by sex")
survivor_age_ax.legend();

In [None]:
# Split the Age column by survival outcome once, then take the extremes.
ages_of_survivors = titanic_cleaned.loc[titanic_cleaned["Survived"] == 1, "Age"]
ages_of_deceased = titanic_cleaned.loc[titanic_cleaned["Survived"] == 0, "Age"]

youngest_survived = ages_of_survivors.min()
oldest_survived = ages_of_survivors.max()
youngest_died = ages_of_deceased.min()
oldest_died = ages_of_deceased.max()

# Report: youngest survivor, youngest casualty, oldest survivor, oldest casualty.
print(
    "Самый юный выживший: {0}\nСамый юный погибший: {1}\nСамый пожилой выживший: {2}\nСамый пожилой погибший: {3}".format(
        youngest_survived,
        youngest_died,
        oldest_survived,
        oldest_died,
    )
)

In [None]:
# Number of passengers in every (class, survival outcome, sex) combination.
pclass_groups = titanic_cleaned.groupby(by=["Pclass", "Survived", "Sex"])
pclass_groups.size()

In [None]:
# Summary statistics for each (class, sex) group.
class_sex_groups = titanic_cleaned.groupby(["Pclass", "Sex"])
class_sex_groups.describe()

In [None]:
# Survival rate per passenger class, in percent: survivors / passengers * 100.
survived_by_class = titanic_cleaned.groupby("Pclass")["Survived"]
survived_by_class.sum() / survived_by_class.count() * 100

In [None]:
# Survival counts, one panel per passenger class.
# factorplot was renamed to catplot and its `size` argument to `height`; the
# plotted variable must be given by keyword in current seaborn.
sns.catplot(
    x="Survived",
    col="Pclass",
    data=titanic_cleaned,
    kind="count",
    height=7,
    aspect=0.8,
)

In [None]:
# Survival counts split by sex, one panel per passenger class.
# factorplot was renamed to catplot and its `size` argument to `height`; the
# plotted variable must be given by keyword in current seaborn.
sns.catplot(
    x="Survived",
    col="Pclass",
    hue="Sex",
    data=titanic_cleaned,
    kind="count",
    height=7,
    aspect=0.8,
)

In [None]:
# Scatter of fare against age, coloured by passenger class (no regression fit).
# x/y are passed by keyword because current seaborn rejects them positionally.
# The point marker is set through `markers`: lmplot overrides any "marker"
# entry placed in scatter_kws, so the "." style never took effect there.
sns.lmplot(
    x="Age",
    y="Fare",
    data=titanic_cleaned,
    fit_reg=False,
    hue="Pclass",
    markers=".",
    scatter_kws={"s": 20},
)

In [None]:
# Scatter of fare against age, coloured by passenger class, one panel per
# embarkation port (no regression fit).
# x/y are passed by keyword because current seaborn rejects them positionally.
# The point marker is set through `markers`: lmplot overrides any "marker"
# entry placed in scatter_kws, so the "." style never took effect there.
sns.lmplot(
    x="Age",
    y="Fare",
    data=titanic_cleaned,
    fit_reg=False,
    hue="Pclass",
    col="Embarked",
    markers=".",
    scatter_kws={"s": 20},
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Held-out passengers to predict for.
test_data = pd.read_csv("test.csv")

# Target and the feature columns used by the model.
y = titanic_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch"]

# One-hot encode the categorical features of the training set.
X = pd.get_dummies(titanic_data[features])

# Encode the test set the same way, then force it onto the training columns.
# Encoding the two frames independently can yield different or differently
# ordered dummy columns when a category is missing from one of them; reindex
# adds any missing column as all-zero and drops columns the model never saw.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# Fixed random_state keeps the fitted forest reproducible between runs.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# Submission table: one predicted label per test passenger id.
output = pd.DataFrame({"PassengerId": test_data.PassengerId, "Survived": predictions})

output.to_csv("out.csv", index=False)