In [1]:
import numpy as np
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


**Preparation of a dataframe for the train and test data:**

In [2]:
# Read the training and test splits from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")

# Keep both frames in one list so later cells can apply the same
# feature engineering to each of them in a single loop.
traintest_df = [train_df, test_df]

**Which features are available in the dataset "train_df"?**

In [3]:
# Show the column names of the training frame as a plain array.
column_names = train_df.columns.values
print(column_names)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


**What is the percentage of women who have survived?**

In [4]:
# Survival flags of the female passengers in the training set.
is_female = train_df.Sex == 'female'
women = train_df.loc[is_female, "Survived"]

# Survivors divided by head count; note this is a fraction in [0, 1],
# not a value scaled to 0-100.
rate_women = sum(women) / len(women)

print("Percent of women who survived:", rate_women)

Percent of women who survived: 0.7420382165605095


**What is the percentage of men who have survived?**

In [5]:
# Survival flags of the male passengers in the training set.
is_male = train_df.Sex == 'male'
men = train_df.loc[is_male, "Survived"]

# Survivors divided by head count; note this is a fraction in [0, 1],
# not a value scaled to 0-100.
rate_men = sum(men) / len(men)

print("Percent of men who survived:", rate_men)

Percent of men who survived: 0.18890814558058924


In [6]:
# Summarise the object-typed (non-numeric) columns: count, unique, top, freq.
categorical_summary = train_df.describe(include=['O'])
categorical_summary

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Ponesell, Mr. Martin",male,1601,G6,S
freq,1,577,7,4,644


In [7]:
# Mean survival per passenger class, highest survival rate first.
pclass_survival = (
    train_df[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
pclass_survival

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


**Checks based on the problem description and the first data checks:**

1. Women (Sex=female) survived at a 74% rate (0.7420382165605095)
2. 1st class passengers (Pclass=1) survived at a 63% rate (0.629630)

**Play around with the names and titles:**

In [8]:
# Derive a 'Title' column from 'Name' in both frames. The pattern captures a
# run of letters that is preceded by a space and followed by a period.
# A raw string is used so that `\.` reaches the regex engine without Python
# treating it as an (invalid) string escape sequence.
for dataset in traintest_df:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Cross-tabulate the extracted titles against sex for the training data.
pd.crosstab(train_df['Title'], train_df['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


**Group names and titles:**

In [9]:
# Titles that are collapsed into a single 'Other' bucket.
rare_titles = ['Capt', 'Col', 'Countess', 'Lady', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

# One lookup table for every title rewrite: rare titles become 'Other',
# and the variant spellings are folded into 'Miss' / 'Mrs'.
title_groups = {rare: 'Other' for rare in rare_titles}
title_groups.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for frame in traintest_df:
    frame['Title'] = frame['Title'].replace(title_groups)

# Mean survival per grouped title in the training data.
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Other,0.347826


**Checks II based on the problem description and the first data checks:**

1. Women (Sex=female) survived at a 74% rate (0.7420382165605095)
2. 1st class passengers (Pclass=1) survived at a 63% rate (0.629630)
3. Passengers with Title == Master survived at a 58% rate (0.575000)


In [10]:
# Numeric code for each grouped title; anything not listed is coded as 0.
title_clear = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5}

for frame in traintest_df:
    # Titles missing from the table map to NaN, which is then filled with 0.
    frame['Title'] = frame['Title'].map(title_clear).fillna(0)

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


**Actions:**
1. drop the Name feature from training and testing datasets
2. drop PassengerId feature in the training dataset

In [11]:
# 'Name' is removed from both frames; 'PassengerId' is removed only from the
# training frame because the test frame's ids are needed for the submission.
train_df = train_df.drop(columns=['Name', 'PassengerId'])
test_df = test_df.drop(columns=['Name'])

# drop() returns new frames, so the shared list has to be rebuilt.
traintest_df = [train_df, test_df]
train_df.shape, test_df.shape

((891, 11), (418, 11))

**Action:**

1. Convert the Sex feature to a numeric encoding where female=1 and male=0.

In [12]:
# Integer encoding for the 'Sex' column.
sex_codes = {'female': 1, 'male': 0}

for frame in traintest_df:
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,0,3,0,22.0,1,0,A/5 21171,7.25,,S,1
1,1,1,1,38.0,1,0,PC 17599,71.2833,C85,C,3
2,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,,S,2
3,1,1,1,35.0,1,0,113803,53.1,C123,S,3
4,0,3,0,35.0,0,0,373450,8.05,,S,1


**Actions:**
1. drop feature "Age" 
2. drop feature "SibSp"
3. drop feature "Parch"
4. drop feature "Ticket" 
5. drop feature "Fare" 
6. drop feature "Cabin" 
7. drop feature "Embarked"

In [13]:
# Columns that are not used as model features.
unused_features = ['Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

train_df = train_df.drop(columns=unused_features)

# drop() returns a new frame, so the shared list has to be rebuilt.
traintest_df = [train_df, test_df]
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Title
0,0,3,0,1
1,1,1,1,3
2,1,3,1,2
3,1,1,1,3
4,0,3,0,1


In [14]:
# Columns that are not used as model features.
unused_features = ['Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

test_df = test_df.drop(columns=unused_features)

# drop() returns a new frame, so the shared list has to be rebuilt.
traintest_df = [train_df, test_df]
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Title
0,892,3,0,1
1,893,3,1,3
2,894,2,0,1
3,895,3,0,1
4,896,3,1,3


**Prediction model with Random Forest Classifier:**

In [15]:
# Training features (everything except the label) and the label itself.
df1_train = train_df.drop(columns="Survived")
df2_train = train_df["Survived"]

# Test features: the id column is not a feature, so it is left out here.
df1_test = test_df.drop(columns="PassengerId").copy()

df1_train.shape, df2_train.shape, df1_test.shape

((891, 3), (891,), (418, 3))

In [16]:
# Fit a 100-tree random forest on the training features. A fixed random_state
# makes the fitted model, and therefore the predictions written to the
# submission, reproducible across kernel restarts.
random_forest_1 = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_1.fit(df1_train, df2_train)
df2_pred = random_forest_1.predict(df1_test)

# NOTE: this is accuracy on the same data the model was fitted on (in percent,
# two decimals); it is not a held-out estimate of generalisation performance.
rate_random_forest = round(random_forest_1.score(df1_train, df2_train) * 100, 2)
rate_random_forest

80.02

In [17]:
# Pair each test passenger id with its predicted label.
submission_columns = {"PassengerId": test_df["PassengerId"], "Survived": df2_pred}
my_submission = pd.DataFrame(submission_columns)

# Write the submission file without the index column.
my_submission.to_csv('my_submission_AG5.csv', index=False)
print('Submission was successfully!')

Submission was successfully!
