# I. **read data**

In [2]:
import pandas as pd

In [3]:
train_data = pd.read_csv(
    "train.csv",
    index_col=0,
    dtype={
        "Survived":"int64",
        "Pclass":"category",
        "Sex":"category",
        "Embarked": "category"
        }
    )
test_data = pd.read_csv(
    "test.csv",
    index_col=0,
    dtype={
        "Pclass":"category",
        "Sex":"category",
        "Embarked": "category"
        }
    )


In [4]:
#test
train_data[train_data["Sex"] == "female"]["Survived"].mean()

0.7420382165605095

# II. **Calculate the Surviving probability of categorical metrics**  
Already existing categorical metrics :   
+ Pclass   
+ Sex   
+ Embarke   

## II.1 Extract additional metric "title"

In [5]:
# extract posible titles
def get_title(string_name):
    return string_name[string_name.find(",")+2:string_name.find(".",string_name.find(","))]

train_data["temp_title"] = [get_title(i) for i in train_data["Name"]]
test_data["temp_title"] = [get_title(i) for i in test_data["Name"]]


In [6]:
# check the available title
train_data["temp_title"].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: temp_title, dtype: int64

In [7]:
# based on the ouput of above code, there are 6 titles that has enough data to calculate:
# "Mr","Miss","Mrs","Master","Dr","Rev"
# the other titles will be labeled "Other"
def get_title(temp_title_string):
    title = ["Mr","Miss","Mrs","Master","Dr","Rev"]
    if temp_title_string in title:
        return temp_title_string
    else: return "Other"
train_data["title"] = [get_title(i) for i in train_data["temp_title"]]
train_data.title = train_data.title.astype("category")
test_data["title"] = [get_title(i) for i in test_data["temp_title"]]
test_data.title = test_data.title.astype("category")

In [8]:
test_data.title.value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Other       4
Rev         2
Dr          1
Name: title, dtype: int64

In [9]:
train_data["Survived"] = train_data["Survived"].astype("int64")

## II.2 create **function** to return **probability of survival**, based on **known categorical metrics**

In [10]:
# dictionary of survival rate of each categorical metrics
Embarked = train_data.groupby("Embarked").mean().Survived.to_dict()
Pclass = train_data.groupby("Pclass").mean().Survived.to_dict()
Sex = train_data.groupby("Sex").mean().Survived.to_dict()
title = train_data.groupby("title").mean().Survived.to_dict()

In [55]:
train_data_test = train_data[
["Name","Embarked","Pclass","Sex","title","Survived"]
]

In [56]:
def return_dict_value_keyerror(dic,key):
    try:
        return dic[key]
    except KeyError:
        return "no"
def cal_survival(em,pc,se,ti):
    avr = []
    if return_dict_value_keyerror(Embarked,em) != "no":
        avr.append(return_dict_value_keyerror(Embarked,em))
    if return_dict_value_keyerror(Pclass,pc) != "no":
        avr.append(return_dict_value_keyerror(Pclass,pc))
    if return_dict_value_keyerror(Sex,se) != "no":
        avr.append(return_dict_value_keyerror(Sex,se))
    if return_dict_value_keyerror(title,ti) != "no":
        avr.append(return_dict_value_keyerror(title,ti))
    return sum(avr)/len(avr)


In [57]:
train_data_test["calculated_survival_rate"] = [cal_survival(ent[1],ent[2],ent[3],ent[4]) for ent in train_data_test.values]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_test["calculated_survival_rate"] = [cal_survival(ent[1],ent[2],ent[3],ent[4]) for ent in train_data_test.values]


In [58]:
def cal_survived(x):
    if x <= .5: return 0
    else: return 1
train_data_test["calculated_survived"] = [cal_survived(x) for x in train_data_test.calculated_survival_rate]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_test["calculated_survived"] = [cal_survived(x) for x in train_data_test.calculated_survival_rate]


In [61]:
train_data_test[train_data_test.Survived != train_data_test.calculated_survived]

Unnamed: 0_level_0,Name,Embarked,Pclass,Sex,title,Survived,calculated_survival_rate,calculated_survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
15,"Vestrom, Miss. Hulda Amanda Adolfina",S,3,female,Miss,0,0.504790,1
18,"Williams, Mr. Charles Eugene",S,2,male,Mr,1,0.288841,0
19,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",S,3,female,Mrs,0,0.528339,1
22,"Beesley, Mr. Lawrence",S,2,male,Mr,1,0.288841,0
24,"Sloper, Mr. William Thompson",S,1,male,Mr,1,0.328042,0
...,...,...,...,...,...,...,...,...
870,"Johnson, Master. Harold Theodor",S,3,male,Master,1,0.335807,0
883,"Dahlberg, Miss. Gerda Ulrika",S,3,female,Miss,0,0.504790,1
886,"Rice, Mrs. William (Margaret Norton)",Q,3,female,Mrs,0,0.541503,1
889,"Johnston, Miss. Catherine Helen ""Carrie""",S,3,female,Miss,0,0.504790,1


In [62]:
train_data_test

Unnamed: 0_level_0,Name,Embarked,Pclass,Sex,title,Survived,calculated_survival_rate,calculated_survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,"Braund, Mr. Owen Harris",S,3,male,Mr,0,0.231225,0
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",C,1,female,Mrs,1,0.679310,1
3,"Heikkinen, Miss. Laina",S,3,female,Miss,1,0.504790,1
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",S,1,female,Mrs,1,0.625156,1
5,"Allen, Mr. William Henry",S,3,male,Mr,0,0.231225,0
...,...,...,...,...,...,...,...,...
887,"Montvila, Rev. Juozas",S,2,male,Rev,0,0.249673,0
888,"Graham, Miss. Margaret Edith",S,1,female,Miss,1,0.601607,1
889,"Johnston, Miss. Catherine Helen ""Carrie""",S,3,female,Miss,0,0.504790,1
890,"Behr, Mr. Karl Howell",C,1,male,Mr,1,0.382196,0


In [63]:
189/891

0.21212121212121213

In [65]:
test_data_final = test_data[["Name","Embarked","Pclass","Sex","title"]]

In [66]:
test_data_final["Survived"] = [cal_survived(cal_survival(ent[1],ent[2],ent[3],ent[4])) for ent in test_data_final.values]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_final["Survived"] = [cal_survived(cal_survival(ent[1],ent[2],ent[3],ent[4])) for ent in test_data_final.values]


In [73]:
test_data_final[["Survived"]].sort_values(by=["PassengerId"]).to_csv("summit.csv")