In [21]:
import numpy as np
import pandas as pd

In [22]:
# Create dataset from Google Drive
dataset_url = "https://raw.githubusercontent.com/FajarAndrianto037/data/main/train.csv"
data = pd.read_csv(dataset_url, index_col='PassengerId')

In [23]:
data

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [24]:
# Show dataset shape
number_of_columns = data.shape[1]

In [25]:
# Show all columns for dataset
pd.set_option('display.max_columns', number_of_columns)
pd.set_option('display.max_rows', number_of_columns)

In [26]:
# Show all columns from dataframe
data.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## Categorical/Nomimal Features
- Survived
- Embarked

In [27]:
data[["Name","Survived", "Embarked"]].head(5)

Unnamed: 0_level_0,Name,Survived,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"Braund, Mr. Owen Harris",0,S
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,C
3,"Heikkinen, Miss. Laina",1,S
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,S
5,"Allen, Mr. William Henry",0,S


## Change Values to 1/0
- Take all values of ‘Embarked’ series

- If values is S change to 1

- If values is C change to 0

In [28]:
# embark code
code_embark_for_cherbourg = "C"
code_embark_for_southampton = "S"

# binary value
value_of_one = 1
value_of_zero = 0

def change_code_embark_to_biner(embarked):
    return value_of_one if embarked == code_embark_for_southampton else value_of_zero

In [29]:
# Update all values of 'Embarked' series
data["Embarked"] = data["Embarked"].apply(change_code_embark_to_biner)

In [30]:
data[["Name","Survived", "Embarked"]].head(5)

Unnamed: 0_level_0,Name,Survived,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"Braund, Mr. Owen Harris",0,1
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0
3,"Heikkinen, Miss. Laina",1,1
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,1
5,"Allen, Mr. William Henry",0,1


In [31]:
# CONSTAN VARIABLE
DECREMENT_BY_ONE = 1
INCREMENT_BY_ONE = 1

CONTINGENCY_TABLE_VALUE = {
    "q" : (1,1),
    "r" : (1,0),
    "s" : (0,1),
    "t" : (0,0),
}

In [32]:
def get_series(data, idx, series):
    return data.loc[(idx), series]

In [33]:
def get_dissimilarity_dataset(data, series_index = [], series = []):
    first_series = get_series(data, series_index[0], series)
    second_series = get_series(data, series_index[1], series)
    dataset = pd.concat([first_series,second_series],axis=1)
    return dataset.T

In [34]:
get_dissimilarity_dataset(data, [1,2], ["Survived", "Embarked"]).T

Unnamed: 0,1,2
Survived,0,1
Embarked,1,0


In [35]:
data.loc[0:5, ["Survived", "Embarked"]]

Unnamed: 0_level_0,Survived,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,1
2,1,0
3,1,1
4,1,1
5,0,1


In [36]:
def count_contingency_value(data, start_index = 0, last_index = 1):

    CONTINGENCY_VALUE = {
        "q" : 0,
        "r" : 0,
        "s" : 0,
        "t" : 0,
    }

    column_range = data.shape[1]

    for column in range(column_range):
        for value in CONTINGENCY_TABLE_VALUE:
            item = list((tuple(data.loc[(start_index):(last_index), data.columns[column]]) == CONTINGENCY_TABLE_VALUE[value], value))
            if item[0] == True:
                if item[1] == "q":
                    CONTINGENCY_VALUE["q"] += 1
                if item[1] == "r":
                    CONTINGENCY_VALUE["r"] += 1
                if item[1] == "s":
                    CONTINGENCY_VALUE["s"] += 1
                if item[1] == "t":
                    CONTINGENCY_VALUE["t"] += 1

    return CONTINGENCY_VALUE

In [37]:
# d(1,2)
data_1_2 = get_dissimilarity_dataset(data, [1,2], ["Survived", "Embarked"])

In [38]:
c_d_1_2 = count_contingency_value(data_1_2, 1, 2)

In [39]:
# d(1,3)
data_1_3 = get_dissimilarity_dataset(data, [1,3], ["Survived", "Embarked"])

In [40]:
c_d_1_3 = count_contingency_value(data_1_3, 1, 3)

In [41]:
# d(1,4)
data_1_4 = get_dissimilarity_dataset(data, [1,4], ["Survived", "Embarked"])

In [42]:
c_data_1_4 = count_contingency_value(data_1_4, 1, 4)

In [43]:
# d(1,5)
data_1_5 = get_dissimilarity_dataset(data, [1,5], ["Survived", "Embarked"])

In [44]:
c_data_1_5 = count_contingency_value(data_1_5, 1, 5)

# Dissimilarity Binary Assymetric Value Formula

In [45]:
def measure_dissimilarity_binary_value_assymetric_distance(contingency_value):

    return (contingency_value["r"] + contingency_value["s"]) / (contingency_value["q"] + contingency_value["r"] + contingency_value["s"])

In [46]:
d_1_2 = measure_dissimilarity_binary_value_assymetric_distance(c_d_1_2)
d_1_3 = measure_dissimilarity_binary_value_assymetric_distance(c_d_1_2)
d_1_4 = measure_dissimilarity_binary_value_assymetric_distance(c_d_1_2)

In [47]:
d_1_2

1.0

In [48]:
d_1_3

1.0

In [49]:
d_1_4

1.0