# Jaccad Coefficient (SMC), 
## it's specifically useful for binary attributes where the focus is on the presence of attributes (ignoring the absence).

### Importing required Libraries

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import jaccard_score

### Load Datasets

In [10]:
# Load datasets
adult_df = pd.read_csv("../adult/adult_trim.data", header=None) # No header
titanic_df = pd.read_csv('../titanic/titanic_trim.csv') # Has header

# Rename columns for clarity
adult_df.columns = ["age", "workclass", "fnlwgt", "education", "education_num", 
                    "marital_status", "occupation", "relationship", "race", "sex", 
                    "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"]
adult_df.dropna(inplace=True)

In [11]:
adult_df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,29,Local-gov,115585,Some-college,10,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,50,United-States,<=50K
96,48,Self-emp-not-inc,191277,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,60,United-States,>50K
97,37,Private,202683,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,48,United-States,>50K
98,48,Private,171095,Assoc-acdm,12,Divorced,Exec-managerial,Unmarried,White,Female,0,0,40,England,<=50K


In [12]:
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
150,151,0,2,"Bateman, Rev. Robert James",male,51.0,0,0,S.O.P. 1166,12.5250,,S
151,152,1,1,"Pears, Mrs. Thomas (Edith Wearne)",female,22.0,1,0,113776,66.6000,C2,S
152,153,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,A.5. 11206,8.0500,,S
153,154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5000,,S


### Select relevant columns from Adult dataset (mix of nominal and ratio-scaled)

In [13]:
adult_df = adult_df[["age", "workclass", "education", "education_num", "sex"]]

adult_df

Unnamed: 0,age,workclass,education,education_num,sex
0,39,State-gov,Bachelors,13,Male
1,50,Self-emp-not-inc,Bachelors,13,Male
2,38,Private,HS-grad,9,Male
3,53,Private,11th,7,Male
4,28,Private,Bachelors,13,Female
...,...,...,...,...,...
95,29,Local-gov,Some-college,10,Male
96,48,Self-emp-not-inc,Doctorate,16,Male
97,37,Private,Some-college,10,Male
98,48,Private,Assoc-acdm,12,Female


### Encode nominal attributes as integers for processing

In [None]:
label_encoders = {}
for column in adult_df.columns:
    if adult_df[column].dtype == object:
        le = LabelEncoder()
        adult_df[column] = le.fit_transform(adult_df[column])
        label_encoders[column] = le

adult_df


### Clean and preprocess Titanic dataset

In [15]:
titanic_df.dropna(inplace=True)
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S
21,22,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0,D56,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S
52,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C
54,55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C


### Select relevant columns from Titanic dataset (mix of nominal and ratio-scaled)

In [16]:
titanic_df = titanic_df[["Age", "Sex", "Pclass", "Fare", "Embarked"]]
titanic_df

Unnamed: 0,Age,Sex,Pclass,Fare,Embarked
1,38.0,female,1,71.2833,C
3,35.0,female,1,53.1,S
6,54.0,male,1,51.8625,S
10,4.0,female,3,16.7,S
11,58.0,female,1,26.55,S
21,34.0,male,2,13.0,S
23,28.0,male,1,35.5,S
27,19.0,male,1,263.0,S
52,49.0,female,1,76.7292,C
54,65.0,male,1,61.9792,C


### Encode Nominal as Integers for processing

In [17]:
label_encoders_titanic = {}
for column in titanic_df.columns:
    if titanic_df[column].dtype == object:
        le = LabelEncoder()
        titanic_df[column] = le.fit_transform(titanic_df[column])
        label_encoders[column] = le

titanic_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_df[column] = le.fit_transform(titanic_df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_df[column] = le.fit_transform(titanic_df[column])


Unnamed: 0,Age,Sex,Pclass,Fare,Embarked
1,38.0,0,1,71.2833,0
3,35.0,0,1,53.1,1
6,54.0,1,1,51.8625,1
10,4.0,0,3,16.7,1
11,58.0,0,1,26.55,1
21,34.0,1,2,13.0,1
23,28.0,1,1,35.5,1
27,19.0,1,1,263.0,1
52,49.0,0,1,76.7292,0
54,65.0,1,1,61.9792,0


### Combine the datasets into a list for further processing

In [18]:

# Combine the datasets into a list for further processing
datasets = {
    "Adult Dataset": adult_df,
    "Titanic Dataset": titanic_df
}

### Computing Jaccard Coefficient
#### The Jaccard Coefficient is calculated as the size of the intersection divided by the size of the union of the attribute sets. It's typically used for binary or nominal data.

In [19]:
def jaccard_coefficient(a, b):
    """Calculate the Jaccard Coefficient between two vectors."""
    try:
        # Convert to binary format if necessary (for categorical data)
        return jaccard_score(a, b, average='macro')
    except Exception as e:
        return np.nan

# Function to create the Jaccard proximity matrix
def calculate_jaccard_matrix(dataset):
    n = len(dataset)
    jaccard_matrix = np.zeros((n, n))
    
    for i in range(n):
        # print(f"{i}/{n}")
        for j in range(n):
            jaccard_matrix[i, j] = jaccard_coefficient(dataset.iloc[i].values, dataset.iloc[j].values)
    
    return pd.DataFrame(jaccard_matrix)


### Calculating Matrices for the data sets

In [20]:
jaccard_matrix_adult = calculate_jaccard_matrix(adult_df)
jaccard_matrix_titanic = calculate_jaccard_matrix(titanic_df)

### Displaying the Matrices

#### Adult Dataset: Jaccard Matrix

In [21]:
jaccard_matrix_adult

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.000000,0.428571,0.125000,0.071429,0.250000,0.000000,0.000000,0.125000,0.000,0.428571,...,0.125000,0.000000,0.000000,0.000000,0.428571,0.111111,0.111111,0.111111,0.000,0.071429
1,0.428571,1.000000,0.125000,0.071429,0.250000,0.000000,0.000000,0.285714,0.000,0.428571,...,0.111111,0.000000,0.000000,0.000000,0.428571,0.111111,0.250000,0.111111,0.000,0.071429
2,0.125000,0.125000,1.000000,0.250000,0.125000,0.125000,0.125000,0.333333,0.125,0.285714,...,0.285714,0.125000,0.125000,0.333333,0.125000,0.125000,0.125000,0.285714,0.125,0.300000
3,0.071429,0.071429,0.250000,1.000000,0.142857,0.125000,0.125000,0.071429,0.125,0.250000,...,0.214286,0.125000,0.125000,0.142857,0.071429,0.062500,0.062500,0.214286,0.125,0.055556
4,0.250000,0.250000,0.125000,0.142857,1.000000,0.250000,0.250000,0.000000,0.250,0.428571,...,0.111111,0.250000,0.428571,0.285714,0.250000,0.000000,0.000000,0.111111,0.250,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.111111,0.111111,0.125000,0.062500,0.000000,0.000000,0.000000,0.125000,0.000,0.111111,...,0.111111,0.250000,0.250000,0.000000,0.250000,1.000000,0.111111,0.428571,0.000,0.071429
96,0.111111,0.250000,0.125000,0.062500,0.000000,0.000000,0.000000,0.285714,0.000,0.111111,...,0.111111,0.000000,0.000000,0.000000,0.111111,0.111111,1.000000,0.111111,0.125,0.071429
97,0.111111,0.111111,0.285714,0.214286,0.111111,0.285714,0.111111,0.125000,0.125,0.250000,...,0.250000,0.666667,0.428571,0.125000,0.111111,0.428571,0.111111,1.000000,0.125,0.071429
98,0.000000,0.000000,0.125000,0.125000,0.250000,0.250000,0.285714,0.000000,0.250,0.111111,...,0.111111,0.285714,0.285714,0.285714,0.000000,0.000000,0.125000,0.125000,1.000,0.000000


#### Titanic Dataset: Jaccard Matrix

In [22]:
jaccard_matrix_titanic

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,1.0,,0.111111,,,...,,0.111111,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,0.111111,,1.0,,,...,,0.2,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


### Explanation
Jaccard Coefficient Calculation: This metric focuses on the presence of attributes, making it useful for binary data. It calculates the ratio of the intersection of attributes to the union of attributes between two data points.


Handling Nominal Data: The Jaccard Coefficient is generally used for binary data, but in this case, we can apply it to the encoded nominal attributes as they are binary representations of categories.

### Observation and Analysis
The resulting matrices will have values between 0 and 1, where 1 indicates identical presence/absence of attributes (perfect similarity) and 0 indicates no similarity.


Jaccard Coefficient is particularly sensitive to the presence of attributes (1s) and ignores cases where both attributes are absent (0s), making it a useful measure for sparse datasets.