# Simple Matching Coefficient (SMC), 
## It is typically used for comparing binary or nominal attributes.

### Importing required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

### Load Datasets

In [2]:
# Load datasets
adult_df = pd.read_csv("../adult/adult_trim.data", header=None) # No header
titanic_df = pd.read_csv('../titanic/train.csv') # Has header

# Rename columns for clarity
adult_df.columns = ["age", "workclass", "fnlwgt", "education", "education_num", 
                    "marital_status", "occupation", "relationship", "race", "sex", 
                    "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"]
adult_df.dropna(inplace=True)

In [3]:
adult_df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,29,Local-gov,115585,Some-college,10,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,50,United-States,<=50K
96,48,Self-emp-not-inc,191277,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,60,United-States,>50K
97,37,Private,202683,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,48,United-States,>50K
98,48,Private,171095,Assoc-acdm,12,Divorced,Exec-managerial,Unmarried,White,Female,0,0,40,England,<=50K


In [4]:
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Select relevant columns from Adult dataset (mix of nominal and ratio-scaled)

In [5]:
adult_df = adult_df[["age", "workclass", "education", "education_num", "sex"]]

adult_df

Unnamed: 0,age,workclass,education,education_num,sex
0,39,State-gov,Bachelors,13,Male
1,50,Self-emp-not-inc,Bachelors,13,Male
2,38,Private,HS-grad,9,Male
3,53,Private,11th,7,Male
4,28,Private,Bachelors,13,Female
...,...,...,...,...,...
95,29,Local-gov,Some-college,10,Male
96,48,Self-emp-not-inc,Doctorate,16,Male
97,37,Private,Some-college,10,Male
98,48,Private,Assoc-acdm,12,Female


### Encode nominal attributes as integers for processing

In [6]:
label_encoders = {}
for column in adult_df.columns:
    if adult_df[column].dtype == object:
        le = LabelEncoder()
        adult_df[column] = le.fit_transform(adult_df[column])
        label_encoders[column] = le

adult_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adult_df[column] = le.fit_transform(adult_df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adult_df[column] = le.fit_transform(adult_df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adult_df[column] = le.fit_transform(adult_df[column])


Unnamed: 0,age,workclass,education,education_num,sex
0,39,6,7,13,1
1,50,5,7,13,1
2,38,3,9,9,1
3,53,3,1,7,1
4,28,3,7,13,0
...,...,...,...,...,...
95,29,2,12,10,1
96,48,5,8,16,1
97,37,3,12,10,1
98,48,3,5,12,0


### Clean and preprocess Titanic dataset

In [7]:
titanic_df.dropna(inplace=True)
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


### Select relevant columns from Titanic dataset (mix of nominal and ratio-scaled)

In [8]:
titanic_df = titanic_df[["Age", "Sex", "Pclass", "Fare", "Embarked"]]
titanic_df

Unnamed: 0,Age,Sex,Pclass,Fare,Embarked
1,38.0,female,1,71.2833,C
3,35.0,female,1,53.1000,S
6,54.0,male,1,51.8625,S
10,4.0,female,3,16.7000,S
11,58.0,female,1,26.5500,S
...,...,...,...,...,...
871,47.0,female,1,52.5542,S
872,33.0,male,1,5.0000,S
879,56.0,female,1,83.1583,C
887,19.0,female,1,30.0000,S


### Encode Nominal as Integers for processing

In [9]:
label_encoders_titanic = {}
for column in titanic_df.columns:
    if titanic_df[column].dtype == object:
        le = LabelEncoder()
        titanic_df[column] = le.fit_transform(titanic_df[column])
        label_encoders[column] = le

titanic_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_df[column] = le.fit_transform(titanic_df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_df[column] = le.fit_transform(titanic_df[column])


Unnamed: 0,Age,Sex,Pclass,Fare,Embarked
1,38.0,0,1,71.2833,0
3,35.0,0,1,53.1000,2
6,54.0,1,1,51.8625,2
10,4.0,0,3,16.7000,2
11,58.0,0,1,26.5500,2
...,...,...,...,...,...
871,47.0,0,1,52.5542,2
872,33.0,1,1,5.0000,2
879,56.0,0,1,83.1583,0
887,19.0,0,1,30.0000,2


### Combine the datasets into a list for further processing

In [10]:

# Combine the datasets into a list for further processing
datasets = {
    "Adult Dataset": adult_df,
    "Titanic Dataset": titanic_df
}

### Code to Calculate SMC

In [15]:
def simple_matching_coefficient(a, b):
    """Calculate the Simple Matching Coefficient between two vectors."""
    try:
        return np.sum(a == b) / len(a)
    except Exception as e:
        return np.nan

# Function to create the proximity matrix
def calculate_smc_matrix(dataset):
    n = len(dataset)
    smc_matrix = np.zeros((n, n))
    
    for i in range(n):
        # print(f"{i}/{n}")
        for j in range(n):
            smc_matrix[i, j] = simple_matching_coefficient(dataset.iloc[i].values, dataset.iloc[j].values)
    
    return pd.DataFrame(smc_matrix)

### Calculate SMC matrices for each dataset


In [16]:
smc_matrix_adult = calculate_smc_matrix(adult_df)
smc_matrix_titanic = calculate_smc_matrix(titanic_df)

### Print SMC matrices

#### Adult Dataset SMC Matrix

In [17]:
smc_matrix_adult

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.0,0.6,0.2,0.2,0.4,0.0,0.0,0.2,0.0,0.6,...,0.2,0.0,0.0,0.0,0.6,0.2,0.2,0.2,0.0,0.2
1,0.6,1.0,0.2,0.2,0.4,0.0,0.0,0.4,0.0,0.6,...,0.2,0.0,0.0,0.0,0.6,0.2,0.4,0.2,0.0,0.2
2,0.2,0.2,1.0,0.4,0.2,0.2,0.2,0.6,0.2,0.4,...,0.4,0.2,0.2,0.6,0.2,0.2,0.2,0.4,0.2,0.6
3,0.2,0.2,0.4,1.0,0.2,0.2,0.2,0.2,0.2,0.4,...,0.4,0.2,0.2,0.2,0.2,0.2,0.2,0.4,0.2,0.2
4,0.4,0.4,0.2,0.2,1.0,0.4,0.4,0.0,0.4,0.6,...,0.2,0.4,0.6,0.4,0.4,0.0,0.0,0.2,0.4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.2,0.2,0.2,0.2,0.0,0.0,0.0,0.2,0.0,0.2,...,0.2,0.4,0.4,0.0,0.4,1.0,0.2,0.6,0.0,0.2
96,0.2,0.4,0.2,0.2,0.0,0.0,0.0,0.4,0.0,0.2,...,0.2,0.0,0.0,0.0,0.2,0.2,1.0,0.2,0.2,0.2
97,0.2,0.2,0.4,0.4,0.2,0.4,0.2,0.2,0.2,0.4,...,0.4,0.8,0.6,0.2,0.2,0.6,0.2,1.0,0.2,0.2
98,0.0,0.0,0.2,0.2,0.4,0.4,0.4,0.0,0.4,0.2,...,0.2,0.4,0.4,0.4,0.0,0.0,0.2,0.2,1.0,0.0


#### Titanic Dataset SMC Matrix

In [18]:
smc_matrix_titanic

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,173,174,175,176,177,178,179,180,181,182
0,1.0,0.4,0.2,0.2,0.4,0.0,0.2,0.2,0.6,0.4,...,0.6,0.4,0.2,0.4,0.2,0.4,0.2,0.6,0.4,0.4
1,0.4,1.0,0.4,0.4,0.6,0.2,0.4,0.4,0.4,0.2,...,0.4,0.6,0.4,0.6,0.4,0.6,0.4,0.4,0.6,0.2
2,0.2,0.4,1.0,0.2,0.4,0.4,0.6,0.6,0.2,0.4,...,0.2,0.4,0.6,0.4,0.6,0.4,0.6,0.2,0.4,0.4
3,0.2,0.4,0.2,1.0,0.4,0.2,0.2,0.2,0.2,0.0,...,0.2,0.4,0.2,0.4,0.2,0.4,0.2,0.2,0.4,0.0
4,0.4,0.6,0.4,0.4,1.0,0.2,0.4,0.4,0.4,0.2,...,0.4,0.6,0.6,0.6,0.4,0.6,0.4,0.4,0.6,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,0.4,0.6,0.4,0.4,0.6,0.2,0.4,0.4,0.4,0.2,...,0.4,0.6,0.4,0.6,0.4,1.0,0.4,0.4,0.6,0.2
179,0.2,0.4,0.6,0.2,0.4,0.4,0.6,0.6,0.2,0.4,...,0.2,0.4,0.6,0.4,0.6,0.4,1.0,0.2,0.4,0.4
180,0.6,0.4,0.2,0.2,0.4,0.0,0.2,0.2,0.6,0.4,...,0.8,0.4,0.2,0.4,0.2,0.4,0.2,1.0,0.4,0.4
181,0.4,0.6,0.4,0.4,0.6,0.2,0.4,0.6,0.4,0.2,...,0.4,0.6,0.4,0.6,0.4,0.6,0.4,0.4,1.0,0.4
