# Correlation Matrix, 
 Correlation measures the linear relationship between two variables and is particularly useful when dealing with interval and ratio-scaled data. It can indicate how similar the variations in two data points are.

### Importing required Libraries

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

### Load Datasets

In [15]:
# Load datasets
adult_df = pd.read_csv("../adult/adult_trim.data", header=None) # No header
titanic_df = pd.read_csv('../titanic/titanic_trim.csv') # Has header

# Rename columns for clarity
adult_df.columns = ["age", "workclass", "fnlwgt", "education", "education_num", 
                    "marital_status", "occupation", "relationship", "race", "sex", 
                    "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"]
adult_df.dropna(inplace=True)

In [16]:
adult_df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,29,Local-gov,115585,Some-college,10,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,50,United-States,<=50K
96,48,Self-emp-not-inc,191277,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,60,United-States,>50K
97,37,Private,202683,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,48,United-States,>50K
98,48,Private,171095,Assoc-acdm,12,Divorced,Exec-managerial,Unmarried,White,Female,0,0,40,England,<=50K


In [17]:
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
150,151,0,2,"Bateman, Rev. Robert James",male,51.0,0,0,S.O.P. 1166,12.5250,,S
151,152,1,1,"Pears, Mrs. Thomas (Edith Wearne)",female,22.0,1,0,113776,66.6000,C2,S
152,153,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,A.5. 11206,8.0500,,S
153,154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5000,,S


### Select relevant columns from Adult dataset (mix of nominal and ratio-scaled)

In [18]:
adult_df = adult_df[["age", "workclass", "education", "education_num", "sex"]]

adult_df

Unnamed: 0,age,workclass,education,education_num,sex
0,39,State-gov,Bachelors,13,Male
1,50,Self-emp-not-inc,Bachelors,13,Male
2,38,Private,HS-grad,9,Male
3,53,Private,11th,7,Male
4,28,Private,Bachelors,13,Female
...,...,...,...,...,...
95,29,Local-gov,Some-college,10,Male
96,48,Self-emp-not-inc,Doctorate,16,Male
97,37,Private,Some-college,10,Male
98,48,Private,Assoc-acdm,12,Female


### Encode nominal attributes as integers for processing

In [19]:
label_encoders = {}
for column in adult_df.columns:
    if adult_df[column].dtype == object:
        le = LabelEncoder()
        adult_df[column] = le.fit_transform(adult_df[column])
        label_encoders[column] = le

adult_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adult_df[column] = le.fit_transform(adult_df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adult_df[column] = le.fit_transform(adult_df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adult_df[column] = le.fit_transform(adult_df[column])


Unnamed: 0,age,workclass,education,education_num,sex
0,39,6,7,13,1
1,50,5,7,13,1
2,38,3,9,9,1
3,53,3,1,7,1
4,28,3,7,13,0
...,...,...,...,...,...
95,29,2,12,10,1
96,48,5,8,16,1
97,37,3,12,10,1
98,48,3,5,12,0


### Clean and preprocess Titanic dataset

In [20]:
titanic_df.dropna(inplace=True)
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S
21,22,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0,D56,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S
52,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C
54,55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C


### Select relevant columns from Titanic dataset (mix of nominal and ratio-scaled)

In [21]:
titanic_df = titanic_df[["Age", "Sex", "Pclass", "Fare", "Embarked"]]
titanic_df

Unnamed: 0,Age,Sex,Pclass,Fare,Embarked
1,38.0,female,1,71.2833,C
3,35.0,female,1,53.1,S
6,54.0,male,1,51.8625,S
10,4.0,female,3,16.7,S
11,58.0,female,1,26.55,S
21,34.0,male,2,13.0,S
23,28.0,male,1,35.5,S
27,19.0,male,1,263.0,S
52,49.0,female,1,76.7292,C
54,65.0,male,1,61.9792,C


### Encode Nominal as Integers for processing

In [22]:
label_encoders_titanic = {}
for column in titanic_df.columns:
    if titanic_df[column].dtype == object:
        le = LabelEncoder()
        titanic_df[column] = le.fit_transform(titanic_df[column])
        label_encoders[column] = le

titanic_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_df[column] = le.fit_transform(titanic_df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_df[column] = le.fit_transform(titanic_df[column])


Unnamed: 0,Age,Sex,Pclass,Fare,Embarked
1,38.0,0,1,71.2833,0
3,35.0,0,1,53.1,1
6,54.0,1,1,51.8625,1
10,4.0,0,3,16.7,1
11,58.0,0,1,26.55,1
21,34.0,1,2,13.0,1
23,28.0,1,1,35.5,1
27,19.0,1,1,263.0,1
52,49.0,0,1,76.7292,0
54,65.0,1,1,61.9792,0


### Combine the datasets into a list for further processing

In [23]:

# Combine the datasets into a list for further processing
datasets = {
    "Adult Dataset": adult_df,
    "Titanic Dataset": titanic_df
}

### Compute Correlation Matrix

In [24]:
def correlation_coefficient(a, b):
    """Calculate the Pearson Correlation Coefficient between two vectors."""
    try:
        return np.corrcoef(a, b)[0, 1]
    except Exception as e:
        return np.nan

# Function to create the Correlation matrix
def calculate_correlation_matrix(dataset):
    n = len(dataset)
    correlation_matrix = np.zeros((n, n))
    
    for i in range(n):
        for j in range(n):
            correlation_matrix[i, j] = correlation_coefficient(dataset.iloc[i].values, dataset.iloc[j].values)
    
    return pd.DataFrame(correlation_matrix)

### Calculate Correlation Matrix

#### For Adult Dataset

In [25]:
correlation_matrix_adult = calculate_correlation_matrix(adult_df)
correlation_matrix_adult

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.000000,0.997350,0.987351,0.979855,0.982608,0.987214,0.979155,0.985819,0.973770,0.996797,...,0.990631,0.974937,0.952244,0.984207,0.990559,0.952796,0.998811,0.974860,0.996972,0.978260
1,0.997350,1.000000,0.992346,0.990383,0.971402,0.981827,0.990318,0.993768,0.963523,0.997273,...,0.997941,0.976220,0.947628,0.983245,0.987215,0.951934,0.996954,0.977875,0.999669,0.981818
2,0.987351,0.992346,1.000000,0.977230,0.966611,0.986108,0.986540,0.995618,0.969761,0.991901,...,0.992350,0.993895,0.973154,0.995007,0.983535,0.977942,0.988350,0.995414,0.990512,0.995396
3,0.979855,0.990383,0.977230,1.000000,0.931087,0.947285,0.996425,0.989915,0.918105,0.979980,...,0.995363,0.948173,0.903421,0.954681,0.959776,0.912323,0.978175,0.952534,0.990242,0.957604
4,0.982608,0.971402,0.966611,0.931087,1.000000,0.994572,0.930666,0.948588,0.995967,0.983983,...,0.957142,0.968306,0.965757,0.978583,0.993915,0.964839,0.986529,0.966230,0.972744,0.973384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.952796,0.951934,0.977942,0.912323,0.964839,0.984216,0.931341,0.955238,0.982308,0.963549,...,0.946838,0.993654,0.997665,0.990267,0.969546,1.000000,0.958159,0.992814,0.949612,0.990722
96,0.998811,0.996954,0.988350,0.978175,0.986529,0.990747,0.976581,0.983955,0.978473,0.999202,...,0.990899,0.976483,0.955091,0.985272,0.995602,0.958159,1.000000,0.977120,0.997514,0.983073
97,0.974860,0.977875,0.995414,0.952534,0.966230,0.987763,0.967844,0.983614,0.976983,0.981705,...,0.976156,0.999626,0.989609,0.997909,0.978402,0.992814,0.977120,1.000000,0.975251,0.997584
98,0.996972,0.999669,0.990512,0.990242,0.972744,0.981685,0.988416,0.991396,0.963678,0.997955,...,0.997641,0.973235,0.944175,0.980862,0.988923,0.949612,0.997514,0.975251,1.000000,0.980972


#### For Titanic Dataset

In [26]:
correlation_matrix_titanic = calculate_correlation_matrix(titanic_df)
correlation_matrix_titanic

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,1.0,0.993438,0.919708,0.925325,0.674371,0.60146,0.975698,0.892647,0.99521,0.917958,...,0.963749,0.952177,0.904026,0.620265,0.988953,0.985852,0.989423,0.972736,0.825618,0.978661
1,0.993438,1.0,0.958218,0.879845,0.753852,0.687969,0.994154,0.835749,0.999768,0.956767,...,0.927261,0.980598,0.849637,0.705193,0.999264,0.998496,0.99937,0.939975,0.884093,0.949193
2,0.919708,0.958218,1.0,0.712509,0.909944,0.866324,0.983353,0.644276,0.953679,0.99992,...,0.78192,0.995678,0.663848,0.877352,0.967654,0.971436,0.966838,0.803826,0.980519,0.819682
3,0.925325,0.879845,0.712509,1.0,0.367142,0.28089,0.824795,0.981587,0.886621,0.709759,...,0.98304,0.772886,0.984075,0.307199,0.86183,0.856703,0.863373,0.979446,0.569122,0.977002
4,0.674371,0.753852,0.909944,0.367142,1.0,0.99519,0.819495,0.269311,0.74331,0.911627,...,0.453165,0.867522,0.294109,0.996774,0.775961,0.787108,0.773931,0.484868,0.97319,0.508782
5,0.60146,0.687969,0.866324,0.28089,0.99519,1.0,0.761294,0.176781,0.676623,0.868651,...,0.366543,0.816266,0.202236,0.999254,0.712499,0.724881,0.710243,0.399972,0.94734,0.424763
6,0.975698,0.994154,0.983353,0.824795,0.819495,0.761294,1.0,0.772517,0.992425,0.982463,...,0.882174,0.995979,0.788686,0.776057,0.997386,0.997961,0.997149,0.898512,0.92878,0.910088
7,0.892647,0.835749,0.644276,0.981587,0.269311,0.176781,0.772517,1.0,0.844335,0.640809,...,0.980528,0.712522,0.999658,0.200376,0.81638,0.80558,0.818231,0.972742,0.482654,0.966053
8,0.99521,0.999768,0.953679,0.886621,0.74331,0.676623,0.992425,0.844335,1.0,0.952328,...,0.933093,0.977476,0.857932,0.693802,0.998669,0.997263,0.998825,0.945445,0.876786,0.953936
9,0.917958,0.956767,0.99992,0.709759,0.911627,0.868651,0.982463,0.640809,0.952328,1.0,...,0.779074,0.995183,0.6605,0.879412,0.966449,0.970112,0.96562,0.801248,0.981455,0.816975


### Explanation
Correlation Calculation: This metric measures the degree of linear relationship between two variables. A high positive value indicates that as one variable increases, the other also tends to increase, while a high negative value indicates an inverse relationship.

Handling Different Data Types: Correlation is best suited for interval and ratio-scaled data. It may not be meaningful for nominal or ordinal data unless they are encoded or transformed appropriately.

### Observation and Analysis
The resulting matrices will show the correlation coefficients between each pair of data points. A value close to 
1 or −1 indicates a strong relationship, while a value close to 0 indicates little to no linear relationship.

Correlation as a Similarity Metric is useful for understanding the degree to which data points share a linear relationship, which can be especially important in fields like finance and economics.