<div class="alert alert-warning">
FEATURE SELECTION ON INDIAN LIVER PATIENTS

<br>Data Acquired From University of California, Irvine Machine Learning Repository
<br>Additonal Data Information in the Link Below:
<br>
[https://archive.ics.uci.edu/dataset/225/ilpd+indian+liver+patient+dataset](https://archive.ics.uci.edu/dataset/225/ilpd+indian+liver+patient+dataset)

The Data was used to determine if blood test data could be sufficient to identify liver disease in rural areas with few physicians.
</div>

In [13]:
# Packages
import numpy as np
import pandas as pd
import warnings

# Get Packages
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mutual_info_score

%matplotlib inline

In [14]:
# URL DATA
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00225/Indian Liver Patient Dataset (ILPD).csv"
url = url.replace(" ", "%20")

# Downloading Data
ILPD = pd.read_csv(url, header=None)

# Replacing Default Collumn Names (0, 1, 2, 3, 4, 5)
ILPD.columns = ["Age","Gender","DB","TB","Alkphos","Sgpt","Sgot","TPr","ALB","AGRatio","Selector"]

ILPD.head()

Unnamed: 0,Age,Gender,DB,TB,Alkphos,Sgpt,Sgot,TPr,ALB,AGRatio,Selector
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [15]:
# Preliminary EDA
display(ILPD.shape)

print("**************************")
display(ILPD.dtypes)

print("**************************")
print(ILPD.Selector.unique())

print("**************************")
ILPD.isna().sum(axis=0)

(583, 11)

**************************


Age           int64
Gender       object
DB          float64
TB          float64
Alkphos       int64
Sgpt          int64
Sgot          int64
TPr         float64
ALB         float64
AGRatio     float64
Selector      int64
dtype: object

**************************
[1 2]
**************************


Age         0
Gender      0
DB          0
TB          0
Alkphos     0
Sgpt        0
Sgot        0
TPr         0
ALB         0
AGRatio     4
Selector    0
dtype: int64

In [16]:
# Coerce All Data to Numeric Data
# Coercion Introduces nans/nulls for the Non-Numeric Values in All Columns
# Missing Categories will be nans/nulls After Coercion for Categories encoded as Integers
ILPD = ILPD.apply(pd.to_numeric, errors="coerce")

# Dropping Binary Columns
ILPD = ILPD.drop(["Gender", "Selector"], axis=1)

# Impute Values or Remove Rows w/ Nulls
ILPD = ILPD.dropna()


In [17]:
# Check for Null Values
ILPD.isna().sum(axis=0)

Age        0
DB         0
TB         0
Alkphos    0
Sgpt       0
Sgot       0
TPr        0
ALB        0
AGRatio    0
dtype: int64

In [18]:
# EDA
display(ILPD.dtypes)

Age          int64
DB         float64
TB         float64
Alkphos      int64
Sgpt         int64
Sgot         int64
TPr        float64
ALB        float64
AGRatio    float64
dtype: object

In [19]:
ILPD.head()

Unnamed: 0,Age,DB,TB,Alkphos,Sgpt,Sgot,TPr,ALB,AGRatio
0,65,0.7,0.1,187,16,18,6.8,3.3,0.9
1,62,10.9,5.5,699,64,100,7.5,3.2,0.74
2,62,7.3,4.1,490,60,68,7.0,3.3,0.89
3,58,1.0,0.4,182,14,20,6.8,3.4,1.0
4,72,3.9,2.0,195,27,59,7.3,2.4,0.4


In [20]:
# Mutual Information
# x = First Input Variable
# y = Second Input Variable
# bins = Number of Discretized Values for Input Variables
def calc_MI(x, y, bins=80):
    mi = mutual_info_score(x, y)
    return mi

# Define Method listMutualInformationScores
def listMutualInformationScores(data):
    columns = data.columns
    number_of_columns = len(columns)
    mi_scores = []
    
    for a in range(number_of_columns):
        for b in range(a + 1, number_of_columns):
            x_columns = columns[a]
            y_columns = columns[b]
            mi = calc_MI(data[x_columns], data[y_columns])
            mi_scores.append([x_columns, y_columns, mi])
    
    return mi_scores

# Filter out Specific Warnings
warnings.filterwarnings("ignore", message="Clustering metrics expects discrete values but received")

# Run Method listMutualInformationScores
mutual_info_scores = listMutualInformationScores(ILPD)

for score in mutual_info_scores:
    print(score)


['Age', 'DB', 1.5362359963411014]
['Age', 'TB', 1.1864560454542543]
['Age', 'Alkphos', 2.896530320011925]
['Age', 'Sgpt', 2.215819044186841]
['Age', 'Sgot', 2.465542349447693]
['Age', 'TPr', 1.5219026749495055]
['Age', 'ALB', 1.3051582059817863]
['Age', 'AGRatio', 1.1208822547976103]
['DB', 'TB', 2.247495165719161]
['DB', 'Alkphos', 2.624452714891249]
['DB', 'Sgpt', 2.0984825139028693]
['DB', 'Sgot', 2.304455736932444]
['DB', 'TPr', 1.4030446398595084]
['DB', 'ALB', 1.2261412250061694]
['DB', 'AGRatio', 1.151021294808811]
['TB', 'Alkphos', 2.1654125147528838]
['TB', 'Sgpt', 1.7623169633193212]
['TB', 'Sgot', 1.9533703265006115]
['TB', 'TPr', 1.1225321520920404]
['TB', 'ALB', 0.9579247326634799]
['TB', 'AGRatio', 0.89896446137386]
['Alkphos', 'Sgpt', 3.4330574337163617]
['Alkphos', 'Sgot', 3.6713961556749535]
['Alkphos', 'TPr', 2.6493317560586904]
['Alkphos', 'ALB', 2.402229779282233]
['Alkphos', 'AGRatio', 2.0496538169462424]
['Sgpt', 'Sgot', 3.055060699792564]
['Sgpt', 'TPr', 1.980005

In [21]:
# Presenting Results as DataFrame
mi_df = pd.DataFrame(mutual_info_scores, columns=['x', 'y', 'mi'])
mi_df = mi_df.sort_values(by='mi', ascending=False)
mi_df

Unnamed: 0,x,y,mi
22,Alkphos,Sgot,3.671396
21,Alkphos,Sgpt,3.433057
26,Sgpt,Sgot,3.055061
2,Age,Alkphos,2.89653
23,Alkphos,TPr,2.649332
9,DB,Alkphos,2.624453
4,Age,Sgot,2.465542
24,Alkphos,ALB,2.40223
11,DB,Sgot,2.304456
8,DB,TB,2.247495
