In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000

In [3]:
df = pd.read_csv("heart.processed.cleveland.data", na_values="?")
df = df.dropna()
df = df.astype({"restecg":'int', "thal":'int', "slope":'int', "cp":'int'}) 
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       297 non-null    float64
 1   sex       297 non-null    float64
 2   cp        297 non-null    int64  
 3   trestbps  297 non-null    float64
 4   chol      297 non-null    float64
 5   fbs       297 non-null    float64
 6   restecg   297 non-null    int64  
 7   thalach   297 non-null    float64
 8   exang     297 non-null    float64
 9   oldpeak   297 non-null    float64
 10  slope     297 non-null    int64  
 11  ca        297 non-null    float64
 12  thal      297 non-null    int64  
 13  target    297 non-null    int64  
dtypes: float64(9), int64(5)
memory usage: 34.8 KB


In [7]:
corr = df.corr()['target'].abs().sort_values()
corr

fbs         0.049040
chol        0.066448
trestbps    0.159620
restecg     0.184136
age         0.222156
sex         0.226797
slope       0.374689
exang       0.391613
cp          0.404248
thalach     0.420639
oldpeak     0.501461
thal        0.513377
ca          0.521178
target      1.000000
Name: target, dtype: float64

In [8]:
df.loc[df['target'] > 1, ['target']] = 1
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1,145.0,233.0,1.0,2,150.0,0.0,2.3,3,0.0,6,0
1,67.0,1.0,4,160.0,286.0,0.0,2,108.0,1.0,1.5,2,3.0,3,1
2,67.0,1.0,4,120.0,229.0,0.0,2,129.0,1.0,2.6,2,2.0,7,1
3,37.0,1.0,3,130.0,250.0,0.0,0,187.0,0.0,3.5,3,0.0,3,0
4,41.0,0.0,2,130.0,204.0,0.0,2,172.0,0.0,1.4,1,0.0,3,0
5,56.0,1.0,2,120.0,236.0,0.0,0,178.0,0.0,0.8,1,0.0,3,0
6,62.0,0.0,4,140.0,268.0,0.0,2,160.0,0.0,3.6,3,2.0,3,1
7,57.0,0.0,4,120.0,354.0,0.0,0,163.0,1.0,0.6,1,0.0,3,0
8,63.0,1.0,4,130.0,254.0,0.0,2,147.0,0.0,1.4,2,1.0,7,1
9,53.0,1.0,4,140.0,203.0,1.0,2,155.0,1.0,3.1,3,0.0,7,1


In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(['target'],axis='columns'),df.target,test_size=0.3)


In [34]:
len(X_test)

90

In [96]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200,
    criterion='entropy',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=10,)
model.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_samples=10)

In [95]:
model.score(X_test,y_test)

0.8777777777777778

In [None]:
# Helper function for plotting side by side
def sideplot(df, col, kind="bar", title=None):
    assert kind in ["bar", "hist"]
    fig = plt.figure(figsize=(10, 6))
    if kind == "bar":
        ax1 = plt.subplot(2, 2, 1)
        df[df.target == 1][['target', col]].groupby(col).count().plot(kind='bar', rot=0, legend=False, ax=ax1, color="#268bd2")
        ax2 = plt.subplot(2, 2, 2)
        df[df.target == 0][['target', col]].groupby(col).count().plot(kind='bar', rot=0, legend=False, ax=ax2, color="#268bd2")
    else:
        ax1 = plt.subplot(2, 2, 1)
        plt.hist(df[df.target == 1][col], color="#268bd2")
        plt.xlabel(col)
        ax2 = plt.subplot(2, 2, 2)
        plt.hist(df[df.target == 0][col], color="#268bd2")
        plt.xlabel(col)
    # Re-adjusting
    ylim = (0, max(ax1.get_ylim()[1], ax2.get_ylim()[1]))
    ax1.set_ylim(ylim)
    ax2.set_ylim(ylim)
    xlim = (min(ax1.get_xlim()[0], ax2.get_xlim()[0]), max(ax1.get_xlim()[1], ax2.get_xlim()[1]))
    ax1.set_xlim(xlim)
    ax2.set_xlim(xlim)
    if title is not None:
        fig.suptitle(title)
    #plt.subplots_adjust(top=0.99)

In [None]:
sideplot(df, "fbs", kind="bar", title="Comparison of fasting blood sugar")

In [None]:
sideplot(df, "chol", kind="hist", title="Comparison of serum cholestoral")

In [None]:
sideplot(df, "restecg", kind="bar", title="Comparison of resting ECG results")

In [None]:
sideplot(df, "trestbps", kind="hist", title="Comparison of resting blood pressure")

In [None]:
sideplot(df, "age", kind="hist", title="Comparison of age")

In [None]:
sideplot(df, "sex", kind="bar", title="Comparison of sex")

In [None]:
sideplot(df, "thal", kind="bar", title="Comparison of (thal)")

In [None]:
sideplot(df, "slope", kind="bar", title="Comparison of the slope of the peak exercise ST segment")

In [None]:
sideplot(df, "ca", kind="bar", title="Comparison of the number of major visible vessels under fluorosopy")

In [None]:
sideplot(df, "thalach", kind="hist", title="Comparison of maximum heart rate achieved")

In [None]:
sideplot(df, "oldpeak", kind="hist", title="Comparison of ST depression induced by exercise relative to rest")

In [None]:
sideplot(df, "cp", kind="bar", title="Comparison of chest pain type")

In [None]:
sideplot(df, "exang", kind="bar", title="Comparison of exercise induced angina")

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
clf = LDA(n_components=1)

y = df["target"].values
X = clf.fit(df[df.columns[:-1]].values, y).transform(df[df.columns[:-1]].values)
X = X[:, 0]

sns.swarmplot(X[y == 0], color="b", label="without HD")
sns.swarmplot(X[y == 1], color="r", label="with HD")
plt.title("LDA analysis of heart disease classification")
plt.legend()

In [5]:
def onehot(ser, num_classes=None):
    """
    One-hot encode the series.
    Example: 
    >>> onehot([1, 0, 2], 3)
    array([[0., 1., 0.],
           [1., 0., 0.],
           [0., 0., 1.]])
    """
    #if not num_classes:
    if num_classes == None:
        num_classes = len(np.unique(ser))
    return np.identity(num_classes)[ser]

new_col_names = []
need_encode_col = ["restecg", "thal", "slope", "cp"]
no_encode_col = [col for col in df.columns if col not in need_encode_col]
new_df = df[no_encode_col]

for col in need_encode_col:
#     print("---")
#     print("Col: ", col)
#     print("DF Col:\n", df[col])
#     print("---")
    
    num_classes = df[col].max() + 1
#     num_classes = len(df[col].unique()) - np.identity tworzyło array zbyt mały, bo brana była pod uwagę
                                        # ilość unikalnym wartości zamiast ich maksymalna wartość
    
#     print(df[col].unique())
#     print("---")
#     print("Num classes: ", num_classes)
#     print("---")
    
    new_col_names = [f"{col}_{i}" for i in range(num_classes)]
    encoded = pd.DataFrame(onehot(df[col], num_classes), columns=new_col_names, dtype=int)  # tworzy nowy df, nadając nowe "id"?
#     print("Encoded\n", encoded)
    new_df = pd.concat([new_df, encoded], axis=1)  # dodaje nowe id do starych !!! problem !!! max(starych) id > max(nowych)
new_df.head()

# print("---\nNew_df")
# print(new_df)

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,restecg_0,restecg_1,restecg_2,thal_0,thal_1,thal_2,thal_3,thal_4,thal_5,thal_6,thal_7,slope_0,slope_1,slope_2,slope_3,cp_0,cp_1,cp_2,cp_3,cp_4
0,63.0,1.0,145.0,233.0,1.0,150.0,0.0,2.3,0.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0
1,67.0,1.0,160.0,286.0,0.0,108.0,1.0,1.5,3.0,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
2,67.0,1.0,120.0,229.0,0.0,129.0,1.0,2.6,2.0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1
3,37.0,1.0,130.0,250.0,0.0,187.0,0.0,3.5,0.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
4,41.0,0.0,130.0,204.0,0.0,172.0,0.0,1.4,0.0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0


In [6]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

new_df_shfl = shuffle(new_df, random_state=443)
X = new_df_shfl[data_cols].values
y = new_df_shfl["target"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=80)

NameError: name 'data_cols' is not defined