In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

## We are going to use a .csv of heart information to try and predict if the individual will have a heart disease.

In [2]:
path = 'C:\\Users\\ashri\\Machine Learning\\ML\\18_PCA\\Exercise\\heart.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


##### We first want to drop all outliers outside of a Z-Score of 3. 
##### Then we want to convert text columns to Label Encodings and One Hot encodings and see how each performs differently 
##### Then scale all of the information between the standard deviation.
##### The models we will evaluate will be SVC(Support Vector Classification), Logistic Regression, and RandomForest.
##### Then we will want to apply pca to them and see how the pca_reduced information performs on the models.

In [3]:
#So remember, z-score is the standard score Z= (x-u)/o with x being the example, u being the mean, and o being the std dev

In [4]:
#So let's compute the z-score for RestingBP, Cholesterol, and MaxHR
resting_bp_mean = df['RestingBP'].mean()
cholesterol_mean = df['Cholesterol'].mean()
max_hr_mean = df['MaxHR'].mean()

#Now let's compute the std dev for each.
resting_bp_std = df['RestingBP'].std()
cholesterol_std = df['Cholesterol'].std()
max_hr_std = df['MaxHR'].std()

#Now let's compute the z-score
df['resting_bp_zscore'] = df['RestingBP'].apply(lambda x: ((x-resting_bp_mean)/resting_bp_std))
df['cholesterol_zscore'] = df['Cholesterol'].apply(lambda x: ((x-cholesterol_mean)/cholesterol_std))
df['max_hr_zscore'] = df['MaxHR'].apply(lambda x: ((x-max_hr_mean)/max_hr_std))

In [5]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,resting_bp_zscore,cholesterol_zscore,max_hr_zscore
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,0.410685,0.824621,1.382175
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,1.49094,-0.171867,0.753746
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-0.129442,0.769768,-1.524307
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,0.30266,0.138964,-1.131539
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,0.950812,-0.034736,-0.581664


In [6]:
df.shape

(918, 15)

In [7]:
#Now, we have the z-scores for the three numerical columns, now we need to remove all Zscores >3 and Zscores <-3
df = df.drop(df[df.resting_bp_zscore < -3].index, axis = 0)
df = df.drop(df[df.resting_bp_zscore > 3].index, axis = 0)
df = df.drop(df[df.cholesterol_zscore < -3].index, axis = 0)
df = df.drop(df[df.cholesterol_zscore > 3].index, axis = 0)
df = df.drop(df[df.max_hr_zscore < -3].index, axis = 0)
df = df.drop(df[df.max_hr_zscore > 3].index, axis = 0)

In [8]:
df.shape

(906, 15)

In [9]:
df.drop(['resting_bp_zscore', 'cholesterol_zscore', 'max_hr_zscore'], axis = 'columns', inplace = True)

In [10]:
# Now we have removed all the z scores outside of the acceptable range of 3 > or -3 < which is standard practice
#Let's now make a dataframe where we have label_encodings and another where we have one hot encodings.
#We need to convert Sex, ChestPainType, RestingECG, ExerciseAngina, and STSlope
#label encoding will label each unique string 0-n with n being the number of unique strings
#one hot encoding will give us a vector [0...n] with each 0 being a potential unique string and if it is there it will turned 
#from a 0 to a 1.
le = LabelEncoder()
df_ohe = df.copy() ## Making a new df that will be ohe
df_le = df.copy() ## Making a new df that will label encoded


need_le = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for index in need_le :
    df_le[index] = le.fit_transform(df[index])
    dummies = pd.get_dummies(df[index], drop_first=True)
    df_ohe = pd.concat([df_ohe, dummies], axis = 'columns')

df_le.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [11]:
df_ohe.drop(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], axis = 'columns', inplace = True)
df_ohe.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,M,ATA,NAP,TA,Normal,ST,Y,Flat,Up
0,40,140,289,0,172,0.0,0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,0,1,0,1,0,1,0,0,0,1


In [12]:
#Now we need to scale all of the information within our columns for df_le and df_ohe within their respective std dev
scaler = MinMaxScaler()

needs_scaling = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR']
for index in needs_scaling:
    df_le[index] = scaler.fit_transform(df_le[[index]])
    df_ohe[index] = scaler.fit_transform(df_ohe[[index]])

In [13]:
df_le.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,0.244898,1,1,0.571429,0.557915,0,1,0.784173,0,0.0,2,0
1,0.428571,0,2,0.761905,0.34749,0,1,0.669065,0,1.0,1,1
2,0.183673,1,1,0.47619,0.546332,0,2,0.251799,0,0.0,2,0
3,0.408163,0,0,0.552381,0.413127,0,1,0.323741,1,1.5,1,1
4,0.530612,1,2,0.666667,0.376448,0,1,0.42446,0,0.0,2,0


In [14]:
df_ohe.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,M,ATA,NAP,TA,Normal,ST,Y,Flat,Up
0,0.244898,0.571429,0.557915,0,0.784173,0.0,0,1,1,0,0,1,0,0,0,1
1,0.428571,0.761905,0.34749,0,0.669065,1.0,1,0,0,1,0,1,0,0,1,0
2,0.183673,0.47619,0.546332,0,0.251799,0.0,0,1,1,0,0,0,1,0,0,1
3,0.408163,0.552381,0.413127,0,0.323741,1.5,1,0,0,0,0,1,0,1,1,0
4,0.530612,0.666667,0.376448,0,0.42446,0.0,0,1,0,1,0,1,0,0,0,1


In [15]:
##Now we will use GridSearchCV to assess which model we should use and if we should continue with LabelEncodings or OneHotEncode
#first we will create a dictionary of dictionaries containing the models and params we want evaled
models_and_info = {
    'SVC' :{
        'model':SVC(gamma='auto'),
        'params':{
            'C':[1,2,3,4,5,10,15,20],
            'kernel':['linear', 'rbf', 'poly']
        }
    },
    'Logistic_Regression':{
        'model' : LogisticRegression(solver = 'liblinear', multi_class='auto'),
        'params' : {
            'C':[1,2,3,4,5,10,15,20]
        }
    },
    'RandomForestClassifier':{
        'model': RandomForestClassifier(),
        'params':{
            'n_estimators' :[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]
        }
    }
}



In [16]:
scores_with_le = []
scores_with_ohe= []

x_with_ohe = df_ohe.drop('HeartDisease', axis = 'columns')
x_with_le = df_le.drop('HeartDisease', axis = 'columns')
y = df_le.HeartDisease

#mp is for model parameters, cv= cross validation and is asking how many sets do we want the data split into.
for model_name, mp in models_and_info.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv =5, return_train_score = False)
    clf.fit(x_with_le, y)
    scores_with_le.append({
        'model' : model_name,
        'best_score' : clf.best_score_,
        'best_params' : clf.best_params_
    })
    
    clf.fit(x_with_ohe, y)
    scores_with_ohe.append({
        'model' : model_name,
        'best_score' : clf.best_score_,
        'best_params' : clf.best_params_
    })

In [17]:
scores_df_le = pd.DataFrame(scores_with_le, columns = ['model', 'best_score', 'best_params'])
scores_df_ohe = pd.DataFrame(scores_with_ohe, columns = ['model', 'best_score', 'best_params'])

In [18]:
scores_df_le.head()

Unnamed: 0,model,best_score,best_params
0,SVC,0.841011,"{'C': 1, 'kernel': 'rbf'}"
1,Logistic_Regression,0.815658,{'C': 1}
2,RandomForestClassifier,0.820023,{'n_estimators': 7}


In [19]:
scores_df_ohe.head()

Unnamed: 0,model,best_score,best_params
0,SVC,0.847623,"{'C': 20, 'kernel': 'poly'}"
1,Logistic_Regression,0.83661,{'C': 1}
2,RandomForestClassifier,0.822245,{'n_estimators': 15}


## Our Absolute best is a Support Vector Classifier with a regularization component C = 20 and running a poly kernel instead of a linear (one up hyperplane delineation) or a rbf (infinite hyperplane delineation)

In [21]:
#Now let's apply PCA, keep in mind this is a dimensionality reduction technique to see which variable carry the most information
# so if computation speed was a factor, we would apply this, but we would lose accuracy

pca = PCA(0.95) #Let's find the features that contain 95% of the information
x_pca = pca.fit_transform(x_with_ohe)

In [23]:
x_with_ohe.shape

(906, 15)

In [22]:
x_pca.shape

(906, 10)

In [24]:
## We lost 10 columns, as the 5 we lost only contain 5% of the information, these carry the other 95%

#Now let's build a model using this new PCA reduced information, using the model that performed best.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_pca, y, test_size=0.2)

model = SVC(kernel = 'poly', C = 20)
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.8351648351648352

In [None]:
#As you can see, we lost just over 1% of predictive power but reduced the computations needed by 5 columns of information.