# 1. Take one of the supervised learning models you have built recently and apply at least three dimensionality reduction techniques to it (separately). Be sure to create a short summary of each technique you use. Indicate how each changed the model performance. Reference: https://machinelearningmastery.com/dimensionality-reduction-algorithms-with-python/

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Load the arrhythmia data set from the working directory and preview the
# first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved row index -- confirm whether it should be kept as a column.
arrhythmia_df = pd.read_csv(filepath_or_buffer="df.csv")
arrhythmia_df.head()

Unnamed: 0,age,sex,height,weight,QRSduration,PRinterval,Q-Tinterval,Tinterval,Pinterval,QRS,...,chV2_TwaveAmp,chV2_QRSTA,chV3_SwaveAmp,chV3_PwaveAmp,chV4_RwaveAmp,chV4_PwaveAmp,chV5_JJwaveAmp,chV5_SPwaveAmp,chV6_SPwaveAmp,class
0,75.0,0.0,190.0,80.0,91.0,193.0,371.0,174.0,121.0,-16.0,...,2.9,15.2,-10.0,0.6,15.2,0.9,-0.4,0.0,0.0,1.0
1,56.0,1.0,165.0,64.0,81.0,174.0,401.0,149.0,39.0,25.0,...,2.0,1.2,-7.7,0.9,9.5,0.5,-0.4,0.0,0.0,1.0
2,54.0,0.0,172.0,95.0,138.0,163.0,386.0,185.0,102.0,96.0,...,-2.4,-2.6,-4.1,0.4,10.0,0.5,1.3,0.0,0.0,1.0
3,55.0,0.0,175.0,94.0,100.0,202.0,380.0,179.0,143.0,28.0,...,2.9,18.0,-7.9,0.1,15.0,0.1,0.1,0.0,0.0,0.0
4,75.0,0.0,190.0,80.0,88.0,181.0,360.0,177.0,103.0,-16.0,...,2.1,8.6,-10.2,-1.0,15.2,-0.1,-0.2,0.0,0.0,1.0


In [2]:
# Sanity check: show the type of the object returned by read_csv
type(arrhythmia_df)

pandas.core.frame.DataFrame

In [5]:
# Separate the feature matrix from the target column.
# NOTE(review): the frame preview shows an "Unnamed: 0" column (apparently a
# saved row index) that is still part of X here -- confirm that is intended.
X = arrhythmia_df.drop('class', axis=1).values
y = arrhythmia_df['class'].values

# Split into training and test set (70/30, stratified on the class label)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardize: learn the scaling parameters from the training split only and
# reuse them on the test split. Re-fitting the scaler on the test data would
# scale the two splits differently and leak test-set statistics into the
# evaluation.
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [6]:
# Baseline model: logistic regression trained on the standardized features
clr = LogisticRegression(random_state=42)
clr.fit(X_train_sc, y_train)

# Class predictions for the held-out split
y_predicted = clr.predict(X_test_sc)

# Report the test accuracy as a percentage rounded to two decimals
print(
    "Accuracy score of the logistic regression model:",
    round(100 * clr.score(X_test_sc, y_test), 2),
)

Accuracy score of the logistic regression model: 76.47


# 1. SVD

Using Singular Value Decomposition (SVD) for dimensionality reduction improved the performance of the logistic regression model. The output shows that keeping 54 components gives the highest accuracy, about 79% (60 components ties it). Truncated SVD works well with sparse data, and this dataset has many columns containing zeros (a 0 does not mean a missing value here), so it was a good technique to use for this dataset.

In [7]:
from sklearn.decomposition import TruncatedSVD

# Sweep the number of retained SVD components and print "<n> <test accuracy %>".
# NOTE(review): every n is scored on the test split, so the best value read off
# this table is an optimistic estimate -- consider selecting n by
# cross-validation on the training split instead.
for n in range(1, 137):
    # TruncatedSVD's default solver is randomized; pin the seed so the
    # accuracy table is reproducible, like the seeded split and classifier.
    svd = TruncatedSVD(n_components=n, random_state=42)

    # Fit the projection on the training split only, then apply it to the test split
    X_train_svd = svd.fit_transform(X_train)
    X_test_svd = svd.transform(X_test)

    # Note: this rebinds `clr`, replacing the baseline model fitted earlier
    clr = LogisticRegression(random_state=42).fit(X_train_svd, y_train)

    print(n, round(clr.score(X_test_svd, y_test) * 100, 2))

1 59.56
2 54.41
3 57.35
4 57.35
5 63.97
6 66.91
7 73.53
8 74.26
9 72.79
10 72.79
11 72.06
12 69.85
13 66.18
14 68.38
15 73.53
16 71.32
17 75.74
18 75.74
19 76.47
20 75.74
21 76.47
22 76.47
23 76.47
24 76.47
25 75.74
26 75.74
27 76.47
28 74.26
29 75.0
30 75.74
31 75.74
32 75.74
33 74.26
34 74.26
35 76.47
36 77.94
37 78.68
38 77.21
39 78.68
40 76.47
41 77.94
42 76.47
43 76.47
44 75.74
45 76.47
46 76.47
47 75.74
48 77.21
49 77.21
50 76.47
51 78.68
52 77.94
53 77.94
54 79.41
55 78.68
56 77.94
57 77.21
58 77.21
59 75.74
60 79.41
61 75.74
62 76.47
63 76.47
64 74.26
65 75.0
66 74.26
67 75.0
68 74.26
69 73.53
70 74.26
71 73.53
72 72.79
73 74.26
74 75.0
75 72.06
76 72.06
77 70.59
78 72.79
79 69.85
80 73.53
81 70.59
82 70.59
83 69.85
84 71.32
85 75.74
86 71.32
87 71.32
88 73.53
89 72.06
90 75.0
91 72.79
92 73.53
93 72.06
94 73.53
95 72.79
96 75.74
97 73.53
98 71.32
99 70.59
100 69.85
101 69.85
102 72.79
103 73.53
104 72.79
105 71.32
106 71.32
107 69.12
108 71.32
109 70.59
110 69.85
111 70.59
112

# 2. PCA

Using principal component analysis (PCA) shows that reducing the data to 51 components improves the model's performance, with a score of 82%. This technique gave logistic regression its best performance. The baseline logistic regression score is lower than the PCA score because the original feature space has a large number of dimensions, some of which are not relevant to predicting a regular or irregular heartbeat.

In [8]:
from sklearn.decomposition import PCA

# Sweep the number of principal components and print "<n> <test accuracy %>"
for n in range(1, 137):
    pca = PCA(n_components=n)

    # Learn the projection from the training split, then reuse it on the test split
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Fresh classifier for every component count
    classifier = LogisticRegression(random_state=42)
    classifier.fit(X_train_pca, y_train)

    print(n, round(100 * classifier.score(X_test_pca, y_test), 2))

1 54.41
2 59.56
3 58.09
4 60.29
5 63.24
6 72.79
7 70.59
8 73.53
9 72.06
10 70.59
11 71.32
12 69.85
13 69.12
14 68.38
15 74.26
16 75.74
17 75.74
18 76.47
19 76.47
20 77.94
21 76.47
22 77.21
23 77.21
24 77.94
25 78.68
26 77.21
27 75.0
28 76.47
29 75.74
30 75.74
31 75.74
32 75.0
33 75.74
34 75.0
35 76.47
36 77.21
37 77.94
38 79.41
39 80.15
40 79.41
41 80.15
42 80.15
43 80.15
44 78.68
45 79.41
46 78.68
47 77.21
48 78.68
49 77.94
50 77.94
51 82.35
52 77.21
53 77.94
54 78.68
55 75.74
56 75.74
57 75.0
58 72.79
59 72.06
60 69.85
61 70.59
62 71.32
63 68.38
64 67.65
65 67.65
66 67.65
67 66.91
68 66.91
69 69.12
70 66.91
71 66.91
72 66.18
73 65.44
74 66.18
75 65.44
76 66.18
77 65.44
78 67.65
79 66.18
80 67.65
81 66.18
82 66.18
83 66.18
84 66.18
85 67.65
86 67.65
87 66.91
88 67.65
89 67.65
90 67.65
91 66.18
92 67.65
93 67.65
94 67.65
95 67.65
96 66.91
97 67.65
98 68.38
99 66.91
100 68.38
101 67.65
102 66.18
103 66.18
104 67.65
105 66.91
106 66.91
107 66.18
108 66.18
109 66.91
110 68.38
111 66.91
11

# 3. Isomap Embedding

Isomap shows its best performance when the features are reduced to 23 components, where it reaches an accuracy of about 73.5%. It does not improve on the baseline model and had the lowest score of the three dimensionality reduction techniques.

In [10]:
from sklearn.manifold import Isomap

# Sweep the Isomap embedding dimension and print "<n> <test accuracy %>"
for n in range(1, 137):
    embedding = Isomap(n_components=n)

    # Build the embedding from the training split, then map the test split into it
    X_train_em = embedding.fit_transform(X_train)
    X_test_em = embedding.transform(X_test)

    # Fresh classifier for every embedding dimension
    model = LogisticRegression(random_state=42)
    model.fit(X_train_em, y_train)

    print(n, round(100 * model.score(X_test_em, y_test), 2))

1 55.88
2 52.21
3 56.62
4 55.15
5 54.41
6 58.82
7 57.35
8 57.35
9 58.82
10 65.44
11 66.18
12 66.18
13 65.44
14 66.18
15 64.71
16 68.38
17 68.38
18 71.32
19 72.06
20 72.06
21 72.79
22 71.32
23 73.53
24 72.06
25 71.32
26 71.32
27 71.32
28 70.59
29 69.12
30 68.38
31 67.65
32 67.65
33 67.65
34 66.91
35 68.38
36 68.38
37 67.65
38 67.65
39 68.38
40 68.38
41 69.85
42 69.85
43 69.85
44 69.85
45 65.44
46 66.18
47 65.44
48 65.44
49 68.38
50 67.65
51 65.44
52 67.65
53 66.91
54 67.65
55 67.65
56 66.91
57 66.18
58 66.18
59 64.71
60 64.71
61 63.97
62 66.91
63 65.44
64 61.03
65 62.5
66 58.82
67 58.82
68 62.5
69 62.5
70 61.03
71 61.03
72 58.09
73 57.35
74 57.35
75 59.56
76 59.56
77 58.09
78 58.09
79 58.82
80 60.29
81 60.29
82 60.29
83 59.56
84 56.62
85 55.88
86 54.41
87 52.94
88 52.94
89 52.21
90 51.47
91 51.47
92 52.94
93 52.21
94 52.21
95 52.94
96 50.74
97 52.94
98 56.62
99 55.88
100 58.82
101 58.82
102 58.09
103 58.09
104 58.82
105 58.82
106 58.82
107 57.35
108 56.62
109 55.88
110 57.35
111 57.35
1

# Write a function that will indicate if an inputted IPv4 address is accurate or not.  IP addresses are valid if they have 4 values between 0 and 255 (inclusive), punctuated by periods.

Input 1:

2.33.245.5

Output 1:

True

Input 2:

12.345.67.89

Output 2:

False

In [50]:
def IPA(address):
    """Return True if *address* is a valid dotted-decimal IPv4 address.

    An address is accepted when it consists of exactly four parts separated
    by periods and every part is a non-empty run of ASCII digits whose value
    lies between 0 and 255 (inclusive). Leading zeros are tolerated, so
    "01" counts as 1.

    Anything else -- a wrong number of parts, empty parts, signs, whitespace,
    underscores, non-ASCII digits, or a non-string argument -- yields False.
    """
    if not isinstance(address, str):
        return False

    octets = address.split('.')
    if len(octets) != 4:
        return False

    for octet in octets:
        # int() alone is too lenient: it also parses "+4", " 4", "1_0" and
        # non-ASCII digits, so require plain ASCII digits first.
        if not (octet.isascii() and octet.isdigit()):
            return False
        try:
            value = int(octet)
        except ValueError:
            # Extremely long digit strings can exceed the interpreter's
            # int-conversion limit; they cannot be a valid octet anyway.
            return False
        if value > 255:
            return False

    return True

In [53]:
# Input 1 from the exercise statement: four parts, each within 0-255
IPA('2.33.245.5')

True

In [54]:
# Input 2 from the exercise statement: the second part (345) is above 255
IPA('12.345.67.89')

False