In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

In [2]:
# Load the blood-pressure dataset; the path is relative to the working directory.
data = pd.read_csv('data_BloodPressure.csv')

In [3]:
# Summarize the frame: index range, column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   Patient ID                     100 non-null    int64
 1    Blood Pressure Before (mmHg)  100 non-null    int64
 2    Blood Pressure After (mmHg)   100 non-null    int64
dtypes: int64(3)
memory usage: 2.5 KB


In [4]:
# Count missing values per column.
data.isna().sum()

Patient ID                       0
 Blood Pressure Before (mmHg)    0
 Blood Pressure After (mmHg)     0
dtype: int64

In [5]:
# Flag every row that exactly repeats an earlier row (one boolean per row).
# NOTE(review): the displayed Series is truncated, so it cannot rule out duplicates
# by eye; `data.duplicated().sum()` would report the count directly.
data.duplicated()

0     False
1     False
2     False
3     False
4     False
      ...  
95    False
96    False
97    False
98    False
99    False
Length: 100, dtype: bool

In [6]:
# Show the raw column labels to check them for stray whitespace before selecting by name.
data.columns

Index(['Patient ID', ' Blood Pressure Before (mmHg)',
       ' Blood Pressure After (mmHg)'],
      dtype='object')

In [7]:
# Remove leading/trailing whitespace from every column label.
data.columns = [label.strip() for label in data.columns]

In [12]:
# Column labels (whitespace-stripped form) used as the predictor and the target.
feature_column, class_column = (
    'Blood Pressure Before (mmHg)',
    'Blood Pressure After (mmHg)',
)

In [13]:
# Select predictor and target through the column-name variables set in the
# previous cell, so each label is spelled out in exactly one place.
feature_data = data[feature_column]
class_data = data[class_column]

# a. Check whether the distributions of all the classes are the same or not.

In [14]:
# Count how many rows fall under each distinct value of the class column.
class_distributions = data.groupby(class_column).size()
print("Distribution of Classes:", class_distributions, sep="\n")

Distribution of Classes:
Blood Pressure After (mmHg)
118     7
119     2
120     2
121     8
122     4
123     9
124     7
125     4
126     1
127     6
129    10
130     5
131     4
132     2
135     8
136     4
137     6
139     5
140     2
141     4
dtype: int64


# b. Check for the equality of variance.

In [15]:
# Levene's test comparing the spread of the two columns; center='median' is
# SciPy's default, written out here for clarity.
variance_test = stats.levene(feature_data, class_data, center='median')
levene_p_value = variance_test.pvalue
print("\nEquality of Variance Test:")
print("p-value =", levene_p_value)


Equality of Variance Test:
p-value = 0.6715080090945376


# c. Which among LDA and QDA would perform better on this data for classification, and why?

In [16]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_data,
    class_data,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Build the LDA classifier and fit it on the training split.
# The single predictor is reshaped into an (n_samples, 1) column for scikit-learn.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train.to_numpy().reshape(-1, 1), y_train)

In [19]:
# Define and train QDA model.
# QDA relaxes the assumption of a shared covariance matrix and allows each class
# to have its own covariance matrix. This can better capture complex relationships
# and non-linear boundaries between classes. QDA is useful when the classes have
# distinct covariance structures and the decision boundaries are non-linear.
qda = QuadraticDiscriminantAnalysis()
try:
    qda.fit(X_train.values.reshape(-1, 1), y_train)
except ValueError as err:
    # A per-class covariance cannot be estimated from a single sample, and at
    # least one target value occurs only once in the training split. Report the
    # failure instead of aborting a full notebook run; `qda` stays unfitted.
    print(f"{type(err).__name__}: {err}")

ValueError: y has only 1 sample in class 119, covariance is ill defined.

In [21]:
# Predict the class of every held-out row with the fitted LDA model.
lda_predictions = lda.predict(X_test.to_numpy().reshape(-1, 1))

In [22]:
# Fraction of held-out rows whose predicted class equals the true class.
lda_accuracy = accuracy_score(y_true=y_test, y_pred=lda_predictions)

In [23]:
# Report the accuracy stored in lda_accuracy.
print("LDA Accuracy:", lda_accuracy)

LDA Accuracy: 0.6


# d. Check the equality of means for all classes.

In [20]:
# One-way ANOVA on the two columns, treating each column as one group.
mean_equality_test = stats.f_oneway(feature_data, class_data)
anova_p_value = mean_equality_test.pvalue
print("\nEquality of Mean Test:")
print("p-value =", anova_p_value)


Equality of Mean Test:
p-value = 2.3565868442707578e-08
