# Dataset 1 Tasks

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# Load the dataset
data = pd.read_csv('/Users/wajeehaqurban/Downloads/healthcare-dataset-stroke-data.csv')

In [4]:
# Step 1: Check for missing values
missing_values = data.isnull().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 id                    0
gender                0
age                   0
hypertension          0
heart_disease         0
ever_married          0
work_type             0
Residence_type        0
avg_glucose_level     0
bmi                  46
smoking_status        0
stroke                0
dtype: int64


In [5]:
data['bmi'].fillna(data['bmi'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['bmi'].fillna(data['bmi'].mean(), inplace=True)


In [6]:
label_encoder = LabelEncoder()

In [7]:
data['gender'] = label_encoder.fit_transform(data['gender'])
data['ever_married'] = label_encoder.fit_transform(data['ever_married'])
data['work_type'] = label_encoder.fit_transform(data['work_type'])
data['Residence_type'] = label_encoder.fit_transform(data['Residence_type'])
data['smoking_status'] = label_encoder.fit_transform(data['smoking_status'])

In [8]:

scaler = StandardScaler()

In [9]:
data['age'] = scaler.fit_transform(data[['age']])
data['bmi'] = scaler.fit_transform(data[['bmi']])

In [10]:
print("Processed Data Sample:\n", data.head())

Processed Data Sample:
       id  gender       age  hypertension  heart_disease  ever_married  \
0   9046       1  0.634926             0              1             1   
1  51676       0  0.365898             0              0             1   
2  31112       1  1.217818             0              1             1   
3  60182       0 -0.172157             0              0             1   
4   1665       0  1.172980             1              0             1   

   work_type  Residence_type  avg_glucose_level           bmi  smoking_status  \
0          2               1             228.69  8.571905e-01               1   
1          3               0             202.21 -4.660478e-16               2   
2          2               0             105.92  3.193492e-01               2   
3          2               1             171.23  5.685927e-01               3   
4          3               0             174.12 -7.956876e-01               2   

   stroke  
0       1  
1       1  
2       1  
3 

In [11]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [12]:
kmeans = KMeans(n_clusters=6, random_state=0)
data['KMeans_Cluster'] = kmeans.fit_predict(data.drop(columns=['stroke']))

In [13]:
hierarchical = AgglomerativeClustering(n_clusters=6)
data['Hierarchical_Cluster'] = hierarchical.fit_predict(data.drop(columns=['stroke']))

In [14]:
silhouette_kmeans = silhouette_score(data.drop(columns=['KMeans_Cluster', 'Hierarchical_Cluster', 'stroke']), data['KMeans_Cluster'])
silhouette_hierarchical = silhouette_score(data.drop(columns=['KMeans_Cluster', 'Hierarchical_Cluster', 'stroke']), data['Hierarchical_Cluster'])

In [15]:
davies_bouldin_kmeans = davies_bouldin_score(data.drop(columns=['KMeans_Cluster', 'Hierarchical_Cluster', 'stroke']), data['KMeans_Cluster'])
davies_bouldin_hierarchical = davies_bouldin_score(data.drop(columns=['KMeans_Cluster', 'Hierarchical_Cluster', 'stroke']), data['Hierarchical_Cluster'])


In [16]:
# Display results
print("Clustering Validation Results:")
print(f"Silhouette Score (K-Means): {silhouette_kmeans}")
print(f"Silhouette Score (Hierarchical): {silhouette_hierarchical}")
print(f"Davies-Bouldin Index (K-Means): {davies_bouldin_kmeans}")
print(f"Davies-Bouldin Index (Hierarchical): {davies_bouldin_hierarchical}")

Clustering Validation Results:
Silhouette Score (K-Means): 0.5705170254033136
Silhouette Score (Hierarchical): 0.5427741690805332
Davies-Bouldin Index (K-Means): 0.49793292576134296
Davies-Bouldin Index (Hierarchical): 0.5023108011070582


In [17]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
X = data.drop(columns=['stroke', 'KMeans_Cluster', 'Hierarchical_Cluster'])
y = data['stroke']

In [18]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=0.95, random_state=0)
X_pca = pca.fit_transform(X_scaled)
pca_components = X_pca.shape[1]
print(f"PCA Number of Components: {pca_components}")

PCA Number of Components: 10


In [19]:
X_non_negative = MinMaxScaler().fit_transform(X)
chi2_selector = SelectKBest(chi2, k=5)
X_chi2 = chi2_selector.fit_transform(X_non_negative, y)
chi2_selected_features = X.columns[chi2_selector.get_support(indices=True)]
print("Chi-Square Selected Features:", chi2_selected_features.tolist())

Chi-Square Selected Features: ['age', 'hypertension', 'heart_disease', 'ever_married', 'avg_glucose_level']


In [20]:
from sklearn.model_selection import train_test_split

X_supervised = data.drop(columns=['stroke'])  # Drop the target column to create X
y = data['stroke']  # Define the target variable

# Impute missing values with mean (for numerical) and mode (for categorical)
X_imputed = X_supervised.apply(lambda col: col.fillna(col.mean()) if col.dtype != 'object' else col.fillna(col.mode()[0]))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.3, random_state=0)

In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [23]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=1000, random_state=0)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
print("\nLogistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("Precision:", precision_score(y_test, y_pred_log_reg))
print("Recall:", recall_score(y_test, y_pred_log_reg))
print("F1-Score:", f1_score(y_test, y_pred_log_reg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_reg))
print("Classification Report:\n", classification_report(y_test, y_pred_log_reg))


Logistic Regression
Accuracy: 0.7333333333333333
Precision: 0.6708860759493671
Recall: 0.7066666666666667
F1-Score: 0.6883116883116883
Confusion Matrix:
 [[79 26]
 [22 53]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.75      0.77       105
           1       0.67      0.71      0.69        75

    accuracy                           0.73       180
   macro avg       0.73      0.73      0.73       180
weighted avg       0.74      0.73      0.73       180



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("\nRandom Forest")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1-Score:", f1_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest
Accuracy: 0.7277777777777777
Precision: 0.6756756756756757
Recall: 0.6666666666666666
F1-Score: 0.6711409395973155
Confusion Matrix:
 [[81 24]
 [25 50]]
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.77      0.77       105
           1       0.68      0.67      0.67        75

    accuracy                           0.73       180
   macro avg       0.72      0.72      0.72       180
weighted avg       0.73      0.73      0.73       180



In [25]:
from sklearn.svm import SVC

# Define feature set and target variable
X_supervised = data.drop(columns=['stroke'])  # Drop the target column to create X
y = data['stroke']  # Define the target variable

# Impute missing values with mean (for numerical) and mode (for categorical)
X_imputed = X_supervised.apply(lambda col: col.fillna(col.mean()) if col.dtype != 'object' else col.fillna(col.mode()[0]))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.3, random_state=0)

# Create and train SVM model
svm = SVC(probability=True, random_state=0)
svm.fit(X_train, y_train)

# Make predictions
y_pred_svm = svm.predict(X_test)

# Evaluate and print results
print("\nSupport Vector Machine (SVM)")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Precision:", precision_score(y_test, y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))
print("F1-Score:", f1_score(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))



Support Vector Machine (SVM)
Accuracy: 0.5833333333333334
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
Confusion Matrix:
 [[105   0]
 [ 75   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.58      1.00      0.74       105
           1       0.00      0.00      0.00        75

    accuracy                           0.58       180
   macro avg       0.29      0.50      0.37       180
weighted avg       0.34      0.58      0.43       180



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
print("\nK-Nearest Neighbors (KNN)")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Precision:", precision_score(y_test, y_pred_knn))
print("Recall:", recall_score(y_test, y_pred_knn))
print("F1-Score:", f1_score(y_test, y_pred_knn))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
print("Classification Report:\n", classification_report(y_test, y_pred_knn))


K-Nearest Neighbors (KNN)
Accuracy: 0.4666666666666667
Precision: 0.34782608695652173
Recall: 0.32
F1-Score: 0.3333333333333333
Confusion Matrix:
 [[60 45]
 [51 24]]
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.57      0.56       105
           1       0.35      0.32      0.33        75

    accuracy                           0.47       180
   macro avg       0.44      0.45      0.44       180
weighted avg       0.46      0.47      0.46       180



In [27]:
from xgboost import XGBClassifier

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=0)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("\nXGBoost")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Precision:", precision_score(y_test, y_pred_xgb))
print("Recall:", recall_score(y_test, y_pred_xgb))
print("F1-Score:", f1_score(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.




XGBoost
Accuracy: 0.7444444444444445
Precision: 0.6986301369863014
Recall: 0.68
F1-Score: 0.6891891891891891
Confusion Matrix:
 [[83 22]
 [24 51]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.79      0.78       105
           1       0.70      0.68      0.69        75

    accuracy                           0.74       180
   macro avg       0.74      0.74      0.74       180
weighted avg       0.74      0.74      0.74       180

