In [None]:
import pandas as pd

# Load the Framingham dataset from the local datasets directory
framingham_data = pd.read_csv("./datasets/framingham.csv")

# Column dtypes and non-null counts. DataFrame.info() writes its report
# itself and returns None, so it is called directly rather than wrapped in
# print() (which would emit a stray "None" line after the report).
framingham_data.info()

# Number of missing values per column
print(framingham_data.isnull().sum())

In [5]:
# Removing duplicate rows
framingham_data = framingham_data.drop_duplicates()

# Handling missing values
# For simplicity, missing values are filled with the column mean.
# numeric_only=True restricts the mean to numerical columns so that a
# non-numeric column cannot make mean() raise; such columns are left unfilled.
# The result is reassigned instead of using inplace=True so the step reads as
# an explicit "new frame from old frame" transformation.
# NOTE(review): the mean is applied to every numeric column, including any
# integer-coded categorical ones - confirm that is acceptable for this data.
framingham_data = framingham_data.fillna(framingham_data.mean(numeric_only=True))

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Separate the label column from the predictor columns
y = framingham_data['TenYearCHD']
X = framingham_data.drop(columns=['TenYearCHD'])

# Score every predictor with f_classif and keep the 10 highest-scoring ones.
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split performed in a later cell - confirm this is intended.
selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X, y)
X_selected = selector.transform(X)

# Translate the positions of the retained columns back into column names
selected_features_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_features_indices]
selected_features

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 60% of the rows for testing; the remaining 40% are used for training
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.6,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbours classifier on the scaled training split
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)

# Predict labels for the scaled test split
y_pred = knn.predict(X_test_scaled)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, r2_score

# Compare the KNN predictions (y_pred) against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# R2 is computed on the predicted class labels and reported next to the
# classification metrics.
# NOTE(review): R2 is normally a regression metric - confirm it is wanted here.
r2 = r2_score(y_test, y_pred)

# Report: matrix on its own line(s) under the heading, then the scalar scores
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Accuracy:", accuracy)
print("R2 Score:", r2)


In [None]:
# Training the KNN classifier with a different distance metric (Manhattan)
knn_custom = KNeighborsClassifier(n_neighbors=3, metric='manhattan')
knn_custom.fit(X_train_scaled, y_train)

# Predicting on the test set
y_pred_custom = knn_custom.predict(X_test_scaled)

# Evaluation
conf_matrix_custom = confusion_matrix(y_test, y_pred_custom)
accuracy_custom = accuracy_score(y_test, y_pred_custom)
# Score this model's own predictions (y_pred_custom). Using y_pred here would
# simply repeat the R2 of the earlier default-metric model. A dedicated name
# is used so the earlier `r2` value is not overwritten.
r2_custom = r2_score(y_test, y_pred_custom)
print("Confusion Matrix with custom distance metric:")
print(conf_matrix_custom)
print("Accuracy with custom distance metric:", accuracy_custom)
print("R2 Score:", r2_custom)


In [None]:

from sklearn.decomposition import PCA

# Perform PCA on the scaled features: fitted on the training split only, then
# applied to the test split.
# NOTE(review): the scaled data comes from the 10 features kept by
# SelectKBest(k=10), so n_components=10 retains every dimension - confirm
# whether a smaller number of components was intended.
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Retraining the KNN classifier with PCA-transformed features
knn_pca = KNeighborsClassifier(n_neighbors=3)
knn_pca.fit(X_train_pca, y_train)

# Predicting on the test set
y_pred_pca = knn_pca.predict(X_test_pca)

# Evaluation
conf_matrix_pca = confusion_matrix(y_test, y_pred_pca)
accuracy_pca = accuracy_score(y_test, y_pred_pca)
# Compute R2 from this model's predictions instead of printing the stale `r2`
# value left over from an earlier model.
r2_pca = r2_score(y_test, y_pred_pca)

print("Confusion Matrix with PCA:")
print(conf_matrix_pca)
print("Accuracy with PCA:", accuracy_pca)
print("R2 Score:", r2_pca)
