In [56]:
import math

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Importing and Analysing the Dataset

In [None]:
# Read the CSV (path is relative to the notebook's working directory)
# and preview the first rows.
dataset_path = "./framingham.csv"
data = pd.read_csv(dataset_path)
data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# (rows, columns) of the loaded frame.
dataset_shape = data.shape
print(f"DATASET SIZE: {dataset_shape}")

In [None]:
# Number of missing (NaN) entries in each column.
data.isna().sum()

# Imputing missing values

In [None]:
# Preview one summary statistic per column: median for education, mean for
# BMI, and the first mode (most frequent value) for the remaining columns.
preview_statistics = [
    data["education"].median(),
    data["cigsPerDay"].mode()[0],
    data["BPMeds"].mode()[0],
    data["totChol"].mode()[0],
    data["BMI"].mean(),
    data["heartRate"].mode()[0],
    data["glucose"].mode()[0],
]
for statistic in preview_statistics:
    print(statistic)

In [None]:
# Replacement value for each column being filled: median for education,
# mean for BMI, and the first mode (most frequent value) for the rest.
# NOTE(review): these statistics are computed on the full frame, i.e. before
# the train/test split performed further down in the notebook.
fill_values = {
    "education": data["education"].median(),
    "cigsPerDay": data["cigsPerDay"].mode()[0],
    "BPMeds": data["BPMeds"].mode()[0],
    "totChol": data["totChol"].mode()[0],
    "BMI": data["BMI"].mean(),
    "heartRate": data["heartRate"].mode()[0],
    "glucose": data["glucose"].mode()[0],
}
for column, value in fill_values.items():
    data[column] = data[column].fillna(value)

# Re-count the missing values per column after filling.
data.isnull().sum()

# Feature Selection using Correlation Matrix

In [None]:
# Now we need to peform feature or variable selection
plt.figure(figsize=(10,10))
sns.heatmap(data.corr(), cmap="coolwarm", annot=True, fmt="0.2f", linewidths=0.5, linecolor="Black")

In [64]:
# Columns used as model inputs.
feature_columns = [
    "male",
    "age",
    "currentSmoker",
    "cigsPerDay",
    "BPMeds",
    "prevalentStroke",
    "prevalentHyp",
    "diabetes",
    "totChol",
    "sysBP",
    "diaBP",
    "BMI",
    "heartRate",
    "glucose",
]

# NOTE(review): `dependent_X` holds the model inputs and `independent_Y` the
# target column, which is the reverse of the usual "independent variables /
# dependent variable" wording. The names are kept because later cells use them.
dependent_X = data[feature_columns]
independent_Y = data["TenYearCHD"]

# Train and Test Data Split

In [65]:
# Shuffled 80/20 train/test split with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    dependent_X,
    independent_Y,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

In [None]:
# Report the shape of the full frame and of each split, one per line.
shapes_by_label = {
    "DATASET": data.shape,
    "X_TRAIN": x_train.shape,
    "X_TEST": x_test.shape,
    "Y_TRAIN": y_train.shape,
    "Y_TEST": y_test.shape,
}
for label, shape in shapes_by_label.items():
    print(f"{label}: {shape}")

# Generating Model using `sklearn`'s LogisticRegression

In [67]:
# Logistic-regression classifier created with scikit-learn's default hyperparameters.
# NOTE(review): the features are passed to fit() without any scaling step in
# this notebook — check the fit output for a ConvergenceWarning.
model = LogisticRegression()

In [None]:
# Fit the classifier on the training split.
model.fit(x_train, y_train)

In [69]:
# Predicted class labels for the held-out test features.
y_predicted = model.predict(x_test)

In [None]:
# Fraction of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_predicted)

# mean_squared_error (not mean_absolute_error) so the value matches the `mse`
# name and the "Mean Squared Error" label printed below; rmse is its square root.
# NOTE(review): if both label sets contain only 0/1, MSE equals 1 - accuracy,
# so these two numbers add little beyond the accuracy score — TODO confirm
# and consider classification metrics instead.
mse = mean_squared_error(y_true=y_test, y_pred=y_predicted)
rmse = math.sqrt(mse)

print(f"Accuracy Score of Model: {accuracy}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")