Importing Libraries

In [None]:
import numpy as np  # Importing NumPy 
import pandas as pd  # Importing Pandas 

Load the .csv dataset into a pandas DataFrame

In [None]:
# Fetch the CSV from a Google Drive direct-download URL and parse it into a DataFrame
# (this cell needs network access).
df=pd.read_csv("https://drive.google.com/uc?export=download&id=1XyhVIZaKYZczlM2alun_fofilqTBq_9c")

Show the first 5 records of the dataset

In [None]:
# Preview the first rows of the frame (head() defaults to n=5).
df.head()

Tuple representing the number of rows and columns in the DataFrame

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

Summary of the DataFrame, including data types and non-null counts

In [None]:
# Prints each column's name, dtype and non-null count.
df.info()

Describing the statistical summary of numerical type data

In [None]:
# Summary statistics; with no arguments describe() covers the numeric columns only.
df.describe()

Retrieve the unique values in the 'smoking_status' column

In [None]:
# Distinct values of the column, in order of first appearance (NaN is listed too if present).
df.smoking_status.unique() 

Statistical summary of categorical type data

In [None]:
# count / unique / top / freq for the object-dtype columns.
df.describe(include = object)

Display the count of missing values for each column

In [None]:
# Number of missing (NaN) entries in each column.
df.isnull().sum()

Calculate the missing values percentage for each column

In [None]:
# Share of missing entries per column, expressed as a percentage rounded to two decimals.
missing_values_percentage = df.isna().mean().mul(100).round(2)

# Report the per-column percentages under a heading followed by a blank line.
print("Missing Values Percentage:\n", missing_values_percentage, sep="\n")

Handling null values

In [None]:
 # NOTE(review): median imputation of 'bmi' is disabled here; the modelling cell further
 # down drops rows with missing values instead (df.dropna()).
 # df['bmi'] = df['bmi'].fillna(df['bmi'].median())
# df.isnull().sum()

Importing Matplotlib and Seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

Age Distribution with Stroke Incidence-Histplot

In [None]:
# Stacked 30-bin age histogram split by stroke outcome, with a KDE overlay per group.
custom_palette = ["#457b9d", "#e63946"]
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df, x='age', hue='stroke', multiple='stack',
    kde=True, bins=30, palette=custom_palette, ax=ax,
)
ax.set(title='Age Distribution with Stroke Incidence', xlabel='Age', ylabel='Frequency')
plt.show()

Hypertension and Heart Disease vs Stroke Incidence-Countplot

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
custom_palette = ["#457b9d", "#ef233c"]

# One panel per binary risk factor: (column in df, name used in the panel text).
panels = [('hypertension', 'Hypertension'), ('heart_disease', 'Heart Disease')]
for ax, (col, label) in zip(axes, panels):
    # Bars count rows per factor value, split by stroke outcome.
    sns.countplot(data=df, x=col, hue='stroke', palette=custom_palette, ax=ax)
    ax.set_title(f'{label} vs Stroke Incidence')
    ax.set_xlabel(f'{label} (1 = Yes, 0 = No)')
    ax.set_ylabel('Count')

# Widen the horizontal gap between the two panels.
fig.subplots_adjust(wspace=0.4)
plt.show()


Average Glucose Level Distribution by Stroke-Kdeplot

In [None]:
# Filled density curves of average glucose level, one per stroke outcome.
custom_palette = ["#457b9d", "#ef233c"]
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=df, x='avg_glucose_level', hue='stroke', fill=True, palette=custom_palette, ax=ax)
ax.set(
    title='Average Glucose Level Distribution by Stroke',
    xlabel='Average Glucose Level',
    ylabel='Density',
)
plt.show()

BMI Distribution with Stroke Incidence-Histplot

In [None]:
# 30-bin BMI histogram with a KDE overlay, coloured by stroke outcome.
custom_palette = ["#457b9d", "#ef233c"]
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='bmi', hue='stroke', kde=True, bins=30, palette=custom_palette, ax=ax)
ax.set(title='BMI Distribution with Stroke Incidence', xlabel='BMI', ylabel='Frequency')
plt.show()

Gender Distribution by Stroke Incidence-Countplot

In [None]:
# Row counts per gender value, split by stroke outcome.
custom_palette = ["#457b9d", "#ef233c"]
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(data=df, x='gender', hue='stroke', palette=custom_palette, ax=ax)
ax.set(title="Gender Distribution by Stroke Incidence", xlabel="Gender", ylabel="Count")
plt.show()

Age Distribution by Smoking Status and Stroke Incidence-Boxplot

In [None]:
# Age distribution per smoking status, with side-by-side boxes for each stroke outcome.
custom_palette = ["#457b9d", "#ef233c"]
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x="smoking_status", y="age", hue="stroke", palette=custom_palette, ax=ax)

ax.set_title("Age Distribution by Smoking Status and Stroke Incidence")
ax.set_xlabel("Smoking Status")
ax.set_ylabel("Age")

# Relabel the legend seaborn already built instead of calling legend(labels=[...]):
# the labels-only form pairs labels with whatever artists come first on the Axes, so the
# swatches are not guaranteed to be the box colours. Entries follow the hue order (0, then 1).
legend = ax.get_legend()
if legend is not None:
    legend.set_title('Stroke Incidence')
    for text, label in zip(legend.get_texts(), ['No Stroke (0)', 'Stroke (1)']):
        text.set_text(label)
plt.show()

Scatter Plot of Age vs BMI by Stroke Incidence-Scatterplot

In [None]:
# Age against BMI, one semi-transparent point per row, coloured by stroke outcome.
custom_palette = ["#457b9d", "#ef233c"]
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x="age", y="bmi", hue="stroke", palette=custom_palette, alpha=0.7, ax=ax)
ax.set(title="Scatter Plot of Age vs BMI by Stroke Incidence", xlabel="Age", ylabel="BMI")
ax.legend(title="Stroke")
plt.show()

Pairplot for Multiple Variables

In [None]:
# Pairwise plots of the frame's columns, coloured by stroke outcome;
# corner=True keeps only the lower triangle of the grid.
sns.pairplot(data=df, hue="stroke", palette="husl", corner=True)
plt.show()

Data Encoding 
- Convert Residence_type to binary
- Convert work_type to separate binary columns 
- Convert smoking_status to binary columns
- Drop the original 'Residence_type', 'work_type', and 'smoking_status' columns

In [None]:
# Binary flag: 1 where the value is exactly 'Urban', 0 for anything else (missing values included).
df['Residence_type'] = df['Residence_type'].eq('Urban').astype('int64')
print("After converting Residence_type to binary (0 = Rural, 1 = Urban):")
display(df[['Residence_type']].head())

In [None]:
# Indicator columns kept from the one-hot encoding of work_type.
work_type_columns = ['work_type_Never_worked', 'work_type_Private', 'work_type_Self-employed']

# One-hot encode work_type and append only the indicators listed above to df.
work_type_dummies = pd.get_dummies(df['work_type'], prefix='work_type')
df = pd.concat([df, work_type_dummies[work_type_columns]], axis=1)

# Show the freshly appended columns as produced by get_dummies.
print("\nAfter converting work_type to binary columns:")
print(df[work_type_columns].head())

# Normalise every indicator to integer 0/1 (a value equal to 1/True becomes 1, anything else 0).
df[work_type_columns] = df[work_type_columns].eq(1).astype('int64')

print("\nAfter ensuring binary columns contain only 0 or 1:")
print(df[work_type_columns].head())

In [None]:
# Indicator columns kept from the one-hot encoding of smoking_status.
binary_columns = ['smoking_status_never smoked', 'smoking_status_formerly smoked', 'smoking_status_smokes']

# One-hot encode smoking_status and append only the indicators listed above to df.
smoking_status_dummies = pd.get_dummies(df['smoking_status'], prefix='smoking_status')
df = pd.concat([df, smoking_status_dummies[binary_columns]], axis=1)

# Show the freshly appended columns as produced by get_dummies.
print("\nAfter converting smoking_status to binary columns:")
print(df[binary_columns].head())

# Normalise every indicator to integer 0/1 (a value equal to 1/True becomes 1, anything else 0).
df[binary_columns] = df[binary_columns].eq(1).astype('int64')

print("\nAfter ensuring binary columns contain only 0 or 1:")
print(df[binary_columns].head())

In [None]:
# Copy of df without the listed source columns; df itself is left untouched.
# NOTE(review): the later modelling cells visible in this notebook use df, not df_model — confirm intent.
columns_to_drop = ['Residence_type', 'work_type', 'smoking_status']
df_model = df.drop(columns=columns_to_drop)
print("\nFinal transformed dataset for data modeling:")
display(df_model.head())

Data Encoding (all remaining columns)
- Encode every remaining non-numeric column in the dataset

In [None]:

# Encode every remaining non-numeric column of df in place:
#   * object columns with exactly two distinct non-null values -> 1/0 (first value seen maps to 1)
#   * any other object column                                  -> one-hot columns (first level dropped)
#   * boolean columns                                          -> 0/1 integers
# Column labels are snapshotted first because df is reassigned inside the loop.
for column in list(df.columns):
    # get_dummies below can recreate labels that earlier cells already appended
    # (work_type_* / smoking_status_*); a duplicated label selects a DataFrame, which is skipped.
    if not isinstance(df[column], pd.Series):
        continue

    if df[column].dtype == 'object':
        # Take the distinct values without NaN: nunique() ignores NaN but unique() does not,
        # so indexing unique() directly could put NaN into the mapping.
        distinct_values = df[column].dropna().unique()
        if len(distinct_values) == 2:
            df[column] = df[column].map({distinct_values[0]: 1, distinct_values[1]: 0})
        else:
            df = pd.get_dummies(df, columns=[column], drop_first=True)
    elif df[column].dtype == bool:
        df[column] = df[column].astype(int)

# Keep only the first occurrence of each column label (duplicates arise from the one-hot step).
df = df.loc[:, ~df.columns.duplicated()]

# Convert the remaining boolean columns (e.g. get_dummies output) to 0/1 integers.
# This replaces the deprecated element-wise DataFrame.applymap with a per-column dtype cast.
bool_columns = df.select_dtypes(include='bool').columns
df = df.astype({name: 'int64' for name in bool_columns})

# Check the transformed DataFrame
print("\nEncoded DataFrame:")
print(df.head())  # Display the first few rows of the encoded DataFrame


Ridge Regression Model
- Accuracy
- RMSE

By importing Ridge Regression function

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
import numpy as np

# Step 1: Data preparation.
# Rows containing any missing value are discarded (df itself is overwritten), then the
# feature matrix and the 'stroke' target are separated.
df = df.dropna()
X = df.drop(columns='stroke')
y = df['stroke'].values

# Step 2: Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Step 3: Standardise the features with statistics learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Fit scikit-learn's Ridge (alpha is the regularisation strength).
ridge_modell = Ridge(alpha=0.5).fit(X_train_scaled, y_train)

# Step 5: Continuous predictions for the held-out rows.
y_predd = ridge_modell.predict(X_test_scaled)

# Scores at or above 0.14 are labelled 1 (stroke), everything below 0.
y_pred_classr = np.where(y_predd >= 0.14, 1, 0)
accuracy = accuracy_score(y_test, y_pred_classr)

# RMSE is computed on the continuous scores, not on the thresholded labels.
rmse = np.sqrt(mean_squared_error(y_test, y_predd))

# Both figures are printed multiplied by 100.
print("Accuracy:", accuracy*100)
print("RMSE:", rmse*100)


Without importing Ridge Regression function

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error

class RidgeRegression:
    """Linear regression with an L2 penalty, fitted by full-batch gradient descent.

    Parameters
    ----------
    lr : float
        Step size applied to every gradient update.
    n_iters : int
        Number of gradient steps performed by ``fit``.
    alpha : float
        Strength of the L2 penalty on the weights (the bias is not penalised).

    Attributes
    ----------
    weights, bias
        ``None`` until ``fit`` is called; afterwards the learned coefficients and intercept.
    """

    def __init__(self, lr=0.01, n_iters=2000, alpha=0.5):
        self.lr, self.n_iters, self.alpha = lr, n_iters, alpha
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Learn weights and bias from ``X`` (n_samples, n_features) and targets ``y``."""
        n_rows, n_cols = X.shape
        self.weights = np.zeros(n_cols)
        self.bias = 0
        scale = 1 / n_rows

        for _ in range(self.n_iters):
            # Residuals of the current model on the whole batch.
            residual = (np.dot(X, self.weights) + self.bias) - y
            # Mean-squared-error gradient plus the L2 penalty term for the weights.
            grad_w = scale * np.dot(X.T, residual) + (self.alpha / n_rows) * self.weights
            grad_b = scale * np.sum(residual)

            self.weights -= self.lr * grad_w
            self.bias -= self.lr * grad_b

    def predict(self, X):
        """Return the linear prediction ``X @ weights + bias``."""
        return np.dot(X, self.weights) + self.bias

# Fit the hand-written ridge model on the standardised training data (500 gradient steps)
# and score the held-out rows.
ridge_model = RidgeRegression(lr=0.01, n_iters=500, alpha=0.5)
ridge_model.fit(X_train_scaled, y_train)
y_pred = ridge_model.predict(X_test_scaled)

# Scores at or above 0.14 are labelled 1 (stroke), everything below 0.
y_pred_class = np.where(y_pred >= 0.14, 1, 0)
accuracy = accuracy_score(y_test, y_pred_class)

# RMSE uses the continuous scores rather than the thresholded labels.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Both figures are printed multiplied by 100.
print("Accuracy:", accuracy * 100, "%")
print("RMSE:", rmse*100)


Linear Regression Model
- RMSE
- Accuracy

In [None]:
#without importing linearregression 
import numpy as np

from sklearn.metrics import mean_squared_error, accuracy_score

class LinearRegression:
    """Ordinary least-squares linear regression fitted by full-batch gradient descent.

    Note: this class shares its name with ``sklearn.linear_model.LinearRegression``, which
    the notebook imports in other cells; whichever definition ran last is the one in effect.

    Parameters
    ----------
    lr : float
        Step size applied to every gradient update.
    n_iters : int
        Number of gradient steps performed by ``fit``.

    Attributes
    ----------
    weights, bias
        ``None`` until ``fit`` is called; afterwards the learned coefficients and intercept.
    """

    def __init__(self, lr=0.001, n_iters=2000):
        self.lr, self.n_iters = lr, n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Learn weights and bias from ``X`` (n_samples, n_features) and targets ``y``."""
        n_rows, n_cols = X.shape
        self.weights = np.zeros(n_cols)
        self.bias = 0
        scale = 1 / n_rows

        for _ in range(self.n_iters):
            # Residuals of the current model on the whole batch.
            residual = (np.dot(X, self.weights) + self.bias) - y
            grad_w = scale * np.dot(X.T, residual)
            grad_b = scale * np.sum(residual)

            self.weights -= self.lr * grad_w
            self.bias -= self.lr * grad_b

    def predict(self, X):
        """Return the linear prediction ``X @ weights + bias``."""
        return np.dot(X, self.weights) + self.bias


# Relies on X_train_scaled, X_test_scaled, y_train and y_test from the Ridge cell above.
linear_model = LinearRegression(lr=0.01, n_iters=500)
linear_model.fit(X_train_scaled, y_train)

# Continuous predictions, then a 0.5 cut-off to obtain 0/1 labels.
y_pred = linear_model.predict(X_test_scaled)
y_pred_class = np.where(y_pred >= 0.5, 1, 0)

# RMSE is computed on the continuous predictions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Accuracy is only meaningful when the target takes no values other than 0 and 1.
if set(y_test) <= {0, 1}:
    accuracy = accuracy_score(y_test, y_pred_class)
    print("Accuracy:", accuracy * 100, "%")
else:
    print("Accuracy cannot be computed for non-binary target variables.")

# Display RMSE (unscaled here, unlike the Ridge cells which print it times 100).
print("RMSE:", rmse)


By importing Linear Regression function

In [None]:
from sklearn.metrics import mean_squared_error # directy importing LinearRegression()
from sklearn.linear_model import LinearRegression
# Fit scikit-learn's LinearRegression on the unscaled training features and
# score the held-out rows.
linear_reg = LinearRegression().fit(X_train, y_train)
linear_reg_predictions = linear_reg.predict(X_test)

# RMSE on the continuous predictions; the cell displays it multiplied by 100.
linear_reg_rmse = np.sqrt(mean_squared_error(y_test, linear_reg_predictions))
linear_reg_rmse*100


In [None]:
# Fraction of test rows whose prediction, rounded to the nearest integer, equals the label.
accuracy_linear = (np.round(linear_reg_predictions) == y_test).mean()
print("Linear Regression Accuracy :", accuracy_linear*100)

Lasso Regression
- RMSE
- Accuracy

In [None]:
# Fit Lasso with its default settings on the unscaled training features and
# score the held-out rows.
lasso_reg = Lasso().fit(X_train, y_train)
lasso_reg_predictions = lasso_reg.predict(X_test)

# RMSE on the continuous predictions; the cell displays it multiplied by 100.
lasso_reg_rmse = np.sqrt(mean_squared_error(y_test, lasso_reg_predictions))
lasso_reg_rmse*100

In [None]:
# Fraction of test rows whose prediction, rounded to the nearest integer, equals the label.
accuracy_lasso = (np.round(lasso_reg_predictions) == y_test).mean()
print("Lasso Regression Accuracy :", accuracy_lasso*100)

Logistic Regression
- RMSE
- Accuracy

In [None]:
import warnings

from sklearn.metrics import accuracy_score

# The previous blanket warnings.filterwarnings("ignore") hid every warning for the rest of the
# session. It is removed; the model is instead fitted on the standardised features from the
# Ridge cell, which is the usual remedy for solver convergence warnings on unscaled inputs.
# NOTE(review): presumably the suppressed warning was a ConvergenceWarning — confirm on a fresh run.
logistic_reg = LogisticRegression()
logistic_reg.fit(X_train_scaled, y_train)

# predict() returns hard class labels, so this RMSE is computed on 0/1 predictions.
logistic_reg_predictions = logistic_reg.predict(X_test_scaled)
logistic_reg_rmse = np.sqrt(mean_squared_error(y_test, logistic_reg_predictions))
print("RSME:",logistic_reg_rmse*100)

In [None]:
# Fraction of test rows whose predicted label (rounded, a no-op for 0/1 labels) equals the true label.
accuracy_logistic = (np.round(logistic_reg_predictions) == y_test).mean()
print("Logistic Regression Accuracy :", accuracy_logistic*100)

Visualizing and comparing all models: RMSE and Accuracy

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Models compared, in the order used by both metric lists below.
models = ['Logistic Regression', 'Linear Regression', 'Lasso Regression', 'Ridge Regression']

# Metrics come from the variables computed in the model cells above rather than from
# hard-coded numbers, so the chart cannot drift from the fitted models.
# The scikit-learn Ridge figures are recomputed from y_predd / y_pred_classr because the
# generic `rmse` and `accuracy` names are overwritten by later cells.
rmse_values = [
    logistic_reg_rmse * 100,
    linear_reg_rmse * 100,
    lasso_reg_rmse * 100,
    np.sqrt(mean_squared_error(y_test, y_predd)) * 100,
]
accuracy_values = [
    accuracy_logistic * 100,
    accuracy_linear * 100,
    accuracy_lasso * 100,
    accuracy_score(y_test, y_pred_classr) * 100,
]

# Create x-axis positions for each model
x = np.arange(len(models))  # Position of the models on x-axis
width = 0.35  # Width of the bars for grouped bar chart

# Grouped bar chart: RMSE bar to the left of each tick, accuracy bar to the right.
fig, ax = plt.subplots(figsize=(10, 6))
bar1 = ax.bar(x - width/2, rmse_values, width, label='RMSE', color='#ef233c')
bar2 = ax.bar(x + width/2, accuracy_values, width, label='Accuracy', color='#457b9d')

# Annotate each bar with its value, rounded to two decimals.
ax.bar_label(bar1, fmt='%.2f', padding=5)
ax.bar_label(bar2, fmt='%.2f', padding=5)

# Set the labels and title
ax.set_xlabel('Models', fontsize=14)
ax.set_ylabel('Percentage ', fontsize=14)
ax.set_title('Comparison of Models: RMSE and Accuracy', fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels(models, fontsize=12)
ax.legend()

# Display the plot
plt.tight_layout()
plt.show()

Precision, Recall, F1 Score, Accuracy Score, Confusion Matrix for Ridge Regression Model

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

# Classification metrics for the thresholded scikit-learn Ridge predictions (y_pred_classr).
precision = precision_score(y_test, y_pred_classr)
recall = recall_score(y_test, y_pred_classr)
f1 = f1_score(y_test, y_pred_classr)
accuracy = accuracy_score(y_test, y_pred_classr)

# Report each metric on its own line.
for metric_name, metric_value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("Accuracy:", accuracy),
):
    print(metric_name, metric_value)

# Confusion matrix of true labels against predicted labels, drawn with a blue colour map.
conf_matrix = confusion_matrix(y_test, y_pred_classr)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=["No Stroke", "Stroke"])
disp.plot(cmap='Blues')

Precision, Recall, F1 Score, Accuracy Score, Confusion Matrix for Lasso Regression Model

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# Convert continuous Lasso predictions into 0/1 labels: scores at or above 0.05 count as stroke.
y_pred_class = (lasso_reg_predictions >= 0.05).astype(int)

# Calculate metrics
precision = precision_score(y_test, y_pred_class)
recall = recall_score(y_test, y_pred_class)
f1 = f1_score(y_test, y_pred_class)
accuracy = accuracy_score(y_test, y_pred_class)

# Print results
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

# Build the confusion matrix from the Lasso predictions themselves; previously the plot
# reused the `conf_matrix` left over from the Ridge cell and showed the wrong model.
conf_matrix = confusion_matrix(y_test, y_pred_class)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=["No Stroke", "Stroke"])
disp.plot(cmap='Blues')

Precision, Recall, F1 Score, Accuracy Score, Confusion Matrix for Logistic Regression Model

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# logistic_reg_predictions comes from LogisticRegression.predict(), which already returns
# class labels, so they are used directly. The former `>= 0.2` threshold line was dead code:
# its result was overwritten before being used.
y_pred_class = logistic_reg_predictions

# Calculate metrics
precision = precision_score(y_test, y_pred_class)
recall = recall_score(y_test, y_pred_class)
f1 = f1_score(y_test, y_pred_class)
accuracy = accuracy_score(y_test, y_pred_class)

# Print results
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

# Build the confusion matrix from the logistic predictions themselves; previously the plot
# reused the `conf_matrix` left over from an earlier cell and showed the wrong model.
conf_matrix = confusion_matrix(y_test, y_pred_class)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=["No Stroke", "Stroke"])
disp.plot(cmap='Blues')

Precision, Recall, F1 Score, Accuracy Score, Confusion Matrix for Linear Regression Model

In [None]:
from sklearn.metrics import (
    precision_score, recall_score, f1_score, accuracy_score,
    confusion_matrix, ConfusionMatrixDisplay,
)

# Convert the continuous LinearRegression predictions into 0/1 labels:
# scores at or above 0.23 count as stroke.
y_pred_class = (linear_reg_predictions >= 0.23).astype(int)

# Calculate the evaluation metrics
precision = precision_score(y_test, y_pred_class)
recall = recall_score(y_test, y_pred_class)
f1 = f1_score(y_test, y_pred_class)
accuracy = accuracy_score(y_test, y_pred_class)

# Print the results
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

# Confusion matrix for these predictions, matching the other per-model metric cells
# (the section heading promises one, but the cell did not produce it).
conf_matrix = confusion_matrix(y_test, y_pred_class)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=["No Stroke", "Stroke"])
disp.plot(cmap='Blues')

Precision Recall Curve -Ridge Regression Model

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt


# Continuous scores for the test rows from `ridge_model`, the hand-written gradient-descent
# ridge model (the scikit-learn one is named `ridge_modell`).
y_pred_prob = ridge_model.predict(X_test_scaled)

# Precision/recall pairs over all score thresholds; the thresholds themselves are discarded.
# Note: this rebinds `precision` and `recall` from scalars to arrays.
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)

# Plot the Precision-Recall curve
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, marker='.', color='#457b9d', label='Ridge Regression')
ax.set_title('Precision-Recall Curve - Ridge Regression', fontsize=16)
ax.set_xlabel('Recall', fontsize=12)
ax.set_ylabel('Precision', fontsize=12)
ax.legend()
ax.grid(True)
plt.show()

Check whether the dataset is class-imbalanced, with a visual representation

In [None]:
# Relative frequency of each target class (the fractions sum to 1).
s = df['stroke'].value_counts(normalize=True)
print(s)

In [None]:
import matplotlib.pyplot as plt

# `s` is rebound here to the raw target column (the previous cell stored its value counts).
s = df['stroke']

# Bar chart of the relative frequency of each target class.
class_share = s.value_counts(normalize=True)
ax = class_share.plot(kind='bar', color='#457b9d')
ax.set_title("Distribution of Classes in Target Variable (Percentage)")
ax.set_xlabel("Class")
ax.set_ylabel("Percentage")
ax.tick_params(axis='x', labelrotation=0)  # keep the class labels horizontal
plt.show()