In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Necessary Libraries:
-------

In [46]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

## Load and Prepare Data:
_______
load the dataset using Pandas and prepare the features (X) and target variable (y).

In [23]:
def load_data(file_path):
    data = pd.read_csv(file_path)
    X = data[['age', 'avg_glucose_level']].values
    y = data['stroke'].values
    return X, y

## Split Data into Training and Testing Sets:
_____________
Use the train_test_split function from Scikit-Learn to split the data into training and testing sets.

In [24]:
def split_data(X, y, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

## Implement Linear Regression:
_____________
Define a function to implement linear regression using gradient descent.

In [29]:
def gradient_descent(X, y, learning_rate=0.01, num_iterations=1000):
    m, n = X.shape
    theta = np.zeros(n)
    
    for _ in range(num_iterations):
        predictions = np.dot(X, theta)
        errors = y - predictions
        gradient = -2/m * np.dot(X.T, errors)
        
        # Update theta with adaptive learning rate
        learning_rate *= np.linalg.norm(gradient) / np.linalg.norm(X)  # Adaptive learning rate
        theta -= learning_rate * gradient
        
    return theta

## Train the Model:
_____________
Combine the data loading, splitting, and linear regression steps to train the model.

In [30]:
def train_model(X_train, y_train):
    theta = gradient_descent(X_train, y_train)
    return theta

## Evaluate the Model:
_____________
Define a function to evaluate the trained model on the testing set using Mean Squared Error.

In [31]:
def evaluate_model(theta, X_test, y_test):
    predictions = np.dot(X_test, theta)
    mse = mean_squared_error(y_test, predictions)
    return mse

## Calculate accuracy
_____________

In [33]:
# Additional function to calculate accuracy
def calculate_accuracy(predictions, threshold=0.5):
    binary_predictions = np.where(predictions >= threshold, 1, 0)
    return binary_predictions

## Putting it All Together:
_____________

In [47]:
def main():
    file_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
    X, y = load_data(file_path)
    X_train, X_test, y_train, y_test = split_data(X, y)
    theta = train_model(X_train, y_train)
    mse = evaluate_model(theta, X_test, y_test)
    print("Mean Squared Error:", mse)
    binary_predictions = calculate_accuracy(predictions)
    accuracy = accuracy_score(y_test, binary_predictions)
    print("Accuracy:", accuracy)

if __name__ == "__main__":
    main()

Mean Squared Error: 0.05725120323739136
Accuracy: 0.9393346379647749


## Inference
______

The values indicate the performance of the linear regression model on the testing data. Let's break down the implications of the Mean Squared Error (MSE) and the Accuracy:

Mean Squared Error (MSE):

The MSE value shows 0.05725120323739136, which represents the average squared difference between the actual target values and the predicted values made by our linear regression model. A lower MSE indicates that the model's predictions are closer to the actual values, which is generally desirable. In this context, the MSE suggests that, on average, the squared difference between the predicted and actual stroke probabilities is relatively small.

Accuracy:

The accuracy value shows 0.9393346379647749, which represents the proportion of correct predictions made by your model among all the predictions on the testing data. An accuracy of 0.9393 (or approximately 94%) indicates that our model correctly classified 94% of the instances in the testing set based on the predicted stroke probabilities. This is a measure of the model's overall correctness in terms of its predictions.

**In summary:**

**The low MSE value suggests that the model's predictions are reasonably close to the actual stroke probabilities.**

**The high accuracy value indicates that the model's predictions are correct for around 94% of the instances in the testing set.**