In [33]:
import pandas as pd
import numpy as np

# load the dataset
insurance = pd.read_csv("insurance.csv")

# convert categorical variables to numerical values
insurance['sex'] = insurance['sex'].apply({'male': 0, 'female': 1}.get)
insurance['smoker'] = insurance['smoker'].apply({'yes': 1, 'no': 0}.get)
insurance['region'] = insurance['region'].apply({'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4}.get)

# define features (X) and the target variable (y)
X = insurance[['age', 'sex', 'bmi', 'children', 'smoker', 'region']]
y = insurance['charges']

# scale the features
X_mean = X.mean()
X_std = X.std()
X_scaled = (X - X_mean) / X_std

# Split the dataset into 70% training and 30% testing sets
split_ratio = 0.7
split_index = int(split_ratio * len(insurance))

X_train = X_scaled[:split_index]
y_train = y[:split_index]
X_test = X_scaled[split_index:]
y_test = y[split_index:]


# Implement Linear Regression from scratch with a suitable learning rate
def linear_regression(X, y, learning_rate, iterations):
    n_samples, n_features = X.shape
    weights = np.zeros(n_features)
    bias = 0

    for _ in range(iterations):
        y_predicted = np.dot(X, weights) + bias
        dw = (1/n_samples) * np.dot(X.T, (y_predicted - y))
        db = (1/n_samples) * np.sum(y_predicted - y)
        
        weights -= learning_rate * dw
        bias -= learning_rate * db

    # Calculate Mean Squared Error manually
    mse = np.mean((y_predicted - y) ** 2)
    
    return weights, bias, mse


# A more suitable learning rate
learning_rate = 0.1
# increased number of iterations for better convergence
iterations = 10000  
weights, bias, mse = linear_regression(X_train, y_train, learning_rate, iterations)

print("Mean Squared Error:", mse)

# Predict charges for a new patient
new_patient_data = np.array([24, 1, 23.50, 0, 0, 3])
new_patient_data_scaled = (new_patient_data - X_mean) / X_std
new_patient_charges = np.dot(new_patient_data_scaled, weights) + bias
print("Predicted charges for the new patient:", new_patient_charges)


Mean Squared Error: 34244988.80237877
Predicted charges for the new patient: 1796.5802548315423
