Step 1: Import Libraries

In [7]:
import os

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Step 2: Load Data Using PySpark

In [16]:
# Start (or reuse) a Spark session for this notebook
spark = SparkSession.builder.appName("InsuranceModel").getOrCreate()

# Location of the insurance CSV. The INSURANCE_CSV environment variable
# overrides the default so the notebook can run on machines other than the
# original author's; when it is unset, the original hard-coded path is used.
file_path = os.environ.get(
    "INSURANCE_CSV",
    r"C:\Users\HP\Desktop\Project-4-Team-2/insurance.csv",
)

# Read the CSV with a header row, letting Spark infer the column types
df_spark = spark.read.csv(file_path, header=True, inferSchema=True)

# Convert Spark DataFrame to Pandas DataFrame for use with scikit-learn
df = df_spark.toPandas()
print(df.head())

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


## Step 3: Data Cleaning, Normalization, and Standardization

#### Check for Missing Values and Handle Them

#### Standardize Numerical Features

In [25]:
# Drop rows with missing values. Reassigning (instead of inplace=True) keeps
# the step chainable and avoids mutating the loaded frame in place.
df = df.dropna()

# Inspect the column dtypes; this cell only reports them, it does not convert
print(df.dtypes)

age                   int32
bmi                 float64
children              int32
charges             float64
sex_male               bool
smoker_yes             bool
region_northwest       bool
region_southeast       bool
region_southwest       bool
dtype: object


In [26]:
# Convert categorical variables to numeric using one-hot encoding.
# drop_first=True drops the first level of each categorical column so the
# remaining indicator columns are not redundant with one another.
df = pd.get_dummies(df, drop_first=True)

In [27]:
# Target is the 'charges' column; every other column is used as a feature
y = df["charges"]
X = df.drop(columns="charges")

In [28]:
# Hold out 20% of the rows as a test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Step 4: Initialize, Train, and Evaluate the Model

In [None]:
# --- Linear regression: fit on the scaled training data, report R-squared ---
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

y_pred = lr_model.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)
print(f"Linear Regression R-squared: {r2}")

# --- Random Forest classifier: report accuracy ---
# The continuous target is binarised for classification: 1 when the value is
# above the median of the *training* target, 0 otherwise. The same training
# median is used as the cut-off for the test labels.
train_median = y_train.median()
y_train_class = np.where(y_train > train_median, 1, 0)
y_test_class = np.where(y_test > train_median, 1, 0)

rf_model = RandomForestClassifier(random_state=12)
rf_model.fit(X_train_scaled, y_train_class)

# Predict class labels for the test split and score them
y_pred_class = rf_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"Random Forest Classification Accuracy: {accuracy}")

Linear Regression R-squared: 0.7835929767120722
Random Forest Classification Accuracy: 0.9328358208955224


Step 5: Model Optimization

In [30]:
# Optimize by tuning the Random Forest model's hyperparameters.
# GridSearchCV is imported with the other dependencies in the import cell at
# the top of the notebook.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive search over param_grid using 3-fold cross-validation on the
# training split, selecting the combination with the best accuracy
grid_search = GridSearchCV(rf_model, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train_class)

# Evaluate the best estimator found by the search on the held-out test split
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test_scaled)
best_accuracy = accuracy_score(y_test_class, y_pred_best)

print(f"Optimized Random Forest Accuracy: {best_accuracy}")
print(f"Best Hyperparameters: {grid_search.best_params_}")


Optimized Random Forest Accuracy: 0.9402985074626866
Best Hyperparameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}


Step 6: Save Model Performance Results

In [31]:
# Collect the baseline and tuned Random Forest scores side by side
results = {
    'Model': ['Initial RF', 'Optimized RF'],
    'Accuracy': [accuracy, best_accuracy],
    'Hyperparameters': ['Default', str(grid_search.best_params_)],
}

# Persist the comparison table as a CSV (without the index column)
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf="model_optimization_results.csv", index=False)
print("Optimization results saved to CSV.")


Optimization results saved to CSV.


Step 7: Print Final Model Performance

In [34]:
# Final output of model performance.
# Minimum scores a model must reach to be reported as final. The failure
# messages are formatted from these constants so the check and the message
# cannot drift apart.
# NOTE(review): the R-squared failure message previously cited 0.80 while the
# check itself used 0.75; the check's value is kept here so the reported
# outcome is unchanged — confirm which threshold is actually required.
R2_THRESHOLD = 0.75
ACCURACY_THRESHOLD = 0.75

if r2 >= R2_THRESHOLD:
    print(f"Final Linear Regression Model R-squared: {r2}")
else:
    print(
        "Linear Regression model did not meet the required R-squared "
        f"threshold of {R2_THRESHOLD:.2f}."
    )

if best_accuracy >= ACCURACY_THRESHOLD:
    print(f"Final Random Forest Model Accuracy: {best_accuracy}")
else:
    print(
        "Random Forest model did not meet the required accuracy "
        f"threshold of {ACCURACY_THRESHOLD:.0%}."
    )


Final Linear Regression Model R-squared: 0.7835929767120722
Final Random Forest Model Accuracy: 0.9402985074626866
