Step 1: Import Libraries

In [18]:
import os
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


Step 2: Load Data Using PySpark

In [19]:
# Start Spark session
spark = SparkSession.builder.appName("InsuranceModel").getOrCreate()

# Define file path and load the data into a Spark DataFrame
file_path = r"C:\Users\HP\Desktop\Project-4-Team-2/insurance.csv"
df_spark = spark.read.csv(file_path, header=True, inferSchema=True)

# Convert Spark DataFrame to Pandas DataFrame for use with scikit-learn
df = df_spark.toPandas()
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Step 3: Data Cleaning, Normalization, and Standardization

 #### Check for Missing Values and Handle Them

#### Standardize Numerical Features

In [20]:
# Drop rows with missing values
df.dropna(inplace=True)

# Check the data types and convert categorical columns if necessary
print("Data types before conversion:")
#print(df.dtypes)
print(df.dtypes)

# Convert object columns to categorical if they should be categorical
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype('category')

print("\nData types after conversion:")
print(df.dtypes)

# Standardize numerical features
scaler = StandardScaler()
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

print("\nStandardized numerical features:")
df[numerical_cols].head()


Data types before conversion:
age           int32
sex          object
bmi         float64
children      int32
smoker       object
region       object
charges     float64
dtype: object

Data types after conversion:
age            int32
sex         category
bmi          float64
children       int32
smoker      category
region      category
charges      float64
dtype: object

Standardized numerical features:


Unnamed: 0,bmi,charges
0,-0.45332,0.298584
1,0.509621,-0.953689
2,0.383307,-0.728675
3,-1.305531,0.719843
4,-0.292556,-0.776802


In [21]:
# Convert categorical variables to numeric using one-hot encoding
df = pd.get_dummies(df, drop_first=True)

# Convert categorical variables to numeric using one-hot encoding
df = pd.get_dummies(df, drop_first=True)

In [22]:
# Separate features and target variable
X = df.drop("charges", axis=1)  # Assuming 'charges' is the target column
y = df["charges"]

In [23]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Step 4: Initialize, Train, and Evaluate the Model

In [24]:
lr_model = LinearRegression()
# Perform cross-validation and calculate R-squared scores
cv_scores = cross_val_score(lr_model, X_train_scaled, y_train, cv=5, scoring='r2')  # Using 5-fold cross-validation

# Print the mean and standard deviation of the R-squared scores
print(f"Linear Regression R-squared: {cv_scores.mean()} ± {cv_scores.std()}")

#For Random Forest Classifier:
# Convert y to binary for classification
y_train_class = np.where(y_train > y_train.median(), 1, 0)

# Initialize the random forest classifier
rf_model = RandomForestClassifier(random_state=12)

# Perform cross-validation and calculate accuracy scores
cv_scores_class = cross_val_score(rf_model, X_train_scaled, y_train_class, cv=5, scoring='accuracy')  # Using 5-fold cross-validation
# Print the mean and standard deviation of the accuracy scores
print(f"Random Forest Classification Accuracy: {cv_scores_class.mean()} ± {cv_scores_class.std()}")

Linear Regression R-squared: 0.7331101109097584 ± 0.04890863158789755
Random Forest Classification Accuracy: 0.9242990654205607 ± 0.011214953271028045


Step 5: Model Optimization

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Hyperparameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Perform grid search
grid_search = GridSearchCV(rf_model, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train_class)

# Get the best estimator and evaluate it
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test_scaled)

# Make sure y_test_class is defined as in Step 1 for this evaluation
y_test_class = np.where(y_test > y_test.median(), 1, 0)  # Define this if missing
best_accuracy = accuracy_score(y_test_class, y_pred_best)

print(f"Optimized Random Forest Accuracy: {best_accuracy}")
print(f"Best Hyperparameters: {grid_search.best_params_}")


Optimized Random Forest Accuracy: 0.9104477611940298
Best Hyperparameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}


In [26]:
# Optimize by tuning the Random Forest model's hyperparameters
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(rf_model, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train_class)

best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test_scaled)
best_accuracy = accuracy_score(y_test_class, y_pred_best)

print(f"Optimized Random Forest Accuracy: {best_accuracy}")
print(f"Best Hyperparameters: {grid_search.best_params_}")


Optimized Random Forest Accuracy: 0.9104477611940298
Best Hyperparameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}


Step 6: Save Model Performance Results

In [27]:
# Record optimization process and results
results = {
    'Model': ['Initial RF', 'Optimized RF'],
    'Accuracy': [best_accuracy, best_accuracy],
    'Hyperparameters': ['Default', str(grid_search.best_params_)]
}

results_df = pd.DataFrame(results)
results_df.to_csv("model_optimization_results.csv", index=False)
print("Optimization results saved to CSV.")

Optimization results saved to CSV.


Step 7: Print Final Model Performance

In [28]:
import pandas as pd

# Record optimization process and results
results = {
    'Model': ['Initial RF', 'Optimized RF'],
    'Accuracy': [cv_scores_class.mean(), best_accuracy],  # First is the initial model's score
    'Hyperparameters': ['Default', str(grid_search.best_params_)]
}

results_df = pd.DataFrame(results)
results_df.to_csv("model_optimization_results.csv", index=False)
results_df



Unnamed: 0,Model,Accuracy,Hyperparameters
0,Initial RF,0.924299,Default
1,Optimized RF,0.910448,"{'max_depth': None, 'min_samples_split': 5, 'n..."
