In [11]:
#Step 1 Data preprocessing

In [1]:
#import the necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
#import the dataset
df = pd.read_csv("insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
#drop unnecessary columns
df.drop(['region','children'], axis=1, inplace=True)
df.head()

Unnamed: 0,age,sex,bmi,smoker,charges
0,19,female,27.9,yes,16884.924
1,18,male,33.77,no,1725.5523
2,28,male,33.0,no,4449.462
3,33,male,22.705,no,21984.47061
4,32,male,28.88,no,3866.8552


In [6]:
#encode categorical variables
encoder = LabelEncoder()
df['age'] = encoder.fit_transform(df['age'])
df['sex'] = encoder.fit_transform(df['sex'])
df['bmi'] = encoder.fit_transform(df['bmi'])

df['smoker'] = encoder.fit_transform(df['smoker'])
df['charges'] = encoder.fit_transform(df['charges'])
df.head()

Unnamed: 0,age,sex,bmi,smoker,charges
0,1,0,197,1,1005
1,0,1,350,0,57
2,10,1,331,0,306
3,15,1,73,0,1097
4,14,1,223,0,254


In [7]:
#handle the missing values
df.fillna(df.mean(), inplace=True)

In [9]:
#scale the numerical variables
scaler = StandardScaler()
df[['age', 'sex', 'bmi', 'smoker', ]] = scaler.fit_transform(df[['age','sex', 'bmi', 'smoker', ]])

In [10]:
#Step 2 split the data into training and testing sets

In [15]:
#import the necessary libraries
from sklearn.model_selection import train_test_split

In [13]:
#split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('charges', axis=1), df['charges'], test_size=0.3, random_state=42)

In [14]:
#Step 3 Train the base models

In [16]:
#import the necessary libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [17]:
#train the base models
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)

logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(X_train, y_train)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


RandomForestClassifier(random_state=42)

In [18]:
#finally the base models are trained