In [17]:
import io
import pandas as pd
import random
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [4]:
# loading dataset file
from google.colab import files
uploaded = files.upload()

Saving insurance.csv to insurance.csv


In [18]:
# read from to dataset to verify it has been loaded
df = pd.read_csv("/content/insurance.csv")

In [19]:
# checking datatypes for features before converting some of them
print(df.dtypes)

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object


In [7]:
# Handling using maps to convert to other datatype to be numeric for some features
sex_mapping = {'female':0, 'male':1}
smoker_mapping = {'no':0, 'yes':1}
region_mapping = {'southwest': 0, 'southeast': 1, 'northwest': 2, 'northeast': 3}

In [8]:
df['sex'] = df['sex'].map(sex_mapping)
df['smoker'] = df['smoker'].map(smoker_mapping)
df['region'] = df['region'].map(region_mapping)

In [9]:
# check if features are already converted
print(df.dtypes)

age           int64
sex           int64
bmi         float64
children      int64
smoker        int64
region        int64
charges     float64
dtype: object


In [10]:
features = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
X = df[features]
Y = df.charges # Target that needed to be predicted later

In [11]:
# showing some statistics about features before scaling some features as a preprocessing for data also
print(df.describe())

               age          sex          bmi     children       smoker  \
count  1338.000000  1338.000000  1338.000000  1338.000000  1338.000000   
mean     39.207025     0.505232    30.663397     1.094918     0.204783   
std      14.049960     0.500160     6.098187     1.205493     0.403694   
min      18.000000     0.000000    15.960000     0.000000     0.000000   
25%      27.000000     0.000000    26.296250     0.000000     0.000000   
50%      39.000000     1.000000    30.400000     1.000000     0.000000   
75%      51.000000     1.000000    34.693750     2.000000     0.000000   
max      64.000000     1.000000    53.130000     5.000000     1.000000   

            region       charges  
count  1338.000000   1338.000000  
mean      1.484305  13270.422265  
std       1.104885  12110.011237  
min       0.000000   1121.873900  
25%       1.000000   4740.287150  
50%       1.000000   9382.033000  
75%       2.000000  16639.912515  
max       3.000000  63770.428010  


In [12]:
# Scaling features that needs to be scaled because its large range
scaled_features = ['age', 'bmi']
for feature in scaled_features:
    min_age = df[feature].min()
    max_age = df[feature].max()
    avg_age = df[feature].mean()
    df[feature] = (df[feature] - avg_age) / max_age - min_age


In [13]:
# Showing some statistics about features after scaling
print(df.describe())

               age          sex          bmi     children       smoker  \
count  1338.000000  1338.000000  1338.000000  1338.000000  1338.000000   
mean    -18.000000     0.505232   -15.960000     1.094918     0.204783   
std       0.219531     0.500160     0.114779     1.205493     0.403694   
min     -18.331360     0.000000   -16.236744     0.000000     0.000000   
25%     -18.190735     0.000000   -16.042197     0.000000     0.000000   
50%     -18.003235     1.000000   -15.964958     1.000000     0.000000   
75%     -17.815735     1.000000   -15.884142     2.000000     0.000000   
max     -17.612610     1.000000   -15.537139     5.000000     1.000000   

            region       charges  
count  1338.000000   1338.000000  
mean      1.484305  13270.422265  
std       1.104885  12110.011237  
min       0.000000   1121.873900  
25%       1.000000   4740.287150  
50%       1.000000   9382.033000  
75%       2.000000  16639.912515  
max       3.000000  63770.428010  


In [14]:
# spiliting data to training and test sets : 1000 sample for training, 338 for testing
def train_test_split(X, Y, test_size=0.253, random_state=None):
    if random_state is not None:
        random.seed(random_state)

    total_samples = len(X)
    test_samples = int(test_size * total_samples)

    test_indices = random.sample(range(total_samples), test_samples)
    train_indices = [i for i in range(total_samples) if i not in test_indices]

    x_train = X.iloc[train_indices]
    x_test = X.iloc[test_indices]
    y_train = Y.iloc[train_indices]
    y_test = Y.iloc[test_indices]
    return x_train, x_test, y_train, y_test


x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=40)


In [15]:
# Constructing the 1st Model Using SVM
svm_model = svm.SVR()
svm_model.fit(x_train, y_train) # applying the training process for the model
y_pred_svm = svm_model.predict(x_test) # make predictions

In [16]:
# Evaluating SVM Model using Mean Squared Error Rule
mse_svm = mean_squared_error(y_test, y_pred_svm)
print(mse_svm)

162586793.49785438


In [None]:
# Here Construct the 2nd Model Using Linear Regrission


In [None]:
# Evaluate the 2nd Model Using Linear Regrission


In [None]:
# Here Construct the 3rd Model


In [None]:
# Evaluate the 3rd Model
