In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv("datasets/medical.csv")

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
data["region"].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [7]:
data["children"].unique()

array([0, 1, 3, 2, 5, 4], dtype=int64)

In [8]:
data["age"].unique()

array([19, 18, 28, 33, 32, 31, 46, 37, 60, 25, 62, 23, 56, 27, 52, 30, 34,
       59, 63, 55, 22, 26, 35, 24, 41, 38, 36, 21, 48, 40, 58, 53, 43, 64,
       20, 61, 44, 57, 29, 45, 54, 49, 47, 51, 42, 50, 39], dtype=int64)

In [10]:
#minimum age
data["age"].min()

18

In [11]:
data["children"].min()

0

In [12]:
data["charges"].min()

1121.8739

In [13]:
data["charges"].max()

63770.42801

In [15]:
data.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder

In [23]:
regression = LinearRegression()
encoder = OneHotEncoder()

In [31]:
#transforming the sex value
en_sex = encoder.fit_transform(data[["sex"]]).toarray()

In [32]:
en_sex

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [39]:
data["sex"] = en_sex
#0 female
#1 male

In [40]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,yes,southwest,16884.924
1,18,1.0,33.77,1,no,southeast,1725.5523
2,28,1.0,33.0,3,no,southeast,4449.462
3,33,1.0,22.705,0,no,northwest,21984.47061
4,32,1.0,28.88,0,no,northwest,3866.8552


In [41]:
#encoding the smoker value
en_smoke = encoder.fit_transform(data[["smoker"]]).toarray()

In [45]:
#0 yes
#1 no
en_smoke

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]])

In [46]:
data["smoker"] = en_smoke

In [47]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,0.0,southwest,16884.924
1,18,1.0,33.77,1,1.0,southeast,1725.5523
2,28,1.0,33.0,3,1.0,southeast,4449.462
3,33,1.0,22.705,0,1.0,northwest,21984.47061
4,32,1.0,28.88,0,1.0,northwest,3866.8552


In [48]:
input = data[["age","bmi","children","smoker"]]
output = data["charges"]

In [60]:
x_train,x_test,y_train,y_test = train_test_split(input,output,test_size=0.3,random_state=0)

In [61]:
model = regression.fit(x_train,y_train)

In [62]:
prediction = model.predict(x_test)

In [63]:
r2_score(prediction,y_test)

0.710083675826441