In [1]:
import pandas as pd
import statsmodels.api as sm

In [2]:
insurance_data = pd.read_csv("DataSets/insurance.csv")
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
insurance_data.shape

(1338, 7)

In [4]:
insurance_data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

###### statsmodels is a Python module that provides classes and functions for the estimation of many different statistical models, as well as for conducting statistical tests, and statistical data exploration.

In [5]:
insurance_data.region.unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

### Encoding

In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
en_sex = le.fit_transform(insurance_data.sex)
en_smoker = le.fit_transform(insurance_data.smoker)
en_region = le.fit_transform(insurance_data.region)

In [4]:
insurance_data['sex'] =  en_sex
insurance_data['smoker'] = en_smoker
insurance_data['region'] = en_region


In [8]:
insurance_data 

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,3,16884.92400
1,18,1,33.770,1,0,2,1725.55230
2,28,1,33.000,3,0,2,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,2,1629.83350
1336,21,0,25.800,0,0,3,2007.94500


In [9]:
X = insurance_data.drop(columns = ['charges'])

In [10]:
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,3
1,18,1,33.770,1,0,2
2,28,1,33.000,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.880,0,0,1
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1
1334,18,0,31.920,0,0,0
1335,18,0,36.850,0,0,2
1336,21,0,25.800,0,0,3


In [11]:
y = insurance_data.charges

In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state=10)

What is OLS?

In [13]:
XX = sm.add_constant(X_train,prepend = False)
print(XX)
# X_train with constants column

      age  sex     bmi  children  smoker  region  const
428    21    0  16.815         1       0       0    1.0
226    28    1  38.060         0       0       2    1.0
867    57    1  43.700         1       0       3    1.0
397    21    1  31.020         0       0       2    1.0
300    36    1  27.550         3       0       0    1.0
...   ...  ...     ...       ...     ...     ...    ...
1180   42    0  41.325         1       0       0    1.0
1147   20    0  31.920         0       0       1    1.0
527    51    0  25.800         1       0       3    1.0
1149   42    1  34.100         0       0       3    1.0
1289   44    1  34.320         1       0       2    1.0

[936 rows x 7 columns]


In [14]:
est = sm.OLS(y_train,XX)
est2 = est.fit()
print(est2.summary())
# Here OLS(a,b): a should be dependent variable and b is independent

# An endogenous variable is a variable in a statistical model
# that's changed or determined by its relationship with other variables within the model. In other words, 
# an endogenous variable is synonymous with a dependent variable

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.761
Model:                            OLS   Adj. R-squared:                  0.760
Method:                 Least Squares   F-statistic:                     494.2
Date:                Tue, 14 Sep 2021   Prob (F-statistic):          5.05e-285
Time:                        20:28:51   Log-Likelihood:                -9479.2
No. Observations:                 936   AIC:                         1.897e+04
Df Residuals:                     929   BIC:                         1.901e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
age          267.0648     14.191     18.819      0.0

In [10]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train,y_train)
pred = reg.predict(X_test)

In [16]:
pred

array([ 8300.25331111,  5898.37327561, 15202.36890326, 11687.23188017,
        3744.64036534,  9717.48857035,  9965.91189885, 26201.36280933,
        8015.08128162, 31276.66456561,  1807.24286867,  4147.07380188,
        7494.67368466, 11456.56430654, 13008.33765694, 11360.33257462,
        5652.22997614, 18402.0423214 , 10769.16902585, 31167.14428256,
       34409.19260264,  4106.30178863,  1189.30452588,  7463.3122754 ,
        4583.2438516 , 13525.43289152, 11813.24171734, 10755.64256762,
        6318.34542428, 31196.9204845 , -1849.14890589, 34141.19663923,
        9645.20311011,  7700.38031302,  8089.82326242, 12048.50626972,
       13607.92344874, 12583.0736086 , 10082.8315077 ,  8588.08764997,
       16134.77015137, 10089.09469913,  5550.92111062, 11512.75228309,
        5266.74290175,  7595.58446318,  6905.54606857, 34652.16949477,
       11325.49924672, 10829.04475023, 32867.71036904,  5543.98780892,
       10332.99479677, 14937.60932216, 11358.93405539, 13568.6005838 ,
      

In [17]:
reg.score(X_test,y_test)

0.7183245518407815

###### Here we drop the region and sex column as they have very low coef

In [6]:
X1 = insurance_data.drop(columns = ["region","sex"])
y1 = insurance_data.charges

In [7]:
X1_train,X1_test,y1_train,y1_test = train_test_split(X1,y1,test_size =0.2, random_state = 10)

In [12]:
est1 = sm.OLS(y1_train,X1_train)
est3 = est1.fit()
print(est3.summary())

                                 OLS Regression Results                                
Dep. Variable:                charges   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          1.644e+33
Date:                Tue, 14 Sep 2021   Prob (F-statistic):                        0.00
Time:                        20:32:33   Log-Likelihood:                          26050.
No. Observations:                1070   AIC:                                 -5.209e+04
Df Residuals:                    1065   BIC:                                 -5.207e+04
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [13]:
reg1 = LinearRegression()
reg1.fit(X1_train,y1_train)
reg1.score(X1_test,y1_test)

1.0