# My Learnings :-
* Drop rows whose target is NaN
* Above point ensures y_train , y_test have no NaN
* Handle Non Numeric Values in y_train, y_test, y_pred using
    * `y = pd.to_numeric(y, errors='coerce')`
* Ensure dtype of y_train and y_test is the same

# Task 1 :- Clean up Insurance data

In [1]:
# prompt: Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML Python Essentials/Exercises/Health insurance Prediction/insurance.csv')

In [4]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0.0,yes,southwest,16884.924
1,18.0,male,33.77,1.0,no,Southeast,1725.5523
2,28.0,male,33.0,3.0,no,southeast,$4449.462
3,33.0,male,22.705,0.0,no,northwest,$21984.47061
4,32.0,male,28.88,0.0,no,northwest,$3866.8552


## Let us first gather some info about our dataset

In [5]:
print(f"Number of Datapoints: {len(df)}")
print(f"Columns names: {df.columns}")

Number of Datapoints: 1338
Columns names: Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1272 non-null   float64
 1   sex       1272 non-null   object 
 2   bmi       1272 non-null   float64
 3   children  1272 non-null   float64
 4   smoker    1272 non-null   object 
 5   region    1272 non-null   object 
 6   charges   1284 non-null   object 
dtypes: float64(3), object(4)
memory usage: 73.3+ KB


### Checking for Null values

In [7]:
df.isna().sum()

Unnamed: 0,0
age,66
sex,66
bmi,66
children,66
smoker,66
region,66
charges,54


#### Dropping rows whose target is NaN

In [8]:
df = df.dropna(subset = ['charges'])

In [9]:
df.isna().sum()

Unnamed: 0,0
age,42
sex,37
bmi,45
children,37
smoker,41
region,38
charges,0


### Checking the unique categories in the categorical columns

In [10]:
df['sex'].value_counts()

Unnamed: 0_level_0,count
sex,Unnamed: 1_level_1
male,510
female,492
M,63
woman,61
man,61
F,60


In [11]:
df['children'].value_counts()

Unnamed: 0_level_0,count
children,Unnamed: 1_level_1
0.0,541
1.0,287
2.0,205
3.0,136
4.0,20
-1.0,18
5.0,16
-2.0,12
-3.0,9
-4.0,3


Bruh! What does -4 children mean ???

In [12]:
df['smoker'].value_counts()

Unnamed: 0_level_0,count
smoker,Unnamed: 1_level_1
no,988
yes,255


In [13]:
df['region'].value_counts()

Unnamed: 0_level_0,count
region,Unnamed: 1_level_1
Southeast,168
southeast,166
Northwest,159
southwest,158
Northeast,151
northeast,151
northwest,148
Southwest,145


### Checking stuff in the numerical columns

In [14]:
df.describe()

Unnamed: 0,age,bmi,children
count,1242.0,1239.0,1247.0
mean,35.096618,30.589419,0.949479
std,22.639992,6.115336,1.304953
min,-64.0,15.96,-4.0
25%,24.0,26.21,0.0
50%,38.0,30.25,1.0
75%,51.0,34.54,2.0
max,64.0,53.13,5.0


Its not showing anything wrt `charges` because that column has certain values which start with a dollar

In [15]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0.0,yes,southwest,16884.924
1,18.0,male,33.77,1.0,no,Southeast,1725.5523
2,28.0,male,33.0,3.0,no,southeast,$4449.462
3,33.0,male,22.705,0.0,no,northwest,$21984.47061
4,32.0,male,28.88,0.0,no,northwest,$3866.8552


In [16]:
df.loc[2, ['charges']].dtype

dtype('O')

* So it is thinking of charges as a column containing strings

## Stuff to do :-  
* Convert `df['charges']` to float
    * First remove dollar from strings
    * Next `.astype(float)`
* `df['children']` :-
    * Appropriately handle negative children values
    * Ordinal Encoding is by default existing for this column
* `df['sex']` :-
    * Convert everything to Male or Female
    * Do the one-hot encoding
* `df['region']` :-   
    * First convert everything to lower case
    * Do the one-hot encoding

### Fixing `df['charges']`

In [17]:
def float_string(x):
    if type(x) == float:
        return x
    if type(x) == str:
        return x[1:]

#### Or use `x.replace('$', '')`

In [18]:
df['charges'] = df['charges'].apply(float_string)

In [19]:
df['charges'].unique()

array(['6884.924', '725.5523', '4449.462', ..., '1629.8335', '007.945',
       '9141.3603'], dtype=object)

In [20]:
# Identify and handle non-numeric values
df['charges'] = pd.to_numeric(df['charges'], errors='coerce')

# Drop rows with NaN
df = df.dropna(subset=['charges'])

# Convert to float
df['charges'] = df['charges'].astype(float)

# Verify the conversion
print(df['charges'].dtype)  # Should print 'float64'

float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['charges'] = df['charges'].astype(float)


### Fixing `df['sex']`

In [21]:
"""
male	517
female	503
man	64
M	64
woman	62
F
"""
df.loc[df['sex'] == 'man', 'sex'] = 'male'
df.loc[df['sex'] == 'M', 'sex'] = 'male'
df.loc[df['sex'] == 'woman', 'sex'] = 'female'
df.loc[df['sex'] == 'F', 'sex'] = 'female'

In [22]:
df['sex'].value_counts()

Unnamed: 0_level_0,count
sex,Unnamed: 1_level_1
male,631
female,610


### Fixing `df['region']`

In [23]:
x = 'Hello'
print(x.lower())

hello


In [24]:
def lower_fn(string):
    if type(string) == str:
        return string.lower()
    else :
        return string

In [25]:
df['region'] = df['region'].apply(lower_fn)

In [26]:
df['region'].value_counts()

Unnamed: 0_level_0,count
region,Unnamed: 1_level_1
southeast,329
northwest,304
southwest,302
northeast,302


Apparantly this column is of type int64

In [27]:
df['region'].astype(str)

Unnamed: 0,region
0,southwest
1,southeast
2,southeast
3,northwest
4,northwest
...,...
1333,northwest
1334,northeast
1335,southeast
1336,southwest


### Fixing `df['children']` and `df['age']` :- Making it all positive

In [28]:
def make_pos(x):
    return abs(x)

In [29]:
df['children'] = df['children'].apply(make_pos)
df['age'] = df['age'].apply(make_pos)

In [30]:
df['children'].value_counts()

Unnamed: 0_level_0,count
children,Unnamed: 1_level_1
0.0,538
1.0,304
2.0,217
3.0,142
4.0,23
5.0,16


In [31]:
df['age'].value_counts()

Unnamed: 0_level_0,count
age,Unnamed: 1_level_1
18.0,65
19.0,63
51.0,29
48.0,28
26.0,27
50.0,27
42.0,27
54.0,27
44.0,27
24.0,27


# Task 2 :-  Splitting data :-

In [32]:
y = df['charges']
X = df.drop(columns = ['charges'])

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [34]:
y_train.isna().sum()

0

In [35]:
y_test.isna().sum()

0

In [36]:
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
33,63.0,male,28.31,0.0,no,northwest
162,54.0,male,39.6,1.0,no,southwest
976,48.0,male,40.15,0.0,no,southeast
607,59.0,female,23.655,0.0,yes,northwest
788,29.0,male,22.515,3.0,no,northeast


# Task 3 :- Setup Pipelines and Transforms

In [37]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output= False)
scaler = StandardScaler()
mean_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

In [38]:
cat_cols = ['sex', 'smoker', 'region']
# impute children to escape Nan's
num_cols = ['age', 'bmi']

In [39]:
cat_pipeline = Pipeline(steps = [
    ('freq impute', cat_imputer),
    ('encode', ohe)
                ])

num_pipeline = Pipeline(steps = [
    ('mean impute', mean_imputer),
    ('scaling', scaler)
                ])

In [40]:
ct = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols),
    ('child_col', cat_imputer, ['children'])],
    remainder = 'passthrough',
    n_jobs=-1)

# Task 4 :- Setup and Train Model

In [54]:
from sklearn.linear_model import LinearRegression
import xgboost as xg
from sklearn.ensemble import RandomForestRegressor

In [60]:
model = LinearRegression()
#model = xg.XGBRegressor()
#model = RandomForestRegressor()

In [61]:
final_pipe = Pipeline(steps = [
    ('column_transform', ct),
    ('regression_model', model)
])

In [62]:
final_pipe.fit(X_train, y_train)

# Task 5 :- Evaluate the model

In [63]:
y_pred = final_pipe.predict(X_test)

In [46]:
print(len(y_pred) == len(y_pred))

True


In [47]:
from sklearn.metrics import mean_squared_error

In [48]:
mse = mean_squared_error(y_pred, y_test)

In [49]:
print(mse)

38074687.92128561


In [50]:
from sklearn.metrics import r2_score

In [64]:
r2_score(y_test, y_pred)

0.18851806755686729