# VAUTECH IT SOLUTIONS – TASK 3
 
 
 **Intern**: Ansh Verma

 **Intern ID**: VT26DS001

 **Domain**: Data Science

 **Company**: VAUTECH IT SOLUTIONS

 **Mentor**: Vishal Rajbhar

**Task**: Data Ingestion & Quality Assessment

# Objective

To clean the dataset (handling missing and incorrect values and removing duplicates) and to enrich it with engineered features, so that the data is ready for reliable analysis.

In [2]:
# Import & Load Data

import pandas as pd
import numpy as np

# Read the insurance records from a CSV file in the current working directory.
df = pd.read_csv("insurance.csv")
# Preview the first five rows to confirm the file parsed as expected.
df.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,,female,27.9,0,yes,southwest,16884.924
1,18.0,male,33.77,1,no,southeast,1725.5523
2,28.0,male,33.0,3,no,southeast,4449.462
3,33.0,male,22.705,0,no,northwest,21984.47061
4,32.0,male,28.88,0,no,northwest,3866.8552


In [3]:
# Missing-value audit
## Count the null entries in each column.

df.isna().sum()


age         5
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
# Impute missing ages with the mean of the `age` column.
# `age` is the only column reported with nulls, so it is targeted explicitly:
# a frame-wide fillna(df.mean()) would also silently mean-fill any other
# numeric column (e.g. the integer `children` count or `charges`) if gaps
# ever appeared there.

df['age'] = df['age'].fillna(df['age'].mean())


In [20]:
# Re-count nulls per column to confirm that no gaps remain.
# NOTE: the saved output lists the engineered columns, so this cell was last
# executed after the feature-engineering cells further down.

df.isna().sum()

age                  0
sex                  0
bmi                  0
children             0
smoker               0
charges              0
bmi_category         0
age_group            0
high_risk            0
charges_per_child    0
region_northwest     0
region_southeast     0
region_southwest     0
dtype: int64

In [6]:
# Is any row an exact repeat of an earlier row?

df.duplicated(keep='first').any()

np.True_

In [7]:
# Show the duplicated rows (every occurrence after the first).

df.loc[df.duplicated()]


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
581,19.0,male,30.59,0,no,northwest,1639.5631


In [8]:
# Count the duplicate rows (first occurrences are not counted).

df.duplicated().sum()


np.int64(1)

In [9]:
# Remove duplicate rows in place, keeping the first occurrence of each.
# The index is not reset, so its labels keep a gap where the row was dropped.

df.drop_duplicates(inplace=True)


In [10]:
# Bucket BMI into the standard weight categories:
#   Underweight < 18.5 <= Normal < 25 <= Overweight < 30 <= Obese
# right=False makes each bin closed on the left, so the boundary values
# 18.5, 25 and 30 fall into the higher category (pd.cut's default,
# right=True, would put them in the lower one). The open-ended top bin keeps
# very large values from turning into NaN.

df['bmi_category'] = pd.cut(
    df['bmi'],
    bins=[0, 18.5, 25, 30, np.inf],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
    right=False,
)


In [11]:
# Bucket age into bands to support age-based analysis.
# Bins are right-closed: (0, 18], (18, 35], (35, 50], (50, 100].

age_edges = [0, 18, 35, 50, 100]
age_labels = ['Teen', 'Young Adult', 'Adult', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=age_edges, labels=age_labels)


In [12]:
# Flag smokers whose BMI is strictly above 30:
## high_risk = 1 for those rows, 0 for every other row.
# Must run while `smoker` still holds the 'yes'/'no' strings.

bmi_over_30 = df['bmi'] > 30
is_smoker = df['smoker'] == 'yes'
df['high_risk'] = np.where(bmi_over_30 & is_smoker, 1, 0)


In [13]:
# Spread charges over (children + 1). The +1 keeps the divisor non-zero,
# so rows with no children keep their full charge instead of dividing by 0.

divisor = df['children'] + 1
df['charges_per_child'] = df['charges'] / divisor


In [14]:
# Encode the two binary text columns as 0/1 integers.
# Series.map turns any value missing from the dictionary into NaN.

sex_codes = {'male': 0, 'female': 1}
smoker_codes = {'no': 0, 'yes': 1}
df['sex'] = df['sex'].map(sex_codes)
df['smoker'] = df['smoker'].map(smoker_codes)


In [None]:
# One-hot encode `region`. drop_first=True omits the first category's column,
# leaving region_northwest, region_southeast and region_southwest; a row with
# all three False belongs to the dropped baseline region.

df = pd.get_dummies(df, columns=['region'], drop_first=True)


In [None]:
# Display the transformed DataFrame.
df

Unnamed: 0,age,sex,bmi,children,smoker,charges,bmi_category,age_group,high_risk,charges_per_child,region_northwest,region_southeast,region_southwest
0,39.182296,1,27.900,0,1,16884.92400,Overweight,Adult,0,16884.924000,False,False,True
1,18.000000,0,33.770,1,0,1725.55230,Obese,Teen,0,862.776150,False,True,False
2,28.000000,0,33.000,3,0,4449.46200,Obese,Young Adult,0,1112.365500,False,True,False
3,33.000000,0,22.705,0,0,21984.47061,Normal,Young Adult,0,21984.470610,True,False,False
4,32.000000,0,28.880,0,0,3866.85520,Overweight,Young Adult,0,3866.855200,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50.000000,0,30.970,3,0,10600.54830,Obese,Adult,0,2650.137075,True,False,False
1334,18.000000,1,31.920,0,0,2205.98080,Obese,Teen,0,2205.980800,False,False,False
1335,18.000000,1,36.850,0,0,1629.83350,Obese,Teen,0,1629.833500,False,True,False
1336,21.000000,1,25.800,0,0,2007.94500,Overweight,Young Adult,0,2007.945000,False,False,True


In [None]:
# Final check: print the schema summary, then preview the first rows.
# df.info() prints its report and returns None, so df.head() goes last: a
# notebook renders only the value of a cell's final expression, and with
# head() first its result was silently discarded.

df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
Index: 1337 entries, 0 to 1337
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   age                1337 non-null   float64 
 1   sex                1337 non-null   int64   
 2   bmi                1337 non-null   float64 
 3   children           1337 non-null   int64   
 4   smoker             1337 non-null   int64   
 5   charges            1337 non-null   float64 
 6   bmi_category       1337 non-null   category
 7   age_group          1337 non-null   category
 8   high_risk          1337 non-null   int64   
 9   charges_per_child  1337 non-null   float64 
 10  region_northwest   1337 non-null   bool    
 11  region_southeast   1337 non-null   bool    
 12  region_southwest   1337 non-null   bool    
dtypes: bool(3), category(2), float64(4), int64(4)
memory usage: 100.9 KB
