In [228]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [229]:
dataset_path = "Dataset_Brief.csv"
df = pd.read_csv(dataset_path)

In [230]:
missing_rate = df.isna().sum()/df.shape[0]
missing_rate
df = df.dropna()

In [231]:
print(df.duplicated().shape[0])

duplicates = df.duplicated()
duplicates_data = df[duplicates]


print("\nLignes avec des duplicatas:")
print(duplicates_data)

df = df.drop_duplicates(subset=["age", "sex", "bmi", "children", "smoker", "region"], keep='first')

df.shape[0]

1338

Lignes avec des duplicatas:
     age   sex    bmi  children smoker     region    charges
581   19  male  30.59         0     no  northwest  1639.5631


1335

In [232]:
columns_to_encode = ['sex', 'smoker', 'region']
df['children'] = df['children'].astype(int)
df['bmi'] = df['bmi'].astype(float)
df['charges'] = df['charges'].astype(float)

# Check if columns_to_encode are present in the DataFrame
for col in columns_to_encode:
    if col not in df.columns:
        print(f"Warning: Column '{col}' not found in the DataFrame.")
        columns_to_encode.remove(col)

# Continue with encoding the columns that exist
for col in columns_to_encode:
    unique_values = df[col].unique()
    for value in unique_values:
        new_col_name = f'is_{value.lower().replace(" ", "_")}'
        df[new_col_name] = (df[col] == value).astype(int)


In [233]:
# Create bmi categories
bins = [0, 30, 40, 50, 60, 100]
labels = ['Sous_poids', 'Poids_normal', 'Surpoids', 'Obésité_modérée', 'Obésité_sévère']
df['bmi_category'] = pd.cut(df['bmi'], bins=bins, labels=labels, right=False)

# Create dummy variables for the bmi categories
bmi_dummies = pd.get_dummies(df['bmi_category'], prefix='is', prefix_sep='_', drop_first=True)

# Concatenate the dummy variables with the original DataFrame
df = pd.concat([df, bmi_dummies], axis=1)


# Create age categories
bins = [0, 20, 30, 50, 60, 100]
labels = ['Jeune', 'Jeune_Adulte', 'Adulte', 'Senior', 'Tres_Senior']
df['age_category'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# Create dummy variables for the age categories
age_dummies = pd.get_dummies(df['age_category'], prefix='is', prefix_sep='_', drop_first=True)

# Concatenate the dummy variables with the original DataFrame
df = pd.concat([df, age_dummies], axis=1)


# Sélection des colonnes spécifiques dans le DataFrame
df = df[['age','is_Jeune_Adulte', 'is_Adulte', 'is_Senior', 'is_Tres_Senior', 'is_female', 'is_male', 'bmi', 'is_Poids_normal', 'is_Surpoids', 'is_Obésité_modérée', 'is_Obésité_sévère', 'children', 'is_yes', 'is_no', 'is_southwest', 'is_southeast', 'is_northwest', 'is_northeast', 'charges']]
df = df.rename(columns={'is_yes': 'is_smoker', 'is_no': 'is_not_smoker'})




In [234]:
df.describe()

Unnamed: 0,age,is_Jeune_Adulte,is_Adulte,is_Senior,is_Tres_Senior,is_female,is_male,bmi,is_Poids_normal,is_Surpoids,is_Obésité_modérée,is_Obésité_sévère,children,is_smoker,is_not_smoker,is_southwest,is_southeast,is_northwest,is_northeast,charges
count,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0
mean,39.253933,0.209738,0.401498,0.202996,0.085393,0.494382,0.505618,30.658157,0.459176,0.065918,0.002247,0.0,1.097378,0.205243,0.794757,0.243446,0.27191,0.242697,0.241948,13286.778216
std,14.030779,0.407274,0.490385,0.40238,0.279571,0.500156,0.500156,6.101456,0.498517,0.248231,0.047369,0.0,1.205727,0.404031,0.404031,0.429323,0.445111,0.428874,0.428423,12115.61515
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0,15.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1121.8739
25%,27.0,0.0,0.0,0.0,0.0,0.0,0.0,26.255,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4746.69845
50%,39.0,0.0,0.0,0.0,0.0,0.0,1.0,30.4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,9386.1613
75%,51.0,0.0,1.0,0.0,0.0,1.0,1.0,34.6875,1.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,16717.01075
max,64.0,1.0,1.0,1.0,1.0,1.0,1.0,53.13,1.0,1.0,1.0,0.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,63770.42801


In [235]:
df.to_csv('Clean_Dataset_Brief.csv', index=False)