In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Colonnes extraites de adult.names
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'income'
]

# Charger le fichier d'entraînement
df_train = pd.read_csv("adult/adult.data", names=columns, sep=',', skipinitialspace=True)

# Charger le fichier de test
df_test = pd.read_csv("adult/adult.test", names=columns, sep=',', skipinitialspace=True, skiprows=1)

# Concaténer les deux fichiers pour une base unique
df = pd.concat([df_train, df_test], ignore_index=True)


# Nettoyage income (supprimer le point à la fin pour les données test)
#df['income'] = df['income'].apply(lambda x: x.replace('.', '') if isinstance(x, str) else x)


In [None]:
# Taille
print("Nombre de lignes :", df.shape[0])
print("Nombre de colonnes :", df.shape[1])
# Types
print("\nTypes des colonnes :\n", df.dtypes)

In [None]:
# Aperçu
df.head()

In [None]:
df.describe(include='all')

In [None]:
df.replace('?', np.nan, inplace=True)

print(df.isnull().sum())

In [None]:
import matplotlib.pyplot as plt

# Calcul du nombre de valeurs manquantes par colonne
missing_counts = df.isnull().sum()
missing_counts = missing_counts[missing_counts > 0]

# Affichage graphique
plt.figure(figsize=(10, 5))
missing_counts.sort_values().plot(kind='barh', color='coral')
print("Colonnes avec valeurs manquantes :")
print(missing_counts)

plt.title("Valeurs manquantes par colonne")
plt.xlabel("Nombre de valeurs manquantes")
plt.ylabel("Colonnes")
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
print(f"Nombre de doublons : {df.duplicated().sum()}")

In [3]:
# Remplacer les valeurs manquantes 
df['workclass'] = df['workclass'].fillna(df['workclass'].mode()[0])
df['occupation'] = df['occupation'].fillna(df['occupation'].mode()[0])
df['native-country'] = df['native-country'].fillna(df['native-country'].mode()[0])

In [4]:
print(df.isnull().sum())


age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


In [5]:
#Supprimer les doublons
print(f"Nombre de doublons avant suppression : {df.duplicated().sum()}")
df.drop_duplicates(inplace=True)
print(f"Nombre de doublons après suppression : {df.duplicated().sum()}")

Nombre de doublons avant suppression : 29
Nombre de doublons après suppression : 0


In [None]:
# Encoder la variable cible 'income' en binaire
df['income'] = df['income'].apply(lambda x: 1 if '>50K' in x else 0)

In [7]:
# Encoder les variables catégoriques (One-Hot Encoding)
df_encoded = pd.get_dummies(df.drop('income', axis=1))

In [8]:
# Finalement, X et y prêts pour le modèle
X = df_encoded
y = df['income']


In [9]:
print("Données nettoyées et prêtes pour modélisation.")
print(X.head())
print(y.head())

Données nettoyées et prêtes pour modélisation.
   age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0   39   77516             13          2174             0              40   
1   50   83311             13             0             0              13   
2   38  215646              9             0             0              40   
3   53  234721              7             0             0              40   
4   28  338409             13             0             0              40   

   workclass_?  workclass_Federal-gov  workclass_Local-gov  \
0        False                  False                False   
1        False                  False                False   
2        False                  False                False   
3        False                  False                False   
4        False                  False                False   

   workclass_Never-worked  ...  native-country_Portugal  \
0                   False  ...                    False   

In [11]:
# Feature Engineering
# Tranches d'âge
bins = [0, 25, 45, 65, 100]
labels = ['Jeune', 'Adulte', 'Senior', 'Très Senior']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)

# Capital net
df['capital_net'] = df['capital-gain'] - df['capital-loss']

#  Heures longues de travail
df['travail_long'] = (df['hours-per-week'] > 40).astype(int)

#  Regroupement pays
top_countries = df['native-country'].value_counts().nlargest(5).index
df['native_country_grouped'] = df['native-country'].apply(lambda x: x if x in top_countries else 'Autres')

#  Mariage
married_status = ['Married-civ-spouse', 'Married-AF-spouse', 'Married-spouse-absent']
df['married'] = df['marital-status'].apply(lambda x: 1 if x in married_status else 0)

# Expérience estimée
df['experience_years'] = df['age'] - df['education-num'] - 6
df['experience_years'] = df['experience_years'].apply(lambda x: max(x, 0))

print(df[['age', 'age_group', 'capital_net', 'travail_long', 'native_country_grouped', 'married', 'experience_years']].head())


   age age_group  capital_net  travail_long native_country_grouped  married  \
0   39    Adulte         2174             0          United-States        0   
1   50    Senior            0             0          United-States        1   
2   38    Adulte            0             0          United-States        0   
3   53    Senior            0             0          United-States        1   
4   28    Adulte            0             0                 Autres        1   

   experience_years  
0                20  
1                31  
2                23  
3                40  
4                 9  


In [14]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,capital-loss,hours-per-week,native-country,income,age_group,capital_net,travail_long,native_country_grouped,married,experience_years
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,0,40,United-States,0,Adulte,2174,0,United-States,0,20
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,0,13,United-States,0,Senior,0,0,United-States,1,31
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,0,40,United-States,0,Adulte,0,0,United-States,0,23
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,0,40,United-States,0,Senior,0,0,United-States,1,40
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,0,40,Cuba,0,Adulte,0,0,Autres,1,9
