In [1]:
import pandas as pd

In [2]:
# Read adult.csv from the working directory into the notebook-wide frame `df`.
df = pd.read_csv(filepath_or_buffer="adult.csv")
# Leaving the frame as the last expression makes the notebook render a preview.
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## Data Inspection
Data inspection is the initial review of a dataset to find missing values, incorrect data types, and gather basic statistics, providing insights into its quality and structure.

In [3]:
# Count, per column, the cells pandas already recognises as missing
null_mask = df.isnull()
null_mask.sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [4]:
# Count, per column, the cells holding the literal '?' string, which is
# treated here as a stand-in for a missing value
is_placeholder = df == '?'
is_placeholder.sum()

age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

In [5]:
# Identify Data Types
# Shows the dtype of every column of df as it stands after loading.
df.dtypes

age                 int64
workclass          object
fnlwgt              int64
education          object
educational-num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income             object
dtype: object

In [6]:
# Initial analysis before cleaning
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


## Cleaning Data
Cleaning data involves eliminating or rectifying inaccuracies, inconsistencies, and missing values within your dataset, utilizing techniques such as handling missing values via deletion or imputation, rectifying data types, and detecting and eliminating duplicate entries, ultimately resulting in more precise and dependable analysis.

#### Objective number 1: turn question marks into null

In [7]:
# Swap every '?' placeholder for pd.NA, modifying df in place, so that pandas
# treats those cells as missing
df.replace(to_replace='?', value=pd.NA, inplace=True)
# Re-count nulls per column to confirm the placeholders are now seen as missing
missing_per_column = df.isnull().sum()
missing_per_column

age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

In [9]:
# Fill the missing occupations with the placeholder label "Unemployed"
# and store the filled column back on df
occupation_filled = df['occupation'].fillna("Unemployed")
df['occupation'] = occupation_filled
df['occupation']

0        Machine-op-inspct
1          Farming-fishing
2          Protective-serv
3        Machine-op-inspct
4               Unemployed
               ...        
48837         Tech-support
48838    Machine-op-inspct
48839         Adm-clerical
48840         Adm-clerical
48841      Exec-managerial
Name: occupation, Length: 48842, dtype: object

### **Trimming and Cleaning Text Data**
```python
# replace null values with a placeholder value
df['occupation'] = df['occupation'].fillna("Unemployed")
df['occupation']
# drop rows with null values
df.dropna(inplace=True)
df
```

In [10]:
# Translate a few abbreviated occupation labels into readable names
occupation_mapping = {
    'Machine-op-inspct': 'Machine Operator',
    'Farming-fishing': 'Farming and Fishing',
    'Protective-serv': 'Protective Services',
}
# .map() yields NaN for labels that are not keys of the mapping, so those
# are restored from the original column. The result is only displayed;
# it is not assigned back to df.
mapped_occupation = df['occupation'].map(occupation_mapping)
mapped_occupation.fillna(df['occupation'])

0           Machine Operator
1        Farming and Fishing
2        Protective Services
3           Machine Operator
4                 Unemployed
                ...         
48837           Tech-support
48838       Machine Operator
48839           Adm-clerical
48840           Adm-clerical
48841        Exec-managerial
Name: occupation, Length: 48842, dtype: object

In [11]:
# Remove, in place, every row that still holds a null in any column
df.dropna(axis=0, how='any', inplace=True)
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [12]:
# Substitute a space for every hyphen inside the occupation labels.
# The new Series is only displayed; df keeps its original labels.
df['occupation'].replace(to_replace='-', value=' ', regex=True)

0        Machine op inspct
1          Farming fishing
2          Protective serv
3        Machine op inspct
5            Other service
               ...        
48837         Tech support
48838    Machine op inspct
48839         Adm clerical
48840         Adm clerical
48841      Exec managerial
Name: occupation, Length: 45232, dtype: object

In [13]:
# Replace whole integer values: every educational-num equal to 9 becomes 19.
# This is an exact-value match, not a partial one. regex matching only
# applies to string patterns, so the regex flag is not passed here.
# The result is a new Series that is only displayed; df is left unchanged.
df['educational-num'].replace(9, 19)

0         7
1        19
2        12
3        10
5         6
         ..
48837    12
48838    19
48839    19
48840    19
48841    19
Name: educational-num, Length: 45232, dtype: int64

In [14]:
# Display df to inspect its current contents
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


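**Q:** Does this replacement affect the original df? I ran `df` afterwards, but the values didn't seem to have changed.
**A:** No, it doesn't. `replace` returns a new Series, and we never assigned that result back to `df['educational-num']`, so the original df is untouched. To keep the change, assign it back: `df['educational-num'] = df['educational-num'].replace(9, 19)`.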

This is like a clone with replaced values! 😄

In [15]:
# Replace whole integer values: every educational-num equal to 9 becomes 19.
# This is an exact-value match, not a partial one. regex matching only
# applies to string patterns, so the regex flag is not passed here.
# The result is a new Series that is only displayed; df is left unchanged.
df['educational-num'].replace(9, 19)

0         7
1        19
2        12
3        10
5         6
         ..
48837    12
48838    19
48839    19
48840    19
48841    19
Name: educational-num, Length: 45232, dtype: int64

In [16]:
# Convert the income column to the categorical dtype and store it back on df
income_as_category = df['income'].astype('category')
df['income'] = income_as_category
# Show the dtypes again to confirm the conversion
df.dtypes

age                   int64
workclass            object
fnlwgt                int64
education            object
educational-num       int64
marital-status       object
occupation           object
relationship         object
race                 object
gender               object
capital-gain          int64
capital-loss          int64
hours-per-week        int64
native-country       object
income             category
dtype: object

### **Renaming columns and Reindexing**

In [17]:
# Rename two columns (old name -> new name), modifying df in place
renamed_columns = {'native-country': "Country", 'hours-per-week': 'Working Hours'}
df.rename(columns=renamed_columns, inplace=True)
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,Working Hours,Country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [18]:
# Reindexing - produce a frame holding only the listed columns, in this order.
# The result is only displayed; df keeps all of its columns.
columns_of_interest = ['age', 'gender', 'Country', 'income', 'occupation']
df.reindex(columns=columns_of_interest)

Unnamed: 0,age,gender,Country,income,occupation
0,25,Male,United-States,<=50K,Machine-op-inspct
1,38,Male,United-States,<=50K,Farming-fishing
2,28,Male,United-States,>50K,Protective-serv
3,44,Male,United-States,>50K,Machine-op-inspct
5,34,Male,United-States,<=50K,Other-service
...,...,...,...,...,...
48837,27,Female,United-States,<=50K,Tech-support
48838,40,Male,United-States,>50K,Machine-op-inspct
48839,58,Female,United-States,<=50K,Adm-clerical
48840,22,Male,United-States,<=50K,Adm-clerical


### **Filtering and Selecting Data**
Filtering and selecting data are fundamental for focusing analysis on specific segments. Techniques include Boolean indexing for conditional selection, using .query() for complex queries, and filtering data based on conditions.

In [19]:
# .query() for indexing/filtering
# Column names may be wrapped in backticks (`) inside the query string. The
# backticks are only required when a name is not a valid Python identifier
# (for example, it contains a hyphen or a space). Conditions can be combined
# for more complex querying.
high_income = df.query('`income` == ">50K"')
high_income

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,Working Hours,Country,income
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
10,65,Private,184454,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,6418,0,40,United-States,>50K
14,48,Private,279724,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,3103,0,48,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48819,38,Private,139180,Bachelors,13,Divorced,Prof-specialty,Unmarried,Black,Female,15020,0,45,United-States,>50K
48826,39,Local-gov,111499,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,20,United-States,>50K
48835,53,Private,321865,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K


In [20]:
# Logical/compound operators: both conditions must hold for a row to be kept.
# `educational-num` needs backticks because of the hyphen in its name.
wife_and_edu13 = "`relationship` == 'Wife' and `educational-num` == 13"
df.query(wife_and_edu13)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,Working Hours,Country,income
208,34,Private,357145,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,50,United-States,>50K
376,28,Private,302903,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Asian-Pac-Islander,Female,0,1485,40,United-States,<=50K
409,38,Private,272476,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,24,United-States,>50K
480,39,Private,85783,Bachelors,13,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,28,United-States,<=50K
581,37,Self-emp-not-inc,143774,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,40,Germany,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48313,40,State-gov,31627,Bachelors,13,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,20,United-States,<=50K
48345,49,Private,93639,Bachelors,13,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,43,United-States,<=50K
48533,37,Self-emp-not-inc,103925,Bachelors,13,Married-civ-spouse,Sales,Wife,White,Female,0,0,50,United-States,<=50K
48628,36,State-gov,212143,Bachelors,13,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,20,United-States,>50K


In [21]:
# List the distinct labels found in the occupation column
occupation_column = df['occupation']
occupation_column.unique()

array(['Machine-op-inspct', 'Farming-fishing', 'Protective-serv',
       'Other-service', 'Prof-specialty', 'Craft-repair', 'Adm-clerical',
       'Exec-managerial', 'Tech-support', 'Sales', 'Priv-house-serv',
       'Transport-moving', 'Handlers-cleaners', 'Armed-Forces',
       'Unemployed'], dtype=object)

Can we drop rows based on the value of a column? You can try `df = df[df["educational-num"] != 7]`

This removes the rows whose educational-num is 7 from the original dataframe, keeping only the rows where it is not 7.

Hope this helps! 😄

axis=0 refers to rows (the index labels)
axis=1 refers to columns

In [22]:
# Drop examples. Each call returns a new frame and leaves df untouched.
# Every assignment overwrites the previous one, so only the last result
# is displayed at the end of the cell.
# Single row, by index label
shortened_df = df.drop(0, axis=0)
# Multiple rows, by index label
shortened_df = df.drop([0, 1, 2], axis=0)
# Single column
shortened_df = df.drop('fnlwgt', axis=1)
# Multiple columns
shortened_df = df.drop(['fnlwgt', 'capital-gain', 'capital-loss'], axis=1)
# Rows first, then columns
rows_removed = df.drop([0, 1, 3], axis=0)
shortened_df = rows_removed.drop(['education', 'marital-status'], axis=1)
shortened_df

Unnamed: 0,age,workclass,fnlwgt,educational-num,occupation,relationship,race,gender,capital-gain,capital-loss,Working Hours,Country,income
2,28,Local-gov,336951,12,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
5,34,Private,198693,6,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
7,63,Self-emp-not-inc,104626,15,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
8,24,Private,369667,10,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,104996,4,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,12,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,9,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,9,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,9,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


### **Handling Duplicates**
Identifying and removing duplicate records is crucial for maintaining data quality. Pandas provides .duplicated() and .drop_duplicates() for finding and removing duplicates, ensuring each data point is unique for accurate analysis.

In [23]:
# Flag rows that repeat an earlier row, then count the flags
duplicate_flags = df.duplicated()
duplicate_flags.sum()

np.int64(47)

In [24]:
# Remove repeated rows in place, keeping the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,Working Hours,Country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [25]:
# Re-check after the drop: count the duplicated rows still present
remaining_duplicates = df.duplicated()
remaining_duplicates.sum()

np.int64(0)