# Adult Dataset Analysis - Part 01

## Downloaded 'adult.csv' dataset, from UC Irvince Machine Learning Repository

In [2]:
# URL : https://archive.ics.uci.edu/dataset/2/adult
#!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo 

#import pandas library
import pandas as pd

# Downloading the dataset from the online source. The first two lines are given by the online source mentioned above
from ucimlrepo import fetch_ucirepo 

# fetch dataset 
Adult = fetch_ucirepo(id=2) 

#putting data in a pandas dataframe
X = Adult.data.features
y = Adult.data.targets
data=pd.concat([X,y],axis=1)


## Number of Null values in columns and report them

In [3]:
import numpy as np

#Assign the 'data' dataset as 'adult'
adult = data.copy()

# Replace the cells which has '?' with 'NaN'
adult.replace('?', np.nan, inplace=True)

#count the number of null values
null_counts = adult.isna().sum()
print("\nNull counts per column: ")
print(null_counts)

#identify and report columns with null values
report_column = []

#Report the columns with null values
columns_has_null = null_counts[null_counts > 0]
print("\nColumns with Null values: ")
print(columns_has_null)

report_column.append(columns_has_null.index.tolist())



Null counts per column: 
age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
income               0
dtype: int64

Columns with Null values: 
workclass         2799
occupation        2809
native-country     857
dtype: int64


## Let's find the columns which has wrong data

We can use Descriptive Statistics to check for extreme values, unrealistic measurements, outliers or inconsistencies. 
We can use the 'descirbe()' method for this
Also it is important to check for data types of each column. 
Numerical columns should not contain non-numerical values. 
Categorical columns may have unexpected strings. 'dtypes' attribute can be used for this.
Unique values can be found using 'nunique()' method. 
A low value of unique values in a supposedly diverse column might indicate errors or duplicates. 
For a column we can use 'unique()' method to find its unique values.


In [9]:
#Descriptive Statistics
print("\nDescriptive Statistics: ")
print(adult.describe())

#Data Types of columns
print("\nData Types of Columns: ")
print(adult.dtypes)

#Check for unique values of columns
print("\nCheck for Unique Values: ")
print(adult.nunique())

#Check for income types in the dataset
print("\nincome types in the dataset: ")
print(adult['income'].unique())
report_column.append('income')

# More than 90 hours a week and less than 15 hours a week are considered outliers
hours_outliers = adult[(adult['hours-per-week'] < 15) | (adult['hours-per-week'] > 90)]
print("\n Hours Outliers: ")
print(hours_outliers)

if not hours_outliers.empty:
    report_column.append('hours-per-week')
    
# a person should be more than 18 years to be called as an adult and maximum age 120
age_outliers = adult[(adult['age'] < 18) | (adult['age'] > 120)]
print("\n Age outliers: ")
print(age_outliers)

if not age_outliers.empty:
    report_column.append('age')
    
#Columns which has wrong data
print("\nColumns which has wrong Data: ")
print(report_column)


Descriptive Statistics: 
                age        fnlwgt  education-num  capital-gain  capital-loss  \
count  48842.000000  4.884200e+04   48842.000000  48842.000000  48842.000000   
mean      38.643585  1.896641e+05      10.078089   1079.067626     87.502314   
std       13.710510  1.056040e+05       2.570973   7452.019058    403.004552   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.175505e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.781445e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.376420e+05      12.000000      0.000000      0.000000   
max       90.000000  1.490400e+06      16.000000  99999.000000   4356.000000   

       hours-per-week  
count    48842.000000  
mean        40.422382  
std         12.391444  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  

Data Types of Columns: 
age         

## Let's find out more about the columns which contained Null values

In [10]:
# Columns which had Null Values
print("\n",columns_has_null.index)
#Nominal
print("\n", adult['workclass'].unique())
#Nominal
print("\n", adult['occupation'].unique())
#Nominal
print("\n", adult['native-country'].unique())



 Index(['workclass', 'occupation', 'native-country'], dtype='object')

 ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' nan
 'Self-emp-inc' 'Without-pay' 'Never-worked']

 ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' nan
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']

 ['United-States' 'Cuba' 'Jamaica' 'India' nan 'Mexico' 'South'
 'Puerto-Rico' 'Honduras' 'England' 'Canada' 'Germany' 'Iran'
 'Philippines' 'Italy' 'Poland' 'Columbia' 'Cambodia' 'Thailand' 'Ecuador'
 'Laos' 'Taiwan' 'Haiti' 'Portugal' 'Dominican-Republic' 'El-Salvador'
 'France' 'Guatemala' 'China' 'Japan' 'Yugoslavia' 'Peru'
 'Outlying-US(Guam-USVI-etc)' 'Scotland' 'Trinadad&Tobago' 'Greece'
 'Nicaragua' 'Vietnam' 'Hong' 'Ireland' 'Hungary' 'Holand-Netherlands']


## Let's delete all the rows contained null values and verify the function

In [17]:
# Number of null values of all columns
old_null_counts = adult.isna().sum()
print("Number of null values of all columns (OLD): \n", old_null_counts)
print('\n\n')

#Delete all rows contains null values
new_adult = adult.dropna()

# Number of null values of all columns
new_null_counts = new_adult.isna().sum()
print("Number of null values of all columns (NEW): \n", new_null_counts)

new_adult


Number of null values of all columns (OLD): 
 age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
income               0
dtype: int64



Number of null values of all columns (NEW): 
 age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K.
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


## Reindex the dataset after removing the Null values

In [19]:
new_adult = new_adult.reset_index(drop=True)
new_adult

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45217,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K.
45218,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
45219,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
45220,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


## Determining the average contribution of each "sex" and "occupation" category to the "capital gain"

In [44]:
import pandas as pd
print("\nFollowing is the Average Capital gain by Sex and Occupation")

#Used the 'Groupby' by method
average_capital_gain = new_adult['capital-gain'].groupby([new_adult['sex'], new_adult['occupation']]).mean()

#Round the average to two decimal points
print(average_capital_gain.round(2))



Following is the Average Capital gain by Sex and Occupation
sex     occupation       
Female  Adm-clerical          464.78
        Craft-repair          746.89
        Exec-managerial      1157.57
        Farming-fishing       707.76
        Handlers-cleaners     505.76
        Machine-op-inspct     142.90
        Other-service         203.84
        Priv-house-serv       202.83
        Prof-specialty       1212.47
        Protective-serv      1987.49
        Sales                 271.73
        Tech-support          523.69
        Transport-moving      488.82
Male    Adm-clerical          498.85
        Armed-Forces          521.29
        Craft-repair          705.74
        Exec-managerial      2645.21
        Farming-fishing       721.67
        Handlers-cleaners     253.70
        Machine-op-inspct     375.70
        Other-service         223.30
        Priv-house-serv        42.43
        Prof-specialty       3624.44
        Protective-serv       541.45
        Sales            

## Which country has the highest number of people with a Bachelors degree?

In [45]:
#Used the groupby method
bachelor_counts = new_adult[new_adult['education']=='Bachelors'].groupby('native-country').size()
print("Number of Bachelors Degree Holders in each country: \n\n",bachelor_counts)

#identifying the country with maximum bachelors
most_bachelors_country = bachelor_counts.idxmax()

#Identify the maximum number of bachelors
highest_count = bachelor_counts.max()

print('\nWhich country has the highest number of people with a Bachelors degree?')
print(most_bachelors_country, ' with ', highest_count, 'bachelors')

Number of Bachelors Degree Holders in each country: 

 native-country
Cambodia                         5
Canada                          36
China                           25
Columbia                         6
Cuba                            19
Dominican-Republic               6
Ecuador                          4
El-Salvador                      7
England                         33
France                          10
Germany                         49
Greece                           5
Guatemala                        3
Haiti                            5
Honduras                         2
Hong                             5
Hungary                          5
India                           36
Iran                            18
Ireland                          8
Italy                           13
Jamaica                          9
Japan                           26
Laos                             3
Mexico                          42
Nicaragua                        3
Outlying-US(Guam-USV

## A Function to return the country with the maximum average age and the minimum average age

In [46]:
#A Function to find the country with Maximum Age and Minimum Age
def avg_age_find (df):
    #average age for each country
    avg_age_country = df.groupby('native-country')['age'].mean()
    
    #find the max avg age country
    max_avg_age_country = avg_age_country.idxmax()
    max_avg_age = np.max(avg_age_country)
    
    #find the min avg age country
    min_avg_age_country = avg_age_country.idxmin()
    min_avg_age = np.min(avg_age_country)
    
    return max_avg_age_country, min_avg_age_country, max_avg_age, min_avg_age

#apply the function to the task_dataset
max_avg_age_country, min_avg_age_country, max_avg_age, min_avg_age  = avg_age_find(new_adult)

#Print the results
print("\nCountry with Maximum Average Age: ")
print('Country:',max_avg_age_country, '\nAverage Age: ', round(max_avg_age, 2))
print("\nCountry with Minimum Average Age: ")
print('Country: ',min_avg_age_country,'\nAverage Age: ', round(min_avg_age, 2))



Country with Maximum Average Age: 
Country: Hungary 
Average Age:  49.22

Country with Minimum Average Age: 
Country:  Guatemala 
Average Age:  31.76


## A function to split the task_dataset in half column-wise and swap the first half and the second half

In [47]:
print("\n task_dataset columns before split and swap")
print(new_adult.columns)

#Function for split and swap  the dataset
def split_and_swap_columns(df):
    #Number of columns in the dataset
    num_of_col = len(df.columns)
    
    # check for even number of columns
    if num_of_col % 2 == 0:  
            first_half = df.iloc[:, :num_of_col // 2]
            second_half = df.iloc[:, num_of_col // 2:]
    else:
            first_half = df.iloc[:, :num_of_col // 2 + num_of_col % 2]
            second_half = df.iloc[:, num_of_col // 2 + num_of_col % 2:]
    
    #Swap the firt and second halves
    new_dataset = pd.concat([second_half, first_half], axis =1)
    
    return new_dataset


#Check the function and its results
print("\n task_dataset columns after split and swap")
print(split_and_swap_columns(new_adult).columns)




 task_dataset columns before split and swap
Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

 task_dataset columns after split and swap
Index(['race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
       'native-country', 'income', 'age', 'workclass', 'fnlwgt', 'education',
       'education-num', 'marital-status', 'occupation', 'relationship'],
      dtype='object')


## A function that receives two numerical columns' names and compare their values for all rows. If the value of the first column is greater than the second column, the function produces True, otherwise, it produces False. The function appends an additional column to the dataset to store the results of the comparison for all rows. 

In [48]:
# Function that receives two numerical columns and compare their values for all rows
def columns_comparison(df, C1, C2):
    new_column = f"{C1}_greater_than_{C2}"
    df[new_column] = df[C1] > df[C2]   
    return df

#Apply the function to the 'age' and 'hours-per-week' columns
#new column name: age_greater_than_hours-per-week
results_df = columns_comparison(new_adult, 'age','hours-per-week')

#Print the task_dataset
#print(results_df)
results_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,age_greater_than_hours-per-week
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,False
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,True
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,False
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,True
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45217,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K.,False
45218,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.,True
45219,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.,False
45220,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.,True


## Instead of deleting the values, we can use the 'mode' values of the columns to replace the 'Null' values

The dataset is imported again as adult2, followed by the identification of categorical columns with null values using the isna().any() method. Subsequently, relevant columns containing null values are filtered and their names stored for further processing. 

Next, mode values, representing the most frequent values for each selected column, are computed using the mode() method. These mode values are utilized to fill null values in the selected columns, ensuring completeness of the dataset. 

Finally, the code verifies the absence of null values post-filling and prints out the mode values for reference. Overall, this approach effectively handles missing values in categorical columns, facilitating subsequent analysis or processing of the dataset.


In [21]:
#import the dataset again
adult2 = adult.copy()

#mode of the categorical columns which contains null values
categorical_columns_with_null = adult2.select_dtypes(include=['object'])

#Create a 'Boolean' column with 'True' and 'False'
categorical_columns_with_null = adult2.isna().any()

print(categorical_columns_with_null)

# Assuming categorical_columns_with_null is a Series with column names as index and True/False as values
selected_columns = categorical_columns_with_null[categorical_columns_with_null]

#Print the selected columns
print(selected_columns)

#Get the names of the selected columns
selected_columns_names = selected_columns.index.tolist()

#Select only the relevant columns from the 'adult' dataset
selected_columns_data = adult2[selected_columns_names]

#Find the mode values for each selected column
mode_values = selected_columns_data.mode().iloc[0]

print("\nMode values of the categorical columns which contain Null values: ")
print(mode_values)

# Fill Null values with mode values for the selected columns
adult2[selected_columns_names] = adult2[selected_columns_names].fillna(mode_values)

#verify
print("\n", adult2.isna().any())
print("\n", adult2.mode())

adult2

age               False
workclass          True
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation         True
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country     True
income            False
dtype: bool
workclass         True
occupation        True
native-country    True
dtype: bool

Mode values of the categorical columns which contain Null values: 
workclass                Private
occupation        Prof-specialty
native-country     United-States
Name: 0, dtype: object

 age               False
workclass         False
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation        False
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country    False
income            F

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,Private,321403,HS-grad,9,Widowed,Prof-specialty,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


## I still observe that the 'income' column has wrong data. Let's correct it

In [22]:
print("The income column and its unique values before processing")
print(adult2['income'].unique(),'\n')
income_column = adult2['income']

#replace the wrong values with correct values
adult2['income'] = income_column.replace({'<=50K.':'<=50K','>50K.' :'>50K' })
print("The income column and its unique values after processing")
print(adult2['income'].unique())


The income column and its unique values before processing
['<=50K' '>50K' '<=50K.' '>50K.'] 

The income column and its unique values after processing
['<=50K' '>50K']


## Encoding the columns and ready the dataset for Normalization

Firstly, the code employs a LabelEncoder to encode the 'income' column, transforming categorical income values into numerical representations. This ensures compatibility with machine learning algorithms that require numerical input features for prediction tasks. Furthermore, the code selects an ordinal column, 'education_level', and encodes it using the LabelEncoder, appropriately preserving the ordinal nature of the data. Additionally, a nominal column, 'marital-status', is encoded using one-hot encoding via the pd.get_dummies() function, generating binary columns for each unique category.

The code appends the encoded data from the one-hot encoded 'marital-status' column to the original dataset, enhancing the dataset with the additional encoded features necessary for predictive modeling. Finally, the numerical columns, 'age' and 'education_level', are normalized using the MinMaxScaler, ensuring consistency in scale across all numerical features in the dataset. 

This normalization step enhances the performance and convergence of machine learning algorithms, contributing to more accurate predictive modeling. Overall, the code effectively prepares the dataset for predictive analysis by encoding categorical data and normalizing numerical features, laying the groundwork for successful machine learning tasks.

In [23]:
#import necessary libraries
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

#create a LabelEncoder
label_encoder = LabelEncoder()

#fit and transform the 'income column'
adult2['income_encoded'] = label_encoder.fit_transform(adult2['income'])

print("\nOriginal Values:")
print(adult2['income'].unique())

print("\nEncoded Values: ")
print(adult2['income_encoded'].unique())
print(adult2[['income', 'income_encoded']])

#fit and transform the 'education_level'
adult2['education_level'] = label_encoder.fit_transform(adult2['education'])

#Encoding a nominal column
encoded_adult2 = pd.get_dummies(adult2, columns=['marital-status'], prefix=['marital-status'])

#Append the encoded data to the dataset
adult2 = pd.concat([adult2, encoded_adult2], axis=1)

adult2


Original Values:
['<=50K' '>50K']

Encoded Values: 
[0 1]
      income  income_encoded
0      <=50K               0
1      <=50K               0
2      <=50K               0
3      <=50K               0
4      <=50K               0
...      ...             ...
48837  <=50K               0
48838  <=50K               0
48839  <=50K               0
48840  <=50K               0
48841   >50K               1

[48842 rows x 2 columns]


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,income,income_encoded,education_level,marital-status_Divorced,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,marital-status_Separated,marital-status_Widowed
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,<=50K,0,9,False,False,False,False,True,False,False
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,<=50K,0,9,False,False,True,False,False,False,False
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,<=50K,0,11,True,False,False,False,False,False,False
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,<=50K,0,1,False,False,True,False,False,False,False
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,<=50K,0,9,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,...,<=50K,0,9,True,False,False,False,False,False,False
48838,64,Private,321403,HS-grad,9,Widowed,Prof-specialty,Other-relative,Black,Male,...,<=50K,0,11,False,False,False,False,False,False,True
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,...,<=50K,0,9,False,False,True,False,False,False,False
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,...,<=50K,0,9,True,False,False,False,False,False,False


## Normalise the Numerical Columns of the dataset

In [24]:
#Normalizing numerical columns
#Identifying Numerical columns
numerical_columns = ['age', 'education_level']

#Check for numerical columns
if all(col in adult2.columns for col in numerical_columns):
    scaler = MinMaxScaler()
    adult2[numerical_columns] = scaler.fit_transform(adult2[numerical_columns])
    print("\nNormalization Completed")
else:
    print("\nNumerical Columns not found")


Normalization Completed


In [25]:
#Normalized dataset
#print(adult2)
adult2

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,income,income_encoded,education_level,marital-status_Divorced,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,marital-status_Separated,marital-status_Widowed
0,0.301370,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,<=50K,0,0.600000,False,False,False,False,True,False,False
1,0.452055,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,<=50K,0,0.600000,False,False,True,False,False,False,False
2,0.287671,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,<=50K,0,0.733333,True,False,False,False,False,False,False
3,0.493151,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,<=50K,0,0.066667,False,False,True,False,False,False,False
4,0.150685,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,<=50K,0,0.600000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.301370,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,...,<=50K,0,0.600000,True,False,False,False,False,False,False
48838,0.643836,Private,321403,HS-grad,9,Widowed,Prof-specialty,Other-relative,Black,Male,...,<=50K,0,0.733333,False,False,False,False,False,False,True
48839,0.287671,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,...,<=50K,0,0.600000,False,False,True,False,False,False,False
48840,0.369863,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,...,<=50K,0,0.600000,True,False,False,False,False,False,False
