Loading the Dataset

In [93]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

In [94]:
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 

In [95]:
# metadata 
print(adult.metadata) 

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

In [96]:
# variable information 
print(adult.variables) 

              name     role         type      demographic  \
0              age  Feature      Integer              Age   
1        workclass  Feature  Categorical           Income   
2           fnlwgt  Feature      Integer             None   
3        education  Feature  Categorical  Education Level   
4    education-num  Feature      Integer  Education Level   
5   marital-status  Feature  Categorical            Other   
6       occupation  Feature  Categorical            Other   
7     relationship  Feature  Categorical            Other   
8             race  Feature  Categorical             Race   
9              sex  Feature       Binary              Sex   
10    capital-gain  Feature      Integer             None   
11    capital-loss  Feature      Integer             None   
12  hours-per-week  Feature      Integer             None   
13  native-country  Feature  Categorical            Other   
14          income   Target       Binary           Income   

                       

In [97]:
df = pd.concat([X, y], axis=1)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [98]:
df.to_csv("Data\dataset.csv")

Cleaning the Dataset

In [141]:
df = pd.read_csv("Data\dataset.csv")

In [143]:
# Handle missing values
df.replace('?', pd.NA, inplace=True)  # Convert '?' to NaN
df.replace(' ?', pd.NA, inplace=True)
df.dropna(inplace=True)  # Drop rows with missing values

# Reset the index
df.reset_index(drop=True, inplace=True)

In [144]:
# Check if any column contains "?" values
contains_question_mark = df.apply(lambda x: x.astype(str).str.contains('\?').any())

# Print columns containing "?" values
print("Columns containing '?' values:")
print(contains_question_mark[contains_question_mark].index)


Columns containing '?' values:
Index([], dtype='object')


In [145]:
print(df)

       Unnamed: 0  age         workclass  fnlwgt  education  education-num  \
0               0   39         State-gov   77516  Bachelors             13   
1               1   50  Self-emp-not-inc   83311  Bachelors             13   
2               2   38           Private  215646    HS-grad              9   
3               3   53           Private  234721       11th              7   
4               4   28           Private  338409  Bachelors             13   
...           ...  ...               ...     ...        ...            ...   
45217       48836   33           Private  245211  Bachelors             13   
45218       48837   39           Private  215419  Bachelors             13   
45219       48839   38           Private  374983  Bachelors             13   
45220       48840   44           Private   83891  Bachelors             13   
45221       48841   35      Self-emp-inc  182148  Bachelors             13   

           marital-status         occupation   relationship  \


In [146]:
df = df.drop_duplicates()#drop duplicate rows

In [147]:
print(df)

       Unnamed: 0  age         workclass  fnlwgt  education  education-num  \
0               0   39         State-gov   77516  Bachelors             13   
1               1   50  Self-emp-not-inc   83311  Bachelors             13   
2               2   38           Private  215646    HS-grad              9   
3               3   53           Private  234721       11th              7   
4               4   28           Private  338409  Bachelors             13   
...           ...  ...               ...     ...        ...            ...   
45217       48836   33           Private  245211  Bachelors             13   
45218       48837   39           Private  215419  Bachelors             13   
45219       48839   38           Private  374983  Bachelors             13   
45220       48840   44           Private   83891  Bachelors             13   
45221       48841   35      Self-emp-inc  182148  Bachelors             13   

           marital-status         occupation   relationship  \


In [148]:
# Remove duplicate columns
df_T = df.T.drop_duplicates().T

In [149]:
# If needed, you can also reset the index after removing rows or columns
df = df.reset_index(drop=True)

In [150]:
df['income'].value_counts()

income
<=50K     22654
<=50K.    11360
>50K       7508
>50K.      3700
Name: count, dtype: int64

In [151]:
# Replace "." with an empty string in the "income" column
df['income'] = df['income'].str.replace('.', '')

# Count the values in the "income" column after removing "."
print(df['income'].value_counts())


income
<=50K    34014
>50K     11208
Name: count, dtype: int64


In [152]:
# Count the number of null values in each column
null_counts = df.isnull().sum()

# Display the number of null values in each column
print(null_counts)


Unnamed: 0        0
age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


In [153]:
# Remove the "Unnamed" column
df.drop(columns="Unnamed: 0", inplace=True)

In [154]:
print(df)

       age         workclass  fnlwgt  education  education-num  \
0       39         State-gov   77516  Bachelors             13   
1       50  Self-emp-not-inc   83311  Bachelors             13   
2       38           Private  215646    HS-grad              9   
3       53           Private  234721       11th              7   
4       28           Private  338409  Bachelors             13   
...    ...               ...     ...        ...            ...   
45217   33           Private  245211  Bachelors             13   
45218   39           Private  215419  Bachelors             13   
45219   38           Private  374983  Bachelors             13   
45220   44           Private   83891  Bachelors             13   
45221   35      Self-emp-inc  182148  Bachelors             13   

           marital-status         occupation   relationship  \
0           Never-married       Adm-clerical  Not-in-family   
1      Married-civ-spouse    Exec-managerial        Husband   
2                D

In [155]:
df.to_csv("Data\cleaned_dataset.csv")

ENCODING

In [173]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [174]:
df=pd.read_csv("Data\cleaned_dataset.csv")

In [175]:
# Split to X and y
X = df.drop(columns=['income']) #split income column
y = df['income']

In [176]:
categorical_columns = X.select_dtypes(include=['object']).columns
#Selecting String type data sets for encoding

In [177]:
new_cat=X[categorical_columns]

In [178]:
new_cat.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [179]:
# Initialize OneHotEncoder
encoder = OneHotEncoder(dtype='int')

encode_cat= encoder.fit_transform(new_cat) 
#encoding the needed categorical values

#non-categorical columns selected
X_int = X.select_dtypes(exclude=['object'])

# Convert the one-hot encoded array to a DataFrame
encoded_df = pd.DataFrame(encode_cat.toarray(), columns=encoder.get_feature_names_out(), index=new_cat.index)

# Combine the one-hot encoded DataFrame with the non-categorical columns
encoded_total_X = pd.concat([X_int, encoded_df], axis=1)

# Display the first few rows of the encoded DataFrame
print(encoded_total_X.head())

   Unnamed: 0  age  fnlwgt  education-num  capital-gain  capital-loss  \
0           0   39   77516             13          2174             0   
1           1   50   83311             13             0             0   
2           2   38  215646              9             0             0   
3           3   53  234721              7             0             0   
4           4   28  338409             13             0             0   

   hours-per-week  workclass_Federal-gov  workclass_Local-gov  \
0              40                      0                    0   
1              13                      0                    0   
2              40                      0                    0   
3              40                      0                    0   
4              40                      0                    0   

   workclass_Private  ...  native-country_Portugal  \
0                  0  ...                        0   
1                  0  ...                        0   
2       

In [180]:
# Remove the "Unnamed" column
encoded_total_X.drop(columns="Unnamed: 0", inplace=True)

In [181]:
encoded_total_X.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [182]:
# Remove the "Unnamed" column
encoded_total_X.drop(columns="education-num", inplace=True)

In [183]:
encoded_total_X.head()

Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,2174,0,40,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,0,0,13,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,0,0,40,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,0,0,40,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,0,0,40,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [184]:
# Save the encoded DataFrame to a new CSV file
encoded_total_X.to_csv("Data/encoded_dataset.csv", index=False)

Splitting the Data

In [187]:
from sklearn.model_selection import train_test_split

In [188]:
df=pd.read_csv("Data\encoded_dataset.csv")

In [189]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(encoded_total_X, y, test_size=0.2, random_state=42)

In [191]:
# Save training and testing datasets to CSV files
X_train.to_csv("Data\X_train.csv", index=False)
X_test.to_csv("Data\X_test.csv", index=False)
y_train.to_csv("Data\y_train.csv", index=False, header=True)
y_test.to_csv("Data\y_test.csv", index=False, header=True)