In [28]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

In [29]:
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 

In [30]:
# metadata 
print(adult.metadata) 

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

In [31]:
# variable information 
print(adult.variables) 

              name     role         type      demographic  \
0              age  Feature      Integer              Age   
1        workclass  Feature  Categorical           Income   
2           fnlwgt  Feature      Integer             None   
3        education  Feature  Categorical  Education Level   
4    education-num  Feature      Integer  Education Level   
5   marital-status  Feature  Categorical            Other   
6       occupation  Feature  Categorical            Other   
7     relationship  Feature  Categorical            Other   
8             race  Feature  Categorical             Race   
9              sex  Feature       Binary              Sex   
10    capital-gain  Feature      Integer             None   
11    capital-loss  Feature      Integer             None   
12  hours-per-week  Feature      Integer             None   
13  native-country  Feature  Categorical            Other   
14          income   Target       Binary           Income   

                       

In [33]:
df = pd.concat([X, y], axis=1)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [34]:
df.to_csv("Data\dataset.csv")

In [35]:
df = pd.read_csv("Data\dataset.csv")

In [36]:
# Handle missing values
df.replace(' ?', pd.NA, inplace=True)  # Convert '?' to NaN
df.dropna(inplace=True)  # Drop rows with missing values

In [37]:
print(df)

       Unnamed: 0  age         workclass  fnlwgt  education  education-num  \
0               0   39         State-gov   77516  Bachelors             13   
1               1   50  Self-emp-not-inc   83311  Bachelors             13   
2               2   38           Private  215646    HS-grad              9   
3               3   53           Private  234721       11th              7   
4               4   28           Private  338409  Bachelors             13   
...           ...  ...               ...     ...        ...            ...   
48836       48836   33           Private  245211  Bachelors             13   
48837       48837   39           Private  215419  Bachelors             13   
48839       48839   38           Private  374983  Bachelors             13   
48840       48840   44           Private   83891  Bachelors             13   
48841       48841   35      Self-emp-inc  182148  Bachelors             13   

           marital-status         occupation   relationship  \


In [38]:
df = df.drop_duplicates()#drop duplicate rows

In [39]:
print(df)

       Unnamed: 0  age         workclass  fnlwgt  education  education-num  \
0               0   39         State-gov   77516  Bachelors             13   
1               1   50  Self-emp-not-inc   83311  Bachelors             13   
2               2   38           Private  215646    HS-grad              9   
3               3   53           Private  234721       11th              7   
4               4   28           Private  338409  Bachelors             13   
...           ...  ...               ...     ...        ...            ...   
48836       48836   33           Private  245211  Bachelors             13   
48837       48837   39           Private  215419  Bachelors             13   
48839       48839   38           Private  374983  Bachelors             13   
48840       48840   44           Private   83891  Bachelors             13   
48841       48841   35      Self-emp-inc  182148  Bachelors             13   

           marital-status         occupation   relationship  \


In [40]:
# Remove duplicate columns
df_T = df.T.drop_duplicates().T

In [41]:
# If needed, you can also reset the index after removing rows or columns
df = df.reset_index(drop=True)

In [44]:
df['income'].value_counts()

income
<=50K     24720
<=50K.    11360
>50K       7841
>50K.      3700
Name: count, dtype: int64

In [46]:
# Replace "." with an empty string in the "income" column
df['income'] = df['income'].str.replace('.', '')

# Count the values in the "income" column after removing "."
print(df['income'].value_counts())


income
<=50K    36080
>50K     11541
Name: count, dtype: int64


In [47]:
# Count the number of null values in each column
null_counts = df.isnull().sum()

# Display the number of null values in each column
print(null_counts)


Unnamed: 0        0
age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


In [48]:
print(df)

       Unnamed: 0  age         workclass  fnlwgt  education  education-num  \
0               0   39         State-gov   77516  Bachelors             13   
1               1   50  Self-emp-not-inc   83311  Bachelors             13   
2               2   38           Private  215646    HS-grad              9   
3               3   53           Private  234721       11th              7   
4               4   28           Private  338409  Bachelors             13   
...           ...  ...               ...     ...        ...            ...   
47616       48836   33           Private  245211  Bachelors             13   
47617       48837   39           Private  215419  Bachelors             13   
47618       48839   38           Private  374983  Bachelors             13   
47619       48840   44           Private   83891  Bachelors             13   
47620       48841   35      Self-emp-inc  182148  Bachelors             13   

           marital-status         occupation   relationship  \


In [49]:
df.to_csv("Data\cleaned_dataset.csv")

ENCODING

In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [5]:
df=pd.read_csv("Data\cleaned_dataset.csv")

In [6]:
# Split to X and y
X = df.drop(columns=['income']) #split income column
y = df['income']

In [7]:
categorical_columns = X.select_dtypes(include=['object']).columns
#Selecting String type data sets for encoding

In [8]:
new_cat=X[categorical_columns]

In [9]:
new_cat.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [24]:
df_encoded = pd.get_dummies(df,)


SyntaxError: invalid syntax. Perhaps you forgot a comma? (4142013071.py, line 1)

In [21]:
print(df_encoded)

       Unnamed: 0  age  fnlwgt  education-num  capital-gain  capital-loss  \
0               0   39   77516             13          2174             0   
1               1   50   83311             13             0             0   
2               2   38  215646              9             0             0   
3               3   53  234721              7             0             0   
4               4   28  338409             13             0             0   
...           ...  ...     ...            ...           ...           ...   
47616       48836   33  245211             13             0             0   
47617       48837   39  215419             13             0             0   
47618       48839   38  374983             13             0             0   
47619       48840   44   83891             13          5455             0   
47620       48841   35  182148             13             0             0   

       hours-per-week  workclass_?  workclass_Federal-gov  \
0             

In [22]:
# Save the encoded DataFrame to a new CSV file
df_encoded.to_csv("Data/preprocessed_dataset.csv", index=False)