In [21]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, RepeatedKFold, RandomizedSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report


# To ignore warnings
import warnings
warnings.filterwarnings('ignore')


ModuleNotFoundError: No module named 'xgboost'

In [22]:
# Load the dataset
df = pd.read_csv('adult_with_headers.csv')

# Describe the dataset
def dataset_description(df_obj):
    print('Dataset shape:', df_obj.shape)
    print('Dataset columns:', df_obj.columns)
    display(df_obj.head())
    display(df_obj.info())
    display(df_obj.describe())
    print('Null values:', df_obj.isnull().sum())

dataset_description(df)


Dataset shape: (32561, 15)
Dataset columns: Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


None

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


Null values: age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [23]:
def categorical_feature_analysis(df, var_name):
    print(f'Data values by class of feature {var_name}:\n{df[var_name].value_counts(dropna=False)}')
    print(f'Percentage of data values by class of feature {var_name}:\n{df[var_name].value_counts(normalize=True, dropna=False)}')
    print(f'Unique values of feature {var_name}:\n{df[var_name].unique()}')

# Example usage
categorical_feature_analysis(df, 'workclass')


Data values by class of feature workclass:
workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64
Percentage of data values by class of feature workclass:
workclass
Private             0.697030
Self-emp-not-inc    0.078038
Local-gov           0.064279
?                   0.056386
State-gov           0.039864
Self-emp-inc        0.034274
Federal-gov         0.029483
Without-pay         0.000430
Never-worked        0.000215
Name: proportion, dtype: float64
Unique values of feature workclass:
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']


In [24]:
def continuous_feature_analysis(df, var_name):
    print(f'Distribution of data values of feature {var_name}:\n{df[var_name].value_counts(dropna=False)}')
    print(f'Percentage of data distribution of feature {var_name}:\n{df[var_name].value_counts(normalize=True, dropna=False)}')
    print(f'Total unique values of feature {var_name}: {len(df[var_name].unique())}')
    display(df[var_name].describe())

# Example usage
continuous_feature_analysis(df, 'age')


Distribution of data values of feature age:
age
36    898
31    888
34    886
23    877
35    876
     ... 
83      6
88      3
85      3
86      1
87      1
Name: count, Length: 73, dtype: int64
Percentage of data distribution of feature age:
age
36    0.027579
31    0.027272
34    0.027210
23    0.026934
35    0.026903
        ...   
83    0.000184
88    0.000092
85    0.000092
86    0.000031
87    0.000031
Name: proportion, Length: 73, dtype: float64
Total unique values of feature age: 73


count    32561.000000
mean        38.581647
std         13.640433
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: age, dtype: float64

In [25]:
def std_features_all(all_X, vars):
    std_sca = StandardScaler()
    std_sca.fit(all_X[vars])
    all_X.loc[:, vars] = std_sca.transform(all_X[vars])
    return all_X

def std_features(train_X, val_X, vars):
    std_sca = StandardScaler()
    std_sca.fit(train_X[vars])
    train_X.loc[:, vars] = std_sca.transform(train_X[vars])
    val_X.loc[:, vars] = std_sca.transform(val_X[vars])
    return train_X, val_X

def norm_features(train_X, val_X, vars):
    mm_sca = MinMaxScaler()
    mm_sca.fit(train_X[vars])
    train_X.loc[:, vars] = mm_sca.transform(train_X[vars])
    val_X.loc[:, vars] = mm_sca.transform(val_X[vars])
    return train_X, val_X

# Example usage
numeric_features = ['age', 'hours_per_week']
df = std_features_all(df, numeric_features)


In [None]:
encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(df[['feature2']])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['feature2']))

In [26]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [27]:
# Example: Logistic Regression
model = LogisticRegression()
model.fit(train_df.drop('income', axis=1), train_df['income'])
predictions = model.predict(test_df.drop('income', axis=1))

# Evaluate the model
print('Accuracy:', accuracy_score(test_df['income'], predictions))
print('Recall:', recall_score(test_df['income'], predictions))
print('Precision:', precision_score(test_df['income'], predictions))
print('F1 Score:', f1_score(test_df['income'], predictions))
print('Confusion Matrix:\n', confusion_matrix(test_df['income'], predictions))
print('Classification Report:\n', classification_report(test_df['income'], predictions))


ValueError: could not convert string to float: ' Local-gov'