In [1]:
!pip install xgboost
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [25]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV

In [3]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None)
test_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test' , skiprows = 1, header = None)
col_labels = ['age', 'workclass', 'fnlwgt', 'education','education_num','marital_status', 'occupation','relationship', 'race', 'sex', 'capital_gain','capital_loss', 'hours_per_week', 'native_country', 'wage_class']
train_set.columns = col_labels
test_set.columns = col_labels

In [4]:
train_set.shape, test_set.shape

((32561, 15), (16281, 15))

In [5]:
def strip(df):
  for i in df.columns:
    if df[i].dtypes == 'O':
      df[i] = df[i].str.strip()
  return df

In [6]:
def query(q:str, df):
  cols = []
  for i in df.columns:
    if df[i].dtypes == 'O' and q in df[i].unique():
      print(f'columns containing "{q}" :',i)
      cols.append(i)
  return cols

In [7]:
def replace(l:list, df):
  print(f'replacing values of {l} with most frequent value')
  for i in l:
    print(i)
    df[i] = df[i].str.replace('?',df[i].mode()[0])
  print(f'replacing done ..')

In [9]:
def encode(df1,df2):
  enc = LabelEncoder()
  for i in df1.columns:
    if df1[i].dtypes == 'O':
      df1[i] = enc.fit_transform(df1[i])
      df2[i] = enc.transform(df2[i])
  return df1,df2

In [10]:
def scale(df1):
  scale = StandardScaler()
  df1=scale.fit_transform(df1)
  return df1

In [11]:
train_set = strip(train_set)
test_set = strip(test_set)

In [12]:
train_unwanted = query('?',train_set)
test_unwanted = query('?',test_set)

columns containing "?" : workclass
columns containing "?" : occupation
columns containing "?" : native_country
columns containing "?" : workclass
columns containing "?" : occupation
columns containing "?" : native_country


In [13]:
replace(train_unwanted, train_set)
replace(test_unwanted, test_set)

replacing values of ['workclass', 'occupation', 'native_country'] with most frequent value
workclass
occupation
native_country
replacing done ..
replacing values of ['workclass', 'occupation', 'native_country'] with most frequent value
workclass
occupation
native_country
replacing done ..


  df[i] = df[i].str.replace('?',df[i].mode()[0])


In [14]:
# from <=50k. to <=50k (removing . from end)
test_set.iloc[:,-1] = test_set.iloc[:,-1].transform(lambda x:x[:-1])

In [15]:
train_set, test_set = encode(train_set, test_set)

In [16]:
train_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,6,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,5,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,3,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,3,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,3,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [17]:
X_train = train_set.iloc[:,:-1]
y_train = train_set.iloc[:,-1]

X_test = test_set.iloc[:,:-1]
y_test = test_set.iloc[:,-1]

In [22]:
X_train = scale(X_train)

In [23]:
X_train.shape, y_train.shape

((32561, 14), (32561,))

In [24]:
train = xgb.DMatrix(X_train, label = y_train)
test = xgb.DMatrix(X_test, label = y_test)

In [26]:
param_grid = {'learning_rate':[1,0.5,0.1,0.01,0.001],
              'max_depth':[3,5,10,20],
              'n_estimators':[10,50,100,200]}

In [27]:
grid = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic'),
                    param_grid,
                    verbose=3)

In [28]:
grid.fit(X_train,y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV 1/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.859 total time=   0.3s
[CV 2/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.858 total time=   0.1s
[CV 3/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.863 total time=   0.1s
[CV 4/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.865 total time=   0.1s
[CV 5/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.866 total time=   0.1s
[CV 1/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.869 total time=   0.5s
[CV 2/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.865 total time=   0.5s
[CV 3/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.869 total time=   0.5s
[CV 4/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.873 total time=   0.5s
[CV 5/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.871 total time=   0.5s
[CV 1/5] END learn

In [30]:
grid.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}

In [32]:
best_params = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200} # bestparams after grid search operation

In [35]:
model = xgb.XGBClassifier(learning_rate= 0.1, max_depth= 5, n_estimators= 200)
model.fit(X_train, y_train)

In [36]:
y_pred = model.predict(scale(X_test))

In [37]:
y_pred

array([0, 0, 0, ..., 1, 0, 1])

In [38]:
accuracy = accuracy_score(y_test,y_pred)

In [39]:
accuracy

0.8670843314292734