In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("adult.csv")
df.columns = df.columns.str.replace("." , "_" )
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [3]:
df['workclass'] = df['workclass'].replace('?', 'Unknown')
df["occupation"] = df["occupation"].replace('?', 'Unknown')

df = df.loc[~df["workclass"].isin(["Never-worked", "Without-pay"])]

df["workclass"] = df["workclass"].replace({
    "Local-gov":"Goverment",
    "State-gov" : "Goverment",
    "Federal-gov" : "Goverment"
})

df.drop(columns="fnlwgt" , inplace=True)

df.loc[(df['education_num'] <= 8) | (df['education_num'] == 'Preschool'), 'education_num'] = 8
df.drop(columns="education" , inplace=True)

df["marital_status"] = df["marital_status"].replace({
    "Married-AF-spouse" : "Married-civ-spouse",
    "Married-spouse-absent":"Separated",
})

df["occupation"] = df["occupation"].replace({
    "Protective-serv":"Protective/Armed",
    "Armed-Forces" : "Protective/Armed",
    "Handlers-cleaners" : "Other-service",
    "Priv-house-serv":"Other-service"
})

df["relationship"] = df["relationship"].replace({
    "Not-in-family" : "Single",
    "Own-child" : "Single"
})

df["race"] = df["race"].replace("Amer-Indian-Eskimo" ,"Other")

df.drop(columns="native_country" , inplace=True)

df["sex"] = df["sex"].replace({
    "Male":1,
    "Female":0
})

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit

In [5]:
sss = StratifiedShuffleSplit(n_splits=10 , random_state=42 , test_size=0.2)
Stratifiedlist = []
for train_index , test_index in sss.split(df , df["income"]):
    sratified_train = df.iloc[train_index]
    sratified_test = df.iloc[test_index]

    Stratifiedlist.append([sratified_train ,sratified_test ])

train , test = Stratifiedlist[0]

In [6]:
X_train = train.drop(columns=["income"])
y_train = train["income"]

X_test = test.drop(columns=["income"])
y_test = test["income"]


In [7]:
y_train = y_train.apply(lambda x: 1 if x == '>50K' else 0)
y_test = y_test.apply(lambda x: 1 if x == '>50K' else 0)

In [8]:
from sklearn.preprocessing import OneHotEncoder , StandardScaler
from sklearn.compose import ColumnTransformer

In [9]:
numeric = ["age" , "education_num" , "capital_gain","capital_loss","hours_per_week"]
categorical = ["workclass" , "marital_status" , "occupation" ,"relationship" , "race"]

In [10]:
preprocessor = ColumnTransformer(transformers=[
    ("num" , StandardScaler() , numeric),
    ("cat" , OneHotEncoder() , categorical),
])

X_train_t= preprocessor.fit_transform(X_train).toarray()
X_test_t= preprocessor.transform(X_test).toarray()

X_train = pd.DataFrame(X_train_t, columns=preprocessor.get_feature_names_out(), index=X_train.index)
X_test = pd.DataFrame(X_test_t, columns=preprocessor.get_feature_names_out(), index=X_test.index)

In [20]:
from sklearn.metrics import f1_score , recall_score , precision_score , ConfusionMatrixDisplay , accuracy_score

In [12]:
from sklearn.neighbors import KNeighborsClassifier

In [13]:
knn = KNeighborsClassifier(n_neighbors=11)

In [14]:
knn.fit(X_train, y_train)

In [15]:
y_pred = knn.predict(X_test)

In [16]:
knn.score(X_test, y_test)

0.8401966810079902

In [21]:
accuracy_score(y_test , y_pred)

0.8401966810079902

In [17]:
f1_score(y_test , y_pred)

0.640635798203179

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
rfr = RandomForestClassifier(n_estimators=500 , max_depth=15)

In [41]:
rfr.fit(X_train , y_train)

In [42]:
rfr_pred = rfr.predict(X_test)

In [43]:
print("Score :" ,rfr.score(X_test, y_test))
print("Accuracy :" , accuracy_score(y_test , rfr_pred))
print("F1_Score : " , f1_score(y_test , rfr_pred))

Score : 0.8586355255070682
Accuracy : 0.8586355255070682
F1_Score :  0.6612665684830633


In [44]:
from xgboost import XGBClassifier

In [111]:
xgb_model = XGBClassifier(
    n_estimators=2000,
    max_depth=8,
    gamma=2,
    subsample=0.7,
    learning_rate=0.01,          # کمک به کاهش overfitting
    colsample_bytree=0.85,        # هر درخت فقط از 80٪ ویژگی‌ها استفاده کنه → مدل generalize بهتر
    random_state=42,
    n_jobs=-1                    # استفاده از همه CPU ها
)

In [112]:
xgb_model.fit(
    X_train, y_train,
)

In [113]:
xgb_pred = xgb_model.predict(X_test)

In [114]:
print("Accuracy :" , accuracy_score(y_test , xgb_pred))
print("F1_Score : " , f1_score(y_test , xgb_pred))

Accuracy : 0.8684695759065765
F1_Score :  0.7019498607242339


In [88]:
from sklearn.model_selection import GridSearchCV

In [89]:
param_grid = {"max_depth": [5,6,7,8,9,10],
              "n_estimators=2000":[1000,1500 , 1700 , 1900 , 2200],
              "gamma":[0,0.5,1,2],
              "subsample":[0.6 , 0.7,0,8,0,9],
              "colsample_bytree":[0.6 , 0.7,0,8,0,9]
              }

In [90]:
gs = GridSearchCV(XGBClassifier(), param_grid ,scoring="f1" , cv=5 )  

In [91]:
gs.fit(X_train[:5000] , y_train[:5000] )

12000 fits failed out of a total of 21600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2400 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "d:\Anaconda\Lib\site-packages\xgboost\sklearn.py", line 1683, in fit
    self._Booster = train(
                    ~~~~~^
        params,
        ^^^^^^^
    ...<9 lines>...
        callbacks=self.callbacks,
        ^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  Fil

In [94]:
gs.best_params_

{'colsample_bytree': 0.6,
 'gamma': 2,
 'max_depth': 5,
 'n_estimators=2000': 1000,
 'subsample': 0.7}