In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline

In [None]:
# Load the raw dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("14-income_evaluation.csv")

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column labels exactly as read from the file, before they are overwritten with col_names.
df.columns

In [None]:
# Replacement column labels. They are assigned positionally to df.columns,
# so their order has to match the column order of the loaded file.
col_names = [
    # person
    "age",
    "workclass",
    "finalweight",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    # money and working time
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    # prediction target
    "income",
]

In [None]:
# Positional rename: the i-th entry of col_names becomes the label of the i-th column.
df.columns = col_names

In [None]:
# Confirm the new labels are in place.
df.columns

In [None]:
# Dtype and non-null count of every column.
df.info()

In [None]:
# Summary statistics of the frame.
df.describe()

In [None]:
# NaN count per column. Placeholder strings such as " ?" are not NaN at this
# point; they are only converted to NaN further down.
df.isnull().sum()

In [None]:
# Partition the column labels by dtype: object-typed columns are treated as
# categorical, every other dtype as numerical. Column order is preserved.
is_object = df.dtypes == "O"
categorical = df.columns[is_object].tolist()
numerical = df.columns[~is_object].tolist()

In [None]:
# Labels of the object-typed columns.
categorical

In [None]:
# Labels of the non-object columns.
numerical

In [None]:
# First rows of the categorical columns only.
df[categorical].head()

In [None]:
# Print the frequency table of every categorical column, one after another.
for column in categorical:
    frequencies = df[column].value_counts()
    print(frequencies)

In [None]:
# Number of records per income bracket, with one bar per sex.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=df, x="income", hue="sex", ax=ax)  # draw on the axes created above
ax.set_title("Distribution of income by gender")
plt.show()

In [None]:
# Number of records per income bracket, with one bar per race.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=df, x="income", hue="race", ax=ax)  # draw on the axes created above
# The bars are grouped by race, so the title must say so (it previously read
# "by gender", copied from the preceding figure).
ax.set_title("Distribution of income by race")
plt.show()

In [None]:
# Categorical plot of weekly working hours, coloured by income bracket.
sns.catplot(data=df, y="hours_per_week", hue="income")
plt.show()

In [None]:
# Split the records at the 40-hour mark: strictly more than 40 hours per week
# versus 40 hours or fewer.
weekly_hours = df["hours_per_week"]
over_40_hours = df.loc[weekly_hours.gt(40)]
under_40_hours = df.loc[weekly_hours.le(40)]

In [None]:
# Income bracket counts among records with more than 40 weekly hours.
over_40_hours["income"].value_counts()

In [None]:
# Income bracket counts among records with 40 weekly hours or fewer.
under_40_hours["income"].value_counts()

In [None]:
# Distinct workclass values, to spot placeholder entries.
df["workclass"].unique()

In [None]:
# Turn the " ?" placeholder (note the leading space) into a real missing value.
df["workclass"] = df["workclass"].replace({" ?": np.nan})

In [None]:
# Workclass frequencies after the placeholder was replaced.
df["workclass"].value_counts()

In [None]:
# Distinct marital_status values.
df["marital_status"].unique()

In [None]:
# Distinct occupation values, to spot placeholder entries.
df["occupation"].unique()

In [None]:
# Turn the " ?" placeholder (note the leading space) into a real missing value.
df["occupation"] = df["occupation"].replace({" ?": np.nan})

In [None]:
# Turn the " ?" placeholder (note the leading space) into a real missing value.
df["native_country"] = df["native_country"].replace({" ?": np.nan})

In [None]:
# NaN count per column, now including the former " ?" placeholders.
df.isnull().sum()

In [None]:
# Pairwise relationship grid of the frame, coloured by income bracket.
sns.pairplot(data=df, hue="income")
plt.show()

In [None]:
# Separate the prediction target from the feature columns.
y = df["income"]
X = df.drop(columns="income")

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Recompute the categorical labels from the feature frame, which no longer
# contains the target column.
categorical = X_train.columns[X_train.dtypes == "O"].tolist()

In [None]:
# Categorical columns of the training split.
X_train[categorical]

In [None]:
# Missing values per categorical column in the training split.
X_train[categorical].isnull().sum()

In [None]:
# Missing values per categorical column in the test split.
X_test[categorical].isnull().sum()

In [None]:
# Impute missing categoricals with the most frequent value of the TRAINING
# split; the test split reuses the training modes so that nothing is learned
# from test rows.
# The modes are computed once up front, and each split is rebound to the frame
# returned by fillna instead of assigning columns through a loop alias, which
# avoids chained-assignment warnings on frames produced by the split.
impute_values = {
    column: X_train[column].mode()[0]
    for column in ("workclass", "occupation", "native_country")
}
X_train = X_train.fillna(value=impute_values)  # only the listed columns are filled
X_test = X_test.fillna(value=impute_values)

In [None]:
# Verify that no missing categoricals remain in the training split.
X_train[categorical].isnull().sum()

In [None]:
# Verify that no missing categoricals remain in the test split.
X_test[categorical].isnull().sum()

In [None]:
# Encoding: target-encode native_country, then one-hot encode the remaining categorical columns

In [None]:
# First rows of the categorical training columns.
X_train[categorical].head()

In [None]:
# Number of distinct values per categorical column.
# NOTE(review): this counts over the full frame (df), not only the training split.
df[categorical].nunique()

In [None]:
# Numeric copy of the training target: 1 when the whitespace-stripped label is
# ">50K", otherwise 0.
y_train_binary = y_train.map(lambda label: int(label.strip() == ">50K"))

In [None]:
# Mean of the binary training target within each native_country value.
by_country = y_train_binary.groupby(X_train["native_country"])
target_means = by_country.mean()

In [None]:
# Target-encode native_country: each country is replaced by its training-set
# target mean. Countries without an entry in target_means fall back to the
# overall training target mean.
overall_rate = y_train_binary.mean()

X_train["native_country_encoded"] = (
    X_train["native_country"].map(target_means).fillna(overall_rate)
)
X_test["native_country_encoded"] = (
    X_test["native_country"].map(target_means).fillna(overall_rate)
)

In [None]:
# Inspect the training frame with the new encoded column.
X_train.head()

In [None]:
# The raw native_country column is superseded by native_country_encoded.
# Rebind to the frame returned by drop instead of using inplace=True, which
# avoids mutating frames that were derived from the split.
X_train = X_train.drop(columns="native_country")
X_test = X_test.drop(columns="native_country")

In [None]:
# Categorical labels as computed from the split (still lists native_country).
categorical

In [None]:
# Categorical columns that get one-hot encoded (native_country is handled
# separately through target encoding).
one_hot_categories = [
    "workclass", "education", "marital_status", "occupation",
    "relationship", "race", "sex",
]

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer 

In [None]:
# One-hot encode the listed categorical columns and pass every other column
# through unchanged. Categories unseen during fit are ignored at transform
# time, and the encoder returns a dense array.
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoder = ColumnTransformer(
    transformers=[("cat", one_hot, one_hot_categories)],
    remainder="passthrough",
)

In [None]:
# Fit the encoder on the training rows only, then apply the same fitted
# encoder to the test rows.
X_train_enc = encoder.fit_transform(X_train)
X_test_enc = encoder.transform(X_test)

In [None]:
# Output feature names of the fitted transformer, used to label the encoded arrays.
columns = encoder.get_feature_names_out()

In [None]:
# Inspect the generated feature names.
columns

In [None]:
# Wrap the encoded arrays back into DataFrames, keeping each split's original
# row index and labelling the columns with the encoder's feature names.
X_train = pd.DataFrame(X_train_enc, columns = columns, index = X_train.index)
X_test = pd.DataFrame(X_test_enc, columns = columns, index = X_test.index)

In [None]:
# Display the encoded training frame.
X_train

In [None]:
# Keep the column labels so they can be re-applied after scaling.
cols = X_train.columns

In [None]:
from sklearn.preprocessing import RobustScaler 
# Scaler with default settings; it is fitted on the training data below.
scaler = RobustScaler()

In [None]:
# Fit the scaler on the training rows only, then apply the same fitted scaling
# to the test rows. Both names are rebound to the scaler's output.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Wrap the scaled arrays back into labelled DataFrames.
# The scaler output carries no row labels, so the row index is restored from
# the matching target Series (same rows, same order as produced by the split).
# Without it the features would get a fresh 0..n-1 index while y_train / y_test
# keep the original labels, and any index-based alignment between them would
# silently pair the wrong rows.
X_train = pd.DataFrame(X_train, columns=cols, index=y_train.index)
X_test = pd.DataFrame(X_test, columns=cols, index=y_test.index)

In [None]:
# Training: fit random forest classifiers on the encoded and scaled features

In [None]:
from sklearn.ensemble import RandomForestClassifier 

In [None]:
# Baseline forest with 10 trees and a fixed seed, fitted on the training split.
rfc = RandomForestClassifier(n_estimators=10, random_state=15)
rfc.fit(X_train, y_train)

In [None]:
# Predicted income labels for the held-out test rows.
y_pred = rfc.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 
# Report accuracy, the confusion matrix and the per-class report, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, y_pred))

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Forest with 100 trees and a fixed seed: fit on the training split, predict
# the test split, then print accuracy, confusion matrix and per-class report.
rfc = RandomForestClassifier(n_estimators=100, random_state=15).fit(X_train, y_train)
y_pred = rfc.predict(X_test)
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, y_pred))

In [None]:
# Importance of every feature according to the fitted forest, most important first.
importances = pd.Series(rfc.feature_importances_, index=X_train.columns)
feature_scores = importances.sort_values(ascending=False)

In [None]:
# Full importance ranking.
feature_scores

In [None]:
# The ten lowest-scoring features (the ranking is sorted in descending order).
feature_scores.tail(10)

In [None]:
# Remove ten one-hot columns from the training features.
# NOTE(review): the names look transcribed from feature_scores.tail(10) — confirm
# they still match when the data or library versions change.
low_importance_columns = [
    "cat__education_ 12th",
    "cat__race_ Other",
    "cat__education_ 5th-6th",
    "cat__education_ 1st-4th",
    "cat__marital_status_ Married-AF-spouse",
    "cat__occupation_ Priv-house-serv",
    "cat__workclass_ Without-pay",
    "cat__education_ Preschool",
    "cat__occupation_ Armed-Forces",
    "cat__workclass_ Never-worked",
]
X_train = X_train.drop(columns=low_importance_columns)

In [None]:
# Training features after the column removal.
X_train.head()

In [None]:
# Remove the same ten one-hot columns from the test features so that both
# splits keep identical columns.
low_importance_columns = [
    "cat__education_ 12th",
    "cat__race_ Other",
    "cat__education_ 5th-6th",
    "cat__education_ 1st-4th",
    "cat__marital_status_ Married-AF-spouse",
    "cat__occupation_ Priv-house-serv",
    "cat__workclass_ Without-pay",
    "cat__education_ Preschool",
    "cat__occupation_ Armed-Forces",
    "cat__workclass_ Never-worked",
]
X_test = X_test.drop(columns=low_importance_columns)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Refit the 100-tree forest on the current (reduced) feature set, predict the
# test split, then print accuracy, confusion matrix and per-class report.
rfc = RandomForestClassifier(n_estimators=100, random_state=15).fit(X_train, y_train)
y_pred = rfc.predict(X_test)
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, y_pred))

In [None]:
# Hyperparameter tuning: randomized search over random forest settings

In [None]:
# Candidate values for the randomized search, keyed by RandomForestClassifier
# parameter name.
rfc_params = dict(
    n_estimators=[100, 200, 500, 1000],
    max_depth=[5, 8, 10, 15, None],  # None is one of the candidates
    max_features=["sqrt", "log2", 5, 6, 7, 8],
    min_samples_split=[2, 8, 15, 20],
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV 
# Base estimator for the search. Seeded with the same random_state as every
# other forest in this notebook, so repeated runs build identical trees for a
# given parameter combination.
rfc = RandomForestClassifier(random_state=15)

In [None]:
# Randomized search with 3-fold cross-validation on all available cores.
# random_state fixes which parameter combinations are sampled, so the search
# evaluates the same candidates on every run.
rscv = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=rfc_params,
    cv=3,
    n_jobs=-1,
    random_state=15,
)
rscv.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict the test split with the fitted search object, then print accuracy,
# confusion matrix and per-class report.
y_pred = rscv.predict(X_test)
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, y_pred))

In [None]:
# Parameter combination selected by the search.
rscv.best_params_