In [2]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import pydotplus
import optuna
import dtale

# Load the two feature tables and the label file from the data/ directory.
data_file = 'data/traindata.csv'
lable_file = 'data/trainlabel.txt'
judge_file = 'data/testdata.csv'
data_df = pd.read_csv(data_file)
judge_df = pd.read_csv(judge_file)

# One label per line: a line whose stripped text is exactly '1' maps to 1,
# every other line maps to 0.
with open(lable_file) as f:
    lable = [int(line.strip() == '1') for line in f]

# Attach the labels as a 'label' column; DataFrame.join aligns on the row index.
data_df = data_df.join(pd.DataFrame({'label': lable}))

# data cleansing
# mask = data_df.isin(['?'])
# unkown = mask.any(axis=1)
# data_df.drop(data_df[unkown].index, inplace=True)
# data_df = data_df.reset_index(drop=True)
# lable = [lable[i] for i in range(len(lable)) if not unkown[i]]
# weight = data_df['fnlwgt'].to_list()[:20001]
# data_df.drop('fnlwgt', axis=1, inplace=True)
# data_df = pd.get_dummies(data_df)
# features = data_df.columns.tolist()

# dtale.show(data_df, open_browser=True)



In [138]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on one-hot encoded features. 'sex', 'native.country' and
# 'race' are dropped before encoding.
dummies = pd.get_dummies(data_df.drop(columns=['sex', 'native.country', 'race']),
                         columns=['workclass', 'education', 'marital.status', 'occupation', 'relationship'])
# The first 20000 rows train the model, the remaining rows evaluate it.
# The two slices must not overlap, otherwise an evaluation row is also
# seen during training.
train_df = dummies[:20000]
test_df = dummies[20000:]

rf = RandomForestClassifier(n_estimators=114, max_depth=68, min_samples_split=67, max_leaf_nodes=424)
# 'fnlwgt' is excluded from the features and used as the per-row sample weight.
rf.fit(train_df.drop(columns=['fnlwgt', 'label']), train_df['label'], sample_weight=train_df['fnlwgt'])
predictions = rf.predict(test_df.drop(columns=['fnlwgt', 'label']))
# accuracy_score expects (y_true, y_pred).
print(metrics.accuracy_score(test_df['label'], predictions))

# estimator = rf.estimators_[0]
# dot_data = export_graphviz(estimator,
#                 out_file= None,
#                 feature_names = features,
#                 class_names= ['True', 'False'],
#                 rounded = True,
#                 proportion = False,
#                 precision = 2,
#                 filled = True)
# graph = pydotplus.graph_from_dot_data(dot_data)
# graph.write_pdf('tree.pdf')

0.8592406876790831


In [136]:
def objective(trial:optuna.Trial):
    """Optuna objective: accuracy of a sample-weighted random forest.

    Draws 'depth', 'split' and 'leaves' from the trial, fits a 44-tree
    RandomForestClassifier on the notebook-level `train_df` (rows weighted
    by 'fnlwgt') and returns its accuracy on the notebook-level `test_df`.

    NOTE(review): the score is computed on `test_df`, so the selected
    hyper-parameters are tuned against the same rows used for evaluation.
    """
    params = {
        'max_depth': trial.suggest_int('depth', 7, 86),
        'min_samples_split': trial.suggest_int('split', 49, 2333),
        'max_leaf_nodes': trial.suggest_int('leaves', 360, 4090),
    }
    # Parameter sets recorded by the author (presumably earlier best
    # results -- TODO confirm):
    # {'depth': 41, 'split': 55, 'leaves': 417}
    # {'depth': 55, 'split': 14, 'leaves': 737}
    # {'depth': 49, 'split': 107, 'leaves': 465}
    forest = RandomForestClassifier(n_estimators=44, **params)
    non_features = ['fnlwgt', 'label']
    forest.fit(train_df.drop(columns=non_features), train_df['label'], train_df['fnlwgt'])
    predicted = forest.predict(test_df.drop(columns=non_features))
    return metrics.accuracy_score(predicted, test_df['label'])

# Maximise `objective` over 233 trials (n_jobs=-1 runs trials in parallel),
# then display the best parameter set together with its score.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_jobs=-1, n_trials=233)
(study.best_params, study.best_value)

[I 2023-06-09 00:13:55,557] A new study created in memory with name: no-name-ef27da31-afe2-4ffe-89d4-78ac5bc525c1
[I 2023-06-09 00:13:57,731] Trial 12 finished with value: 0.8363180515759312 and parameters: {'depth': 7, 'split': 2125, 'leaves': 3312}. Best is trial 12 with value: 0.8363180515759312.
[I 2023-06-09 00:13:57,751] Trial 13 finished with value: 0.836676217765043 and parameters: {'depth': 85, 'split': 2128, 'leaves': 1526}. Best is trial 13 with value: 0.836676217765043.
[I 2023-06-09 00:13:57,798] Trial 6 finished with value: 0.8459885386819485 and parameters: {'depth': 66, 'split': 1584, 'leaves': 2708}. Best is trial 6 with value: 0.8459885386819485.
[I 2023-06-09 00:13:57,855] Trial 2 finished with value: 0.836676217765043 and parameters: {'depth': 67, 'split': 1886, 'leaves': 2947}. Best is trial 6 with value: 0.8459885386819485.
[I 2023-06-09 00:13:57,886] Trial 3 finished with value: 0.8474212034383954 and parameters: {'depth': 82, 'split': 1876, 'leaves': 1323}. Best

({'depth': 45, 'split': 76, 'leaves': 2283}, 0.8603151862464183)

In [133]:
# Train one random forest per value of 'sex' instead of using 'sex' as a
# feature, then score all per-group predictions together.
dummies = pd.get_dummies(data_df.drop(columns=['native.country']),
                         columns=['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race'])
# The first 20000 rows train the models, the remaining rows evaluate them.
# The two slices must not overlap, otherwise an evaluation row is also
# seen during training.
train_df = dummies[:20000]
test_df = dummies[20000:]

group = train_df.groupby('sex')
classifiers = {}
for sex, df in group:
    classifiers[sex] = RandomForestClassifier(n_estimators=98, max_depth=21, min_samples_split=219, max_leaf_nodes=1039)
    # 'fnlwgt' is excluded from the features and used as the per-row sample weight.
    classifiers[sex].fit(df.drop(columns=['sex', 'fnlwgt', 'label']), df['label'], sample_weight=df['fnlwgt'])

group = test_df.groupby('sex')
predictions_all, labels_all = [], []
for sex, df in group:
    # Each evaluation group is scored by the forest trained on the same 'sex' value.
    predictions = classifiers[sex].predict(df.drop(columns=['sex', 'fnlwgt', 'label']))
    predictions_all += list(predictions)
    labels_all += df['label'].tolist()

# accuracy_score expects (y_true, y_pred).
print(metrics.accuracy_score(labels_all, predictions_all))

0.8542263610315186


In [3]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# Compare the five scikit-learn naive Bayes variants on a reduced feature set.
dummies = pd.get_dummies(data_df.drop(columns=['sex', 'native.country', 'race', 'marital.status', 'education']),
                         columns=['workclass', 'occupation', 'relationship'])
# The first 20000 rows train the models, the remaining rows evaluate them.
# The two slices must not overlap, otherwise an evaluation row is also
# seen during training.
train_df = dummies[:20000]
test_df = dummies[20000:]

# 'fnlwgt' is excluded from the features and used as the per-row sample weight.
train_features = train_df.drop(columns=['fnlwgt', 'label'])
test_features = test_df.drop(columns=['fnlwgt', 'label'])

bnb = BernoulliNB()
canb = CategoricalNB()
conb = ComplementNB()
gnb = GaussianNB()
mnb = MultinomialNB()

# Fit, predict and report accuracy for each model, one line of output per
# model in the order listed here.
for nb in (bnb, canb, conb, gnb, mnb):
    nb.fit(train_features, train_df['label'], sample_weight=train_df['fnlwgt'])
    predictions = nb.predict(test_features)
    # accuracy_score expects (y_true, y_pred).
    print(metrics.accuracy_score(test_df['label'], predictions))

0.8101719197707736
0.8517191977077364
0.7818767908309455
0.8345272206303725
0.7818767908309455


In [39]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron on all columns, with every categorical column one-hot encoded.
dummies = pd.get_dummies(data_df,
                         columns=['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country'])
# The first 20000 rows train the model, the remaining rows evaluate it.
# The two slices must not overlap, otherwise an evaluation row is also
# seen during training.
train_df = dummies[:20000]
test_df = dummies[20000:]

mlp = MLPClassifier(hidden_layer_sizes=(359,), alpha=0.00042062613323840626, batch_size=926, learning_rate_init=0.00015703156146915697, max_iter=99)
# The fit call is unweighted; 'fnlwgt' is only used to weight the accuracy below.
mlp.fit(train_df.drop(columns=['fnlwgt', 'label']), train_df['label'])
predictions = mlp.predict(test_df.drop(columns=['fnlwgt', 'label']))
# accuracy_score expects (y_true, y_pred).
print(metrics.accuracy_score(test_df['label'], predictions, sample_weight=test_df['fnlwgt']))

0.8580020469927475


In [30]:
def objective(trial:optuna.Trial):
    """Optuna objective: 'fnlwgt'-weighted accuracy of a one-hidden-layer MLP.

    Draws 'layer_size', 'alpha', 'batch_size', 'rate_init' and 'iter' from
    the trial, fits an MLPClassifier on the notebook-level `train_df` and
    returns its accuracy on the notebook-level `test_df`, weighting each
    evaluation row by its 'fnlwgt'.

    NOTE(review): the score is computed on `test_df`, so the selected
    hyper-parameters are tuned against the same rows used for evaluation.
    """
    hidden_units = trial.suggest_int('layer_size', 200, 500)
    l2_penalty = trial.suggest_float('alpha', 0.0001, 0.0007)
    minibatch = trial.suggest_int('batch_size', 100, 1000)
    initial_rate = trial.suggest_float('rate_init', 0.0001, 0.001)
    # Local name avoids shadowing the builtin `iter`; the Optuna key stays 'iter'.
    max_iterations = trial.suggest_int('iter', 80, 200)

    network = MLPClassifier(
        hidden_layer_sizes=(hidden_units,),
        alpha=l2_penalty,
        batch_size=minibatch,
        learning_rate_init=initial_rate,
        max_iter=max_iterations,
    )
    non_features = ['fnlwgt', 'label']
    network.fit(train_df.drop(columns=non_features), train_df['label'])
    predicted = network.predict(test_df.drop(columns=non_features))
    return metrics.accuracy_score(predicted, test_df['label'], sample_weight=test_df['fnlwgt'])

# Maximise `objective` over 100 trials (n_jobs=-1 runs trials in parallel),
# then display the best parameter set together with its score.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_jobs=-1, n_trials=100)
(study.best_params, study.best_value)

[I 2023-06-09 11:41:05,305] A new study created in memory with name: no-name-eab1cab8-2f4e-4281-97e7-f718b679b180
[I 2023-06-09 11:41:32,506] Trial 3 finished with value: 0.8480237549708112 and parameters: {'layer_size': 394, 'alpha': 0.00040187758828554563, 'batch_size': 519, 'rate_init': 0.0008639019815129508, 'iter': 88}. Best is trial 3 with value: 0.8480237549708112.
[I 2023-06-09 11:41:34,168] Trial 13 finished with value: 0.8330775060096345 and parameters: {'layer_size': 337, 'alpha': 0.00013080486812701192, 'batch_size': 774, 'rate_init': 0.0008298065530352417, 'iter': 94}. Best is trial 3 with value: 0.8480237549708112.
[I 2023-06-09 11:41:36,679] Trial 7 finished with value: 0.8387842409810535 and parameters: {'layer_size': 322, 'alpha': 0.00017721993551073629, 'batch_size': 596, 'rate_init': 0.0007432029424784124, 'iter': 80}. Best is trial 3 with value: 0.8480237549708112.
[I 2023-06-09 11:41:38,074] Trial 12 finished with value: 0.8444852988983728 and parameters: {'layer_s

({'layer_size': 320,
  'alpha': 0.00042062613323840626,
  'batch_size': 926,
  'rate_init': 0.00015703156146915697,
  'iter': 99},
 0.8620590022663032)

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees on all columns, with every categorical column one-hot encoded.
dummies = pd.get_dummies(data_df,
                         columns=['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country'])
# The first 20000 rows train the model, the remaining rows evaluate it.
# The two slices must not overlap, otherwise an evaluation row is also
# seen during training.
train_df = dummies[:20000]
test_df = dummies[20000:]

xgb = XGBClassifier()
# Fit on the training rows; 'fnlwgt' is excluded from the features and used
# as the per-row sample weight.
xgb.fit(train_df.drop(columns=['fnlwgt', 'label']), train_df['label'], sample_weight=train_df['fnlwgt'])
# Predict on the evaluation rows, not the training rows.
predictions = xgb.predict(test_df.drop(columns=['fnlwgt', 'label']))
# accuracy_score expects (y_true, y_pred).
print(metrics.accuracy_score(test_df['label'], predictions))