## task:
### revenue prediction
### use: voting models

In [19]:
import pandas as pd
pd.set_option('display.max_columns', 50)

train0 = pd.read_csv('train.csv') 
X0_train = train0.drop('is_rich', axis=1)
y0_train = train0['is_rich']

test0 = pd.read_csv('test.csv')

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train0.drop('is_rich', axis=1), train0['is_rich'], \
                                                    random_state=81, test_size=0.3, stratify=train0['is_rich'])

## pipeline preprocessing

In [10]:
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, QuantileTransformer, StandardScaler, RobustScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

def my_rbf(X):
    Y = X.min()
    return rbf_kernel(X, [Y], gamma=0.1)

rbf = make_pipeline(
    FunctionTransformer(my_rbf, feature_names_out="one-to-one"),
)

preprocessing = make_column_transformer(
    (rbf, ['capital-gain']),
    (rbf, ['capital-loss']),
    (QuantileTransformer(output_distribution='normal'), ['age', 'fnlwgt']),
    (MinMaxScaler(), ['educational-num', 'hours-per-week']),
    
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [
        'workclass',
        'education',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'gender',
        'native-country',

        'educational-num',
        'hours-per-week',
    ]),
  
    remainder='drop')

tmp_out = preprocessing.fit_transform(X0_train)
df_out = pd.DataFrame(tmp_out, columns=preprocessing.get_feature_names_out())
df_out.head(3)

Unnamed: 0,pipeline-1__capital-gain,pipeline-2__capital-loss,quantiletransformer__age,quantiletransformer__fnlwgt,minmaxscaler__educational-num,minmaxscaler__hours-per-week,onehotencoder__workclass_?,onehotencoder__workclass_Federal-gov,onehotencoder__workclass_Local-gov,onehotencoder__workclass_Never-worked,onehotencoder__workclass_Private,onehotencoder__workclass_Self-emp-inc,onehotencoder__workclass_Self-emp-not-inc,onehotencoder__workclass_State-gov,onehotencoder__workclass_Without-pay,onehotencoder__education_10th,onehotencoder__education_11th,onehotencoder__education_12th,onehotencoder__education_1st-4th,onehotencoder__education_5th-6th,onehotencoder__education_7th-8th,onehotencoder__education_9th,onehotencoder__education_Assoc-acdm,onehotencoder__education_Assoc-voc,onehotencoder__education_Bachelors,...,onehotencoder__hours-per-week_72,onehotencoder__hours-per-week_73,onehotencoder__hours-per-week_74,onehotencoder__hours-per-week_75,onehotencoder__hours-per-week_76,onehotencoder__hours-per-week_77,onehotencoder__hours-per-week_78,onehotencoder__hours-per-week_79,onehotencoder__hours-per-week_80,onehotencoder__hours-per-week_81,onehotencoder__hours-per-week_84,onehotencoder__hours-per-week_85,onehotencoder__hours-per-week_86,onehotencoder__hours-per-week_87,onehotencoder__hours-per-week_88,onehotencoder__hours-per-week_89,onehotencoder__hours-per-week_90,onehotencoder__hours-per-week_91,onehotencoder__hours-per-week_92,onehotencoder__hours-per-week_94,onehotencoder__hours-per-week_95,onehotencoder__hours-per-week_96,onehotencoder__hours-per-week_97,onehotencoder__hours-per-week_98,onehotencoder__hours-per-week_99
0,1.0,1.0,0.248427,-0.522588,0.533333,0.377551,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,-5.199338,-1.37439,0.4,0.295918,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,-0.190587,-0.540563,0.8,0.193878,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## hyperparameters tuning

In [13]:
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

smt = SMOTE(random_state=1)

pipeline = Pipeline([
    ('preprocessor', preprocessing),
    ('smote', smt),
    ('classifier', LogisticRegression()),
])

param = {
    'classifier': [
        # KNeighborsClassifier(n_neighbors=9, weights='distance'),
        # RandomForestClassifier(random_state=1, class_weight='balanced', criterion='entropy', max_depth=14, max_features=None, min_samples_split=7),
        # LogisticRegression(random_state=1, class_weight='balanced', max_iter=1000, penalty='l1', C=0.2, solver='liblinear'),
        LogisticRegression(random_state=1, class_weight='balanced', max_iter=1000),
        # GradientBoostingClassifier(random_state=1, n_estimators=200, subsample=0.8, min_samples_split=3),
        # SVC(random_state=1, class_weight='balanced', C=15),
    ],
    # 'classifier__n_neighbors': [9],
    # 'classifier__weights': ['distance'],
    
    # 'classifier__criterion': ['entropy'],
    # 'classifier__max_depth': [14],
    # 'classifier__max_features': [None],
    # 'classifier__min_samples_split': [7],

    'classifier__penalty': ['l1'],
    'classifier__C': [0.2],
    'classifier__solver': ['liblinear'],

    # 'classifier__n_estimators': [200],
    # 'classifier__subsample': [0.8],
    # 'classifier__min_samples_split': [3],

    # 'classifier__C': [15],
}

grid_search = GridSearchCV(pipeline, param_grid=param, cv=3, scoring='f1_micro', n_jobs=-1)
model = grid_search.fit(X0_train, y0_train)
model.best_params_

{'classifier': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=1),
 'classifier__C': 0.2,
 'classifier__penalty': 'l1',
 'classifier__solver': 'liblinear'}

## pipelines for different models

In [32]:
pipe_rfc = Pipeline([
    ('preprocessor', preprocessing),
    ('smote', smt),
    ('classifier', RandomForestClassifier(
        random_state=1, criterion='entropy', max_depth=14, max_features=None, min_samples_split=7
    )),
])

model_rfc = pipe_rfc.fit(X0_train, y0_train)

In [34]:
pipe_lr = Pipeline([
    ('preprocessor', preprocessing),
    ('smote', smt),
    ('classifier', LogisticRegression(
        random_state=1, max_iter=1000, penalty='l1', C=0.2, solver='liblinear'
    )),
])

model_lr = pipe_lr.fit(X0_train, y0_train)

In [35]:
pipe_gbc = Pipeline([
    ('preprocessor', preprocessing),
    ('smote', smt),
    ('classifier', GradientBoostingClassifier(
        random_state=1, n_estimators=200, subsample=0.8, min_samples_split=3
    )),
])

model_gbc = pipe_gbc.fit(X0_train, y0_train)

In [36]:
pipe_knc = Pipeline([
    ('preprocessor', preprocessing),
    ('smote', smt),
    ('classifier', KNeighborsClassifier(
        n_neighbors=9
    )),
])

model_knc = pipe_knc.fit(X0_train, y0_train)

In [30]:
pipe_svc = Pipeline([
    ('preprocessor', preprocessing),
    ('smote', smt),
    ('classifier', SVC(
        random_state=1, C=15
    )),
])

model_svc = pipe_svc.fit(X0_train, y0_train)

### f1 metrics comparison

In [20]:
from sklearn.metrics import f1_score

In [41]:
predict_train_rfc = model_rfc.predict(X0_train)
predict_test_rfc = model_rfc.predict(test0)
f1_score(y0_train, predict_train_rfc),

(0.724380861823054,)

In [40]:
predict_train_lr = model_lr.predict(X0_train)
predict_test_lr = model_lr.predict(test0)
f1_score(y0_train, predict_train_lr),

(0.6745784418356457,)

In [39]:
predict_train_gbc = model_gbc.predict(X0_train)
predict_test_gbc = model_gbc.predict(test0)
f1_score(y0_train, predict_train_gbc),

(0.686667563567873,)

In [38]:
predict_train_knc = model_knc.predict(X0_train)
predict_test_knc = model_knc.predict(test0)
f1_score(y0_train, predict_train_knc),

(0.7033189969974842,)

In [37]:
predict_train_svc = model_svc.predict(X0_train)
predict_test_svc = model_svc.predict(test0)
f1_score(y0_train, predict_train_svc),

(0.788971768482674,)

In [None]:
(0.6541115583668776, 0.7338734615622489)
(0.6661945900014163, 0.6761661186213616)
(0.6733986435568953, 0.6922227275680592)
(0.6167975509117529, 0.7025655808590372)

## voting models

In [95]:
ss = train0[['id','is_rich']].copy()
ss['rfc'] = predict_train_rfc
ss['lr'] = predict_train_lr
ss['gbc'] = predict_train_gbc
ss['knc'] = predict_train_knc
ss['svc'] = predict_train_svc

ss['s'] = ss['rfc'] + ss['lr'] + ss['gbc'] + ss['knc'] + ss['svc']
ss['s2'] = 0
ss.loc[ss['s'] > 4, 's2'] = 1
ss

Unnamed: 0,id,is_rich,rfc,lr,gbc,knc,svc,s,s2
0,9768,0,0,0,0,0,0,0,0
1,9769,0,0,0,0,0,0,0,0
2,9770,0,0,0,0,0,0,0,0
3,9771,0,0,0,0,0,0,0,0
4,9772,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...
39069,48837,1,1,1,1,1,1,5,1
39070,48838,0,0,0,0,0,0,0,0
39071,48839,0,0,0,0,0,0,0,0
39072,48840,0,0,0,0,0,0,0,0


In [None]:
from sklearn.metrics import confusion_matrix

In [96]:
y_test_pred = ss['s2']
print(f1_score(ss['is_rich'], y_test_pred))
confusion_matrix(ss['is_rich'], y_test_pred)

0.7524490336245698


array([[27294,  2421],
       [ 2254,  7105]])

## output prepare

In [50]:
k = pd.read_csv('sample_submission.csv')
k['is_reach'] = predict_test_svc

In [51]:
k.to_csv('submit_svc2.csv', index=False)

In [92]:
q_rfc = pd.read_csv('submit_rfc2.csv')
q_lr = pd.read_csv('submit_lr2.csv')
q_gbc = pd.read_csv('submit_gbc2.csv')
q_knc = pd.read_csv('submit_knc2.csv')
q_svc = pd.read_csv('submit_svc2.csv')

In [101]:
a1 = q_rfc.merge(q_lr,on='id')
a2 = a1.merge(q_gbc,on='id')
a3 = a2.merge(q_knc,on='id', suffixes=('_z1','_z2'))
a4 = a3.merge(q_svc,on='id')
a4['s'] = a4['is_reach_x'] + a4['is_reach_y'] + a4['is_reach_z1'] + a4['is_reach_z2'] + a4['is_reach']
a4['s2'] = 0
a4.loc[a4['s'] > 2, 's2'] = 1
a4.drop(['is_reach_x','is_reach_y','is_reach_z1','is_reach_z2','is_reach','s'], axis=1, inplace=True)
a4.rename(columns={'s2': 'is_reach'}, inplace=True)
a4

Unnamed: 0,id,is_reach
0,0,1
1,1,0
2,2,1
3,3,1
4,4,0
...,...,...
9763,9763,1
9764,9764,0
9765,9765,0
9766,9766,0


In [102]:
f = pd.read_csv('sample_submission.csv')
f['is_reach'] = a4['is_reach']
f

Unnamed: 0,id,is_reach
0,0,1
1,1,0
2,2,1
3,3,1
4,4,0
...,...,...
9763,9763,1
9764,9764,0
9765,9765,0
9766,9766,0


In [103]:
f.to_csv('submit_S3.csv', index=False)