In [8]:
import preprocessing_census
import evaluate_model
import run_model
import time
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [5]:
# Load the raw census splits; only `train` is used by the lines below.
train, test = preprocessing_census.open_datasets()

# Summarise missing data per column and collect the index labels whose
# "Percent" value is 51 or higher, so they can be handed to the transform.
missing_values = preprocessing_census.get_missing_data(train)
mostly_missing = missing_values["Percent"] >= 51
empty_cols = missing_values.loc[mostly_missing].index

# The transform returns the feature frame, the target, and the categorical and
# numeric column-name lists that the preprocessing pipeline consumes later.
(
    df,
    target,
    categorical_columns,
    numeric_columns,
) = preprocessing_census.feature_transform(train, empty_cols)

Categorical columns:
 ['class_of_worker', 'education', 'marital_stat', 'major_industry_code', 'major_occupation_code', 'race', 'sex', 'full_or_part_time_employment_stat', 'tax_filer_status', 'detailed_household_summary_in_household', 'migration_code-change_in_msa', 'migration_code-change_in_reg', 'migration_code-move_within_reg', 'live_in_this_house_1_year_ago', 'citizenship', 'sexCat']

Numeric columns:
 ['age', 'detailed_industry_recode', 'detailed_occupation_recode', 'wage_per_hour', 'capital_gains', 'capital_losses', 'dividends_from_stocks', 'instance_weight', 'num_persons_worked_for_employer', 'own_business_or_self_employed', 'veterans_benefits', 'weeks_worked_in_year', 'year', 'ageCat', 'hispanicCat', 'unemployment', 'household_Frequency']


In [6]:
# Base learners for the stacking ensemble, with fixed hyperparameters.
clf = RandomForestClassifier(
    n_estimators=50,
    max_features=9,
    min_samples_split=4,
    min_samples_leaf=10,
    bootstrap=False,
    n_jobs=-1,
    random_state=28,
)
xgb = XGBClassifier(
    n_estimators=40,
    max_depth=None,
    eta=0.8,
    gamma=0.8,
)
svc = SVC(gamma=1, C=1)

In [11]:
# Stack the three base classifiers; a logistic regression is the final estimator.
base_learners = [
    ('svc', svc),
    ('rf', clf),
    ('xgb', xgb),
]
meta_learner = LogisticRegression(C=1)
stc = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

In [18]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import MinMaxScaler
from category_encoders import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

In [19]:
# Numeric branch: iterative (model-based) imputation, then min-max scaling.
imputer = IterativeImputer(max_iter=30, random_state=42)
scaler = MinMaxScaler()

# Categorical branch: fill missing entries with each column's most frequent
# value, then one-hot encode.
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0 and
# raises AttributeError there; `np.nan` works on every NumPy version.
frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
onehot = OneHotEncoder()

# Route each column list through its own sub-pipeline.
preprocess = make_column_transformer(
    (make_pipeline(imputer, scaler), numeric_columns),
    (make_pipeline(frequent, onehot), categorical_columns),
)

# Full model: preprocessing followed by the stacking classifier, so both are
# fitted together whenever `pipe` is fitted.
pipe = make_pipeline(preprocess, stc)

In [None]:
# Cross-validate the whole pipeline (preprocessing + stacked model) on accuracy.
# `cv` is not passed, so scikit-learn's default splitting strategy applies.
scores = cross_val_score(
    estimator=pipe,
    X=df,
    y=target,
    scoring='accuracy',
)

In [None]:
# Print the accuracy scores returned by cross_val_score.
print(scores)