In [4]:
import numpy as np
import preprocessing_census
import evaluate_model
import run_model
import time
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression

In [2]:
train, test = preprocessing_census.open_datasets()

missing_values = preprocessing_census.get_missing_data(train)
empty_cols = missing_values[missing_values["Percent"] >= 51].index

df, target, categorical_columns, numeric_columns = preprocessing_census.feature_transform(train, empty_cols)

Categorical columns:
 ['class_of_worker', 'education', 'marital_stat', 'major_industry_code', 'major_occupation_code', 'race', 'sex', 'full_or_part_time_employment_stat', 'tax_filer_status', 'detailed_household_summary_in_household', 'migration_code-change_in_msa', 'migration_code-change_in_reg', 'migration_code-move_within_reg', 'live_in_this_house_1_year_ago', 'citizenship', 'sexCat']

Numeric columns:
 ['age', 'detailed_industry_recode', 'detailed_occupation_recode', 'wage_per_hour', 'capital_gains', 'capital_losses', 'dividends_from_stocks', 'instance_weight', 'num_persons_worked_for_employer', 'own_business_or_self_employed', 'veterans_benefits', 'weeks_worked_in_year', 'year', 'ageCat', 'hispanicCat', 'unemployment', 'household_Frequency']


In [5]:
clr = LogisticRegression()

clr_param_grid = {'penalty' : ['l1', 'l2'],
                  'C' : np.logspace(-4, 4, 20),
                  'solver' : ['liblinear']}

In [6]:
start = time.time()
results = run_model.fit_model(clr, df, target, numeric_columns, categorical_columns, clr_param_grid)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] C=0.0001, penalty=l1, solver=liblinear ..........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=0.0001, penalty=l1, solver=liblinear, score=0.938, total=   1.2s
[CV] C=0.0001, penalty=l1, solver=liblinear ..........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV]  C=0.0001, penalty=l1, solver=liblinear, score=0.938, total=   1.6s
[CV] C=0.0001, penalty=l1, solver=liblinear ..........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.8s remaining:    0.0s


[CV]  C=0.0001, penalty=l1, solver=liblinear, score=0.938, total= 3.3min
[CV] C=0.0001, penalty=l1, solver=liblinear ..........................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.3min remaining:    0.0s


[CV]  C=0.0001, penalty=l1, solver=liblinear, score=0.938, total=   1.6s
[CV] C=0.0001, penalty=l1, solver=liblinear ..........................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.4min remaining:    0.0s


[CV]  C=0.0001, penalty=l1, solver=liblinear, score=0.938, total=   1.4s
[CV] C=0.0001, penalty=l2, solver=liblinear ..........................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.4min remaining:    0.0s


[CV]  C=0.0001, penalty=l2, solver=liblinear, score=0.938, total=   1.2s
[CV] C=0.0001, penalty=l2, solver=liblinear ..........................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.4min remaining:    0.0s


[CV]  C=0.0001, penalty=l2, solver=liblinear, score=0.938, total=   1.2s
[CV] C=0.0001, penalty=l2, solver=liblinear ..........................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.4min remaining:    0.0s


[CV]  C=0.0001, penalty=l2, solver=liblinear, score=0.938, total=   1.3s
[CV] C=0.0001, penalty=l2, solver=liblinear ..........................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.5min remaining:    0.0s


[CV]  C=0.0001, penalty=l2, solver=liblinear, score=0.938, total=   1.2s
[CV] C=0.0001, penalty=l2, solver=liblinear ..........................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  3.5min remaining:    0.0s


[CV]  C=0.0001, penalty=l2, solver=liblinear, score=0.938, total=   1.2s
[CV] C=0.00026366508987303583, penalty=l1, solver=liblinear ..........
[CV]  C=0.00026366508987303583, penalty=l1, solver=liblinear, score=0.938, total=   1.6s
[CV] C=0.00026366508987303583, penalty=l1, solver=liblinear ..........
[CV]  C=0.00026366508987303583, penalty=l1, solver=liblinear, score=0.938, total=   1.6s
[CV] C=0.00026366508987303583, penalty=l1, solver=liblinear ..........
[CV]  C=0.00026366508987303583, penalty=l1, solver=liblinear, score=0.938, total=   1.6s
[CV] C=0.00026366508987303583, penalty=l1, solver=liblinear ..........
[CV]  C=0.00026366508987303583, penalty=l1, solver=liblinear, score=0.938, total=   1.6s
[CV] C=0.00026366508987303583, penalty=l1, solver=liblinear ..........
[CV]  C=0.00026366508987303583, penalty=l1, solver=liblinear, score=0.938, total=   1.5s
[CV] C=0.00026366508987303583, penalty=l2, solver=liblinear ..........
[CV]  C=0.00026366508987303583, penalty=l2, solver=libli

[CV]  C=0.03359818286283781, penalty=l1, solver=liblinear, score=0.949, total=  30.9s
[CV] C=0.03359818286283781, penalty=l1, solver=liblinear .............
[CV]  C=0.03359818286283781, penalty=l1, solver=liblinear, score=0.952, total=  33.4s
[CV] C=0.03359818286283781, penalty=l1, solver=liblinear .............
[CV]  C=0.03359818286283781, penalty=l1, solver=liblinear, score=0.950, total=  32.2s
[CV] C=0.03359818286283781, penalty=l1, solver=liblinear .............
[CV]  C=0.03359818286283781, penalty=l1, solver=liblinear, score=0.950, total=  29.9s
[CV] C=0.03359818286283781, penalty=l2, solver=liblinear .............
[CV]  C=0.03359818286283781, penalty=l2, solver=liblinear, score=0.950, total=   2.1s
[CV] C=0.03359818286283781, penalty=l2, solver=liblinear .............
[CV]  C=0.03359818286283781, penalty=l2, solver=liblinear, score=0.949, total=   2.1s
[CV] C=0.03359818286283781, penalty=l2, solver=liblinear .............
[CV]  C=0.03359818286283781, penalty=l2, solver=liblinear,

[CV]  C=4.281332398719396, penalty=l1, solver=liblinear, score=0.951, total= 1.0min
[CV] C=4.281332398719396, penalty=l2, solver=liblinear ...............
[CV]  C=4.281332398719396, penalty=l2, solver=liblinear, score=0.952, total=   5.3s
[CV] C=4.281332398719396, penalty=l2, solver=liblinear ...............
[CV]  C=4.281332398719396, penalty=l2, solver=liblinear, score=0.950, total=   5.1s
[CV] C=4.281332398719396, penalty=l2, solver=liblinear ...............
[CV]  C=4.281332398719396, penalty=l2, solver=liblinear, score=0.954, total=   4.9s
[CV] C=4.281332398719396, penalty=l2, solver=liblinear ...............
[CV]  C=4.281332398719396, penalty=l2, solver=liblinear, score=0.952, total=   5.2s
[CV] C=4.281332398719396, penalty=l2, solver=liblinear ...............
[CV]  C=4.281332398719396, penalty=l2, solver=liblinear, score=0.951, total=   5.0s
[CV] C=11.288378916846883, penalty=l1, solver=liblinear ..............
[CV]  C=11.288378916846883, penalty=l1, solver=liblinear, score=0.953,

[CV]  C=545.5594781168514, penalty=l2, solver=liblinear, score=0.954, total=   7.8s
[CV] C=545.5594781168514, penalty=l2, solver=liblinear ...............
[CV]  C=545.5594781168514, penalty=l2, solver=liblinear, score=0.952, total=   5.3s
[CV] C=545.5594781168514, penalty=l2, solver=liblinear ...............
[CV]  C=545.5594781168514, penalty=l2, solver=liblinear, score=0.951, total=   5.9s
[CV] C=1438.44988828766, penalty=l1, solver=liblinear ................
[CV]  C=1438.44988828766, penalty=l1, solver=liblinear, score=0.953, total=  21.0s
[CV] C=1438.44988828766, penalty=l1, solver=liblinear ................
[CV]  C=1438.44988828766, penalty=l1, solver=liblinear, score=0.950, total=  27.0s
[CV] C=1438.44988828766, penalty=l1, solver=liblinear ................
[CV]  C=1438.44988828766, penalty=l1, solver=liblinear, score=0.954, total=  36.6s
[CV] C=1438.44988828766, penalty=l1, solver=liblinear ................
[CV]  C=1438.44988828766, penalty=l1, solver=liblinear, score=0.952, tota

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed: 57.0min finished


In [7]:
import make_prediction

In [8]:
make_prediction.get_model_performances(results, test, empty_cols)

Model parameters:
{'memory': None, 'steps': [('columntransformer', ColumnTransformer(transformers=[('pipeline-1',
                                 Pipeline(steps=[('iterativeimputer',
                                                  IterativeImputer(max_iter=30,
                                                                   random_state=42)),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 ['age', 'detailed_industry_recode',
                                  'detailed_occupation_recode', 'wage_per_hour',
                                  'capital_gains', 'capital_losses',
                                  'dividends_from_stocks', 'instance_weight',
                                  'num_persons_worked_for_employer'...
                                 ['class_of_worker', 'education',
                                  'marital_stat', 'major_industry_code',
        

Categorical columns:
 ['class_of_worker', 'education', 'marital_stat', 'major_industry_code', 'major_occupation_code', 'race', 'sex', 'full_or_part_time_employment_stat', 'tax_filer_status', 'detailed_household_summary_in_household', 'migration_code-change_in_msa', 'migration_code-change_in_reg', 'migration_code-move_within_reg', 'live_in_this_house_1_year_ago', 'citizenship', 'sexCat']

Numeric columns:
 ['age', 'detailed_industry_recode', 'detailed_occupation_recode', 'wage_per_hour', 'capital_gains', 'capital_losses', 'dividends_from_stocks', 'instance_weight', 'num_persons_worked_for_employer', 'own_business_or_self_employed', 'veterans_benefits', 'weeks_worked_in_year', 'year', 'ageCat', 'hispanicCat', 'unemployment', 'household_Frequency']
Output length : 99762
[[91513  2063]
 [ 3020  3166]]
accuracy: 0.9490487359916602
sensitivity: 0.6054694970357621
specificity: 0.9680534839685613
Best estimator : LogisticRegression(C=78.47599703514607, penalty='l1', solver='liblinear')
Best sc