In [26]:
import pandas as pd 
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.metrics import classification_report

## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_csv('data/final_data.csv',index_col=0)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_name,marital_status,occupation,relationship,race,sex,hours-per-week,native_country,Salary Greater than 50k Or Not,NetProfit
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,1,2174
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,1,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,1,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,1,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,1,0


In [3]:
df.isna().sum()

age                               0
workclass                         0
fnlwgt                            0
education                         0
education_name                    0
marital_status                    0
occupation                        0
relationship                      0
race                              0
sex                               0
hours-per-week                    0
native_country                    0
Salary Greater than 50k Or Not    0
NetProfit                         0
dtype: int64

In [4]:
## Independent and dependent features
X = df.drop(labels=['Salary Greater than 50k Or Not'],axis=1)
Y = df[['Salary Greater than 50k Or Not']]

In [5]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_name,marital_status,occupation,relationship,race,sex,hours-per-week,native_country,NetProfit
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,2174
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,0


In [6]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include=['object','category']).columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [7]:
categorical_cols

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country'],
      dtype='object')

In [8]:
numerical_cols

Index(['age', 'fnlwgt', 'education_name', 'hours-per-week', 'NetProfit'], dtype='object')

In [9]:
for i in categorical_cols:
    print(i,"=",list(df[i].unique()))
    print()

workclass = ['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov', 'Local-gov', 'other', 'Self-emp-inc', 'Without-pay', 'Never-worked']

education = ['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', '5th-6th', '10th', '1st-4th', 'Preschool', '12th']

marital_status = ['Never-married', 'Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 'Separated', 'Married-AF-spouse', 'Widowed']

occupation = ['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners', 'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair', 'Transport-moving', 'Farming-fishing', 'Machine-op-inspct', 'Tech-support', 'other', 'Protective-serv', 'Armed-Forces', 'Priv-house-serv']

relationship = ['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried', 'Other-relative']

race = ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other']

sex = ['Male', 'Female']

native_country = ['United-States', 'Cuba', 'Jama

In [10]:
workclass = ['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov', 'Local-gov', 'other', 'Self-emp-inc', 'Without-pay', 'Never-worked']
education = ['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', '5th-6th', '10th', '1st-4th', 'Preschool', '12th']
marital_status = ['Never-married', 'Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 'Separated', 'Married-AF-spouse', 'Widowed']
occupation = ['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners', 'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair', 'Transport-moving', 'Farming-fishing', 'Machine-op-inspct', 'Tech-support', 'other', 'Protective-serv', 'Armed-Forces', 'Priv-house-serv']
relationship = ['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried', 'Other-relative']
race = ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other']
sex = ['Male', 'Female']
native_country = ['United-States', 'Cuba', 'Jamaica', 'India', 'other', 'Mexico', 'South', 'Puerto-Rico', 'Honduras', 'England', 'Canada', 'Germany', 'Iran', 'Philippines', 'Italy', 'Poland', 'Columbia', 'Cambodia', 'Thailand', 'Ecuador', 'Laos', 'Taiwan', 'Haiti', 'Portugal', 'Dominican-Republic', 'El-Salvador', 'France', 'Guatemala', 'China', 'Japan', 'Yugoslavia', 'Peru', 'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago', 'Greece', 'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary', 'Holand-Netherlands']

In [11]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('ordinalencoder',OrdinalEncoder(categories=[workclass,education,marital_status,occupation,relationship,race,sex,native_country])),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [12]:
X_train,X_test,Y_train,Y_test =train_test_split(X,Y,train_size=0.80,random_state=42)

X_train.shape

(26048, 13)

In [13]:
Y_train.shape

(26048, 1)

In [14]:
# Applying Preprocessing to our Train Dataset
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [15]:
# Our Logistic Model
logistic_model = LogisticRegression()

# Applying Grid Search Cross 
params = {'C':[0.5,1,2,4,6,7,8,9,10,45,30],'penalty':['l1','l2','elasticnet']}
logistic_cls = GridSearchCV(logistic_model,params,scoring='accuracy',cv=5,verbose=0)

In [16]:
logistic_cls.fit(X_train,Y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
110 fits failed out of a total of 165.
The score on these train-test partitions for these parameters will be 

In [17]:
logistic_cls.best_score_

0.8185657630065194

In [18]:
logistic_cls.best_params_

{'C': 0.5, 'penalty': 'l2'}

In [19]:
from sklearn.metrics import accuracy_score,precision_recall_curve,confusion_matrix,f1_score

pred_log = logistic_cls.predict(X_test)
confusion_matrix(Y_test,pred_log)

array([[ 674,  897],
       [ 291, 4651]], dtype=int64)

In [20]:
logistic_cls.score(X_train,Y_train)

0.8191799754299754

In [21]:
accuracy_score(Y_test,pred_log)


0.81759557807462

In [22]:
print(classification_report(Y_test,pred_log))


              precision    recall  f1-score   support

           0       0.70      0.43      0.53      1571
           1       0.84      0.94      0.89      4942

    accuracy                           0.82      6513
   macro avg       0.77      0.69      0.71      6513
weighted avg       0.80      0.82      0.80      6513



In [23]:
# Testing our Model 
logistic_cls.predict(X_test.head(1))[0]

1

In [None]:
from sklearn.model_selection import StratifiedKFold


In [27]:

skf = StratifiedKFold(n_splits=10)

otmodel = LogisticRegression(solver='newton-cg')

In [35]:
def training(x_train, x_test,y_train,y_test,fold_no):
  otmodel.fit(x_train, y_train)
  score = otmodel.score(x_test,y_test)
  print('For Fold {} the accuracy is {}'.format(str(fold_no),score))

In [36]:
fold_no = 1
for train_index,test_index in skf.split(X_train, Y_train):
  train = X_train.iloc[train_index,:]
  test = Y_train.iloc[test_index,:]
  training(X_train,X_test,Y_train,Y_test, fold_no)
  fold_no += 1

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


For Fold 1 the accuracy is 0.8174420389989252
For Fold 2 the accuracy is 0.8174420389989252


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


For Fold 3 the accuracy is 0.8174420389989252
For Fold 4 the accuracy is 0.8174420389989252
For Fold 5 the accuracy is 0.8174420389989252


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


For Fold 6 the accuracy is 0.8174420389989252
For Fold 7 the accuracy is 0.8174420389989252
For Fold 8 the accuracy is 0.8174420389989252


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


For Fold 9 the accuracy is 0.8174420389989252
For Fold 10 the accuracy is 0.8174420389989252


  y = column_or_1d(y, warn=True)


In [37]:
pred_another = otmodel.predict(X_test)
confusion_matrix(Y_test,pred_another)

array([[ 674,  897],
       [ 292, 4650]], dtype=int64)