# Classification on Adult Dataset

## Preprocessing the data

In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df=pd.read_csv("adult.csv")
df_test=pd.read_csv("adult_test.csv")
df.head()
df_test['salary'].values

array([' <=50K.', ' <=50K.', ' >50K.', ..., ' <=50K.', ' <=50K.',
       ' >50K.'], dtype=object)

In [46]:
df["salary"]=df["salary"]==' >50K'
df_test["salary"]=df_test["salary"]==' >50K.'
df_test['salary'].value_counts()

False    12435
True      3846
Name: salary, dtype: int64

**!! the data is a little biased with 24720 false and only 7841 true**

In [47]:
df['workclass'].value_counts()

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

In [48]:
df_false=df[df['salary']==False].iloc[:10000,:]
df_true=df[df['salary']==True]
df_opt=pd.concat([df_true,df_false],axis=0)
df_opt = df_opt.sample(frac=1).reset_index(drop=True)
df_opt

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,58,Private,197319,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,True
1,29,Private,271466,Assoc-voc,11,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,43,United-States,False
2,32,Private,188246,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,True
3,46,Private,192360,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,35,United-States,True
4,42,Private,200187,Assoc-voc,11,Divorced,Other-service,Unmarried,White,Female,0,0,32,United-States,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17836,63,Self-emp-inc,165667,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,False
17837,25,Private,197036,10th,6,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States,False
17838,42,Self-emp-inc,123838,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1977,50,United-States,True
17839,36,Private,215392,Assoc-voc,11,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,40,United-States,True


We took 10000 rows from the false and the 7841 rows True to make it more balanced and then shuffled the rows

In [84]:
features= df.drop(["fnlwgt","education","capital-gain","capital-loss","salary"], axis=1)
features_test= df_test.drop(["fnlwgt","education","capital-gain","capital-loss","salary"], axis=1)
features.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,native-country
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba


In [85]:
goal=df.iloc[:,-1]
goal = goal.astype(int)
test_goals=df_test.iloc[:,-1]
test_goals = test_goals.astype(int)
test_goals

0        0
1        0
2        1
3        1
4        0
        ..
16276    0
16277    0
16278    0
16279    0
16280    1
Name: salary, Length: 16281, dtype: int32

In [86]:
for col in ['workclass','marital-status','occupation','relationship','race','sex','native-country']:
    features_test = pd.concat([features_test,pd.get_dummies(features_test[col], prefix=col)],axis=1)
    features = pd.concat([features,pd.get_dummies(features[col], prefix=col)],axis=1) 
    features.drop([col],axis=1, inplace=True)
    features_test.drop([col],axis=1, inplace=True)
features_test = pd.DataFrame(columns=features.columns, index=features_test.index, data=features_test) 
features_test=features_test.fillna(0)
features_test

Unnamed: 0,age,education-num,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,25,7,40,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38,9,50,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,28,12,40,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,44,10,40,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,18,10,30,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,13,36,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
16277,64,9,40,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16278,38,13,50,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
16279,44,13,40,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [87]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features, goal, test_size = 0.2, random_state = 0)

In [88]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
features_test=scaler.transform(features_test)
print('after scalling, max is %d and min is %d'%(np.max(x_train), np.min(x_train)))

after scalling, max is 161 and min is -3


## PCA

In [76]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

for n in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
    X_train=x_train
    pca = PCA(n_components=n)
    pca.fit(X_train)
    X_train = pca.transform(x_train)
    X_test  = pca.transform(x_test)
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    print("for {} componenets".format(n))
    print(classification_report(y_test, y_pred))
    print("\n----------\n")

for 1 componenets
              precision    recall  f1-score   support

           0       0.77      0.71      0.74      1980
           1       0.67      0.74      0.70      1589

    accuracy                           0.72      3569
   macro avg       0.72      0.72      0.72      3569
weighted avg       0.73      0.72      0.72      3569


----------

for 2 componenets
              precision    recall  f1-score   support

           0       0.77      0.71      0.74      1980
           1       0.67      0.74      0.70      1589

    accuracy                           0.72      3569
   macro avg       0.72      0.72      0.72      3569
weighted avg       0.73      0.72      0.72      3569


----------

for 3 componenets
              precision    recall  f1-score   support

           0       0.81      0.74      0.78      1980
           1       0.71      0.78      0.74      1589

    accuracy                           0.76      3569
   macro avg       0.76      0.76      0.76     

The Best n for PCA is 10 so:

In [89]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
pca.fit(x_train)
x_train = pca.transform(x_train)
x_test  = pca.transform(x_test)
features_test=pca.transform(features_test)

In [90]:
x_train.shape

(26048, 10)

# Grid Search

In [11]:
# Applying Grid Search to find the best model and the best parameters
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
classifier = SVC(kernel = 'rbf', random_state = 0)
parameters = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(x_train, y_train)
print("best accuracy is :" , grid_search.best_score_)
grid_search.best_params_

best accuracy is : 0.833000507118828


{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}

The best classifier is SVC rbf kernel with paramters c=1 gamma=0.1 so this what we will use

In [102]:
train_set,goal_train,test_set,goal_test=x_train,y_train,x_test,y_test
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
def test_clf(classifier, clf_name, step_size=0.01):
    classifier.fit(train_set, goal_train)
    cm = confusion_matrix(goal_test, classifier.predict(test_set))
    print("\n",clf_name,classifier.score(test_set, goal_test))
    y_pred = classifier.predict(x_test)
    print("\n",classification_report(y_test, y_pred))
from sklearn.neighbors import KNeighborsClassifier
for K in [1, 3, 5, 10, 20, 100, 150]:
    print('K=',K)
    test_clf(KNeighborsClassifier(n_neighbors=K),'KNN')

K= 1

 KNN 0.7324180442701037

               precision    recall  f1-score   support

           0       0.78      0.75      0.76      2052
           1       0.68      0.71      0.69      1517

    accuracy                           0.73      3569
   macro avg       0.73      0.73      0.73      3569
weighted avg       0.73      0.73      0.73      3569

K= 3

 KNN 0.7595965256374334

               precision    recall  f1-score   support

           0       0.80      0.77      0.79      2052
           1       0.70      0.75      0.73      1517

    accuracy                           0.76      3569
   macro avg       0.75      0.76      0.76      3569
weighted avg       0.76      0.76      0.76      3569

K= 5

 KNN 0.7685626225833567

               precision    recall  f1-score   support

           0       0.82      0.77      0.79      2052
           1       0.71      0.76      0.74      1517

    accuracy                           0.77      3569
   macro avg       0.76      0.7

Best k for KNN is 20 so:

In [71]:
train_set,goal_train,test_set,goal_test=x_train,y_train,x_test,y_test
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB,GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
def test_clf(classifier, clf_name, step_size=0.01):
    classifier.fit(train_set, goal_train)
    cm = confusion_matrix(goal_test, classifier.predict(test_set))
    print("\n",clf_name,classifier.score(test_set, goal_test))
    y_pred = classifier.predict(x_test)
    accuracies = cross_val_score(estimator = classifier, X = x_train, y = y_train, cv = 10)
    print("\n",classification_report(y_test, y_pred))
    print("\nwith 10-fold cross validation")
    print("model acc : {:.2f} (+/- {:.2f})%".format(accuracies.mean(), accuracies.std()))

for classifier ,clf_name in zip ([DecisionTreeClassifier(random_state=0),SVC(kernel ='rbf',C= 1, gamma= 0.1, random_state = 0),BernoulliNB(),GaussianNB(),SVC(kernel='poly'),KNeighborsClassifier(n_neighbors=20),LogisticRegression(random_state=0)],
                                 ['DecisionTreeClassifier','SVC_RBF','BernoulliNB','GaussianNB','SVC_POLY','KNN','LogisticRegression']):
    test_clf(classifier, clf_name, step_size=0.01)


 DecisionTreeClassifier 0.736060521154385

               precision    recall  f1-score   support

           0       0.76      0.78      0.77      2044
           1       0.70      0.68      0.69      1525

    accuracy                           0.74      3569
   macro avg       0.73      0.73      0.73      3569
weighted avg       0.74      0.74      0.74      3569


with 10-fold cross validation
model acc : 0.72 (+/- 0.01)%

 SVC_RBF 0.8024656766601289

               precision    recall  f1-score   support

           0       0.84      0.80      0.82      2044
           1       0.75      0.80      0.78      1525

    accuracy                           0.80      3569
   macro avg       0.80      0.80      0.80      3569
weighted avg       0.80      0.80      0.80      3569


with 10-fold cross validation
model acc : 0.79 (+/- 0.01)%

 BernoulliNB 0.7579153824600728

               precision    recall  f1-score   support

           0       0.77      0.82      0.80      2044
      

The Models will be in order:
<br />1)SVC rbf 0.79 (+/- 0.01)%
<br />2)KNN 0.79 (+/- 0.01)%
<br />3)LogisticRegression 0.79 (+/- 0.01)%
<br />4)SVC poly 0.78 (+/- 0.01)%
<br />5)GaussianNB 0.75 (+/- 0.01)%
<br />6)BernoulliNB 0.74 (+/- 0.02)%
<br />7)DecisionTreeClassifier 0.72 (+/- 0.01)%

# Ensemble Learning Models

In [72]:
def test_enclf(classifier, clf_name, step_size=0.01):
    classifier.fit(train_set, goal_train)
    #cm = confusion_matrix(goal_test, classifier.predict(test_set))
    print("\n",clf_name,classifier.score(test_set, goal_test))
    y_pred = classifier.predict(x_test)
    accuracies = cross_val_score(estimator = classifier, X = x_train, y = y_train, cv = 10)
    print("\n",classification_report(y_test, y_pred))
    print("\nwith 10-fold cross validation")
    print("model acc : {:.2f} (+/- {:.2f})%".format(accuracies.mean(), accuracies.std()))

In [73]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
clf1 = SVC(kernel ='rbf',C= 1, gamma= 0.1, random_state = 0)
clf2 = KNeighborsClassifier(n_neighbors=10)
clf3 = GaussianNB()

vclf_hard = VotingClassifier(estimators=[('SVC_RBF', clf1), ('KNN', clf2), ('GNB', clf3)], voting='hard')
vclf_soft = VotingClassifier(estimators=[('SVC_RBF', clf1), ('KNN', clf2), ('GNB', clf3)], voting='soft', weights=[2,2,1])
bag = BaggingClassifier(LogisticRegression(random_state=0))
forest = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=1)
adaboost = AdaBoostClassifier(n_estimators=100)
gradboost = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
xgb_clf = XGBClassifier()
for classifier ,clf_name in zip ([vclf_hard,bag,forest,adaboost,gradboost,xgb_clf],
                                 ['Voting hard','Bagging','Random forest','AdaBoost','Gradient Tree Boosting','XGBoost']):
    test_enclf(classifier, clf_name, step_size=0.01)


 Voting hard 0.8061081535444102

               precision    recall  f1-score   support

           0       0.84      0.81      0.83      2044
           1       0.76      0.80      0.78      1525

    accuracy                           0.81      3569
   macro avg       0.80      0.81      0.80      3569
weighted avg       0.81      0.81      0.81      3569


with 10-fold cross validation
model acc : 0.79 (+/- 0.01)%

 Bagging 0.7963014850098067

               precision    recall  f1-score   support

           0       0.83      0.81      0.82      2044
           1       0.75      0.78      0.77      1525

    accuracy                           0.80      3569
   macro avg       0.79      0.79      0.79      3569
weighted avg       0.80      0.80      0.80      3569


with 10-fold cross validation
model acc : 0.79 (+/- 0.01)%

 Random forest 0.791818436536845

               precision    recall  f1-score   support

           0       0.82      0.81      0.82      2044
           1   

The Models will be in order:
<br />1)Voting hard 0.79 (+/- 0.01)%
<br />2)Bagging 0.79 (+/- 0.01)%
<br />3)AdaBoost 0.79 (+/- 0.01)%
<br />4)XGBoost 0.79 (+/- 0.01)%
<br />5)Gradient Tree Boosting 0.79 (+/- 0.01)%
<br />6)Random forest 0.78 (+/- 0.01)%

## Time for the test
Let's see the performance of all the classifiers on the test set

In [77]:
train_set,goal_train,test_set,goal_test=x_train,y_train,x_test,y_test
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB,GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
def clf_testdata(classifier, clf_name, step_size=0.01):
    classifier.fit(train_set, goal_train)
    cm = confusion_matrix(test_goals, classifier.predict(features_test))
    print("\n",clf_name,classifier.score(features_test, test_goals))
    y_pred = classifier.predict(features_test)
    accuracies = cross_val_score(estimator = classifier, X = x_train, y = y_train, cv = 10)
    print("\n",classification_report(test_goals, y_pred))
    print("\naccu on training data with 10-fold cross validation was")
    print("model acc : {:.2f} (+/- {:.2f})%".format(accuracies.mean(), accuracies.std()))
for classifier ,clf_name in zip ([DecisionTreeClassifier(random_state=0),SVC(kernel ='rbf',C= 1, gamma= 0.1, random_state = 0),BernoulliNB(),GaussianNB(),SVC(kernel='poly'),KNeighborsClassifier(n_neighbors=20),LogisticRegression(random_state=0)],
                                 ['DecisionTreeClassifier','SVC_RBF','BernoulliNB','GaussianNB','SVC_POLY','KNN','LogisticRegression']):
    clf_testdata(classifier, clf_name, step_size=0.01)


 DecisionTreeClassifier 0.7498925127449174

               precision    recall  f1-score   support

           0       0.89      0.77      0.82     12435
           1       0.48      0.68      0.56      3846

    accuracy                           0.75     16281
   macro avg       0.68      0.73      0.69     16281
weighted avg       0.79      0.75      0.76     16281


accu on training data with 10-fold cross validation was
model acc : 0.72 (+/- 0.01)%

 SVC_RBF 0.7936858915300044

               precision    recall  f1-score   support

           0       0.93      0.79      0.85     12435
           1       0.54      0.80      0.65      3846

    accuracy                           0.79     16281
   macro avg       0.74      0.80      0.75     16281
weighted avg       0.84      0.79      0.81     16281


accu on training data with 10-fold cross validation was
model acc : 0.79 (+/- 0.01)%

 BernoulliNB 0.7791290461273878

               precision    recall  f1-score   support

       

# Ensemble Learning

In [75]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
clf1 = SVC(kernel ='rbf',C= 1, gamma= 0.1, random_state = 0)
clf2 = KNeighborsClassifier(n_neighbors=10)
clf3 = GaussianNB()

vclf_hard = VotingClassifier(estimators=[('SVC_RBF', clf1), ('KNN', clf2), ('GNB', clf3)], voting='hard')
vclf_soft = VotingClassifier(estimators=[('SVC_RBF', clf1), ('KNN', clf2), ('GNB', clf3)], voting='soft', weights=[2,2,1])
bag = BaggingClassifier(LogisticRegression(random_state=0))
forest = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=1)
adaboost = AdaBoostClassifier(n_estimators=100)
gradboost = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
xgb_clf = XGBClassifier()

In [76]:
for classifier ,clf_name in zip ([vclf_hard,bag,forest,adaboost,gradboost,xgb_clf],
                                 ['Voting hard','Bagging','Random forest','AdaBoost','Gradient Tree Boosting','XGBoost']):
    clf_testdata(classifier, clf_name, step_size=0.01)


 Voting hard 0.7939315766844789

               precision    recall  f1-score   support

           0       0.93      0.79      0.85     12435
           1       0.54      0.80      0.65      3846

    accuracy                           0.79     16281
   macro avg       0.74      0.80      0.75     16281
weighted avg       0.84      0.79      0.81     16281


accu on training data with 10-fold cross validation was
model acc : 0.79 (+/- 0.01)%

 Bagging 0.7981082243105461

               precision    recall  f1-score   support

           0       0.92      0.80      0.86     12435
           1       0.55      0.79      0.65      3846

    accuracy                           0.80     16281
   macro avg       0.74      0.79      0.75     16281
weighted avg       0.84      0.80      0.81     16281


accu on training data with 10-fold cross validation was
model acc : 0.79 (+/- 0.01)%

 Random forest 0.7928259934893435

               precision    recall  f1-score   support

           0    

# ANN

In [91]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import regularizers

classifier=Sequential()
classifier.add(Dense(units = 1000, activation = 'relu', input_dim = x_train.shape[1]))
classifier.add(Dense(units = 500, activation = 'relu', kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01)))
classifier.add(Dropout(0.2, seed=0))
classifier.add(Dense(units = 1, activation = 'tanh'))
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [92]:
classifier.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 1000)              11000     
_________________________________________________________________
dense_9 (Dense)              (None, 500)               500500    
_________________________________________________________________
dropout_4 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 501       
Total params: 512,001
Trainable params: 512,001
Non-trainable params: 0
_________________________________________________________________


In [93]:
classifier.fit(x_train, y_train, batch_size = 10, epochs = 10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x19e40b3a0f0>

In [22]:
y_pred = classifier.predict(x_test)
print(y_pred)
y_pred = (y_pred > 0.5)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(cm)

[[0.7684285 ]
 [0.8344282 ]
 [0.6693604 ]
 ...
 [0.40632403]
 [0.44643345]
 [0.13900912]]


Unnamed: 0,0,1
0,1374,636
1,189,1370


In [23]:
from sklearn.metrics import classification_report

print(classification_report(y_test, (y_pred > 0.5)))

              precision    recall  f1-score   support

           0       0.88      0.68      0.77      2010
           1       0.68      0.88      0.77      1559

    accuracy                           0.77      3569
   macro avg       0.78      0.78      0.77      3569
weighted avg       0.79      0.77      0.77      3569



In [34]:
y_pred = classifier.predict(features_test)
cm = confusion_matrix(test_goals, (y_pred > 0.5))
print("\n",classification_report(test_goals, (y_pred > 0.5)))


               precision    recall  f1-score   support

           0       0.94      0.67      0.79     12435
           1       0.45      0.87      0.59      3846

    accuracy                           0.72     16281
   macro avg       0.70      0.77      0.69     16281
weighted avg       0.83      0.72      0.74     16281

