In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

Read the csv with pandas

In [2]:
df = pd.read_csv('../input/immigration-madrid-2021/Immigration_Madrid_2021.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,Gender,Age,Studies,Nationality,Administrative_situation,Working
0,0,Women,51-65,College,España,Community,No
1,1,Women,51-65,College,Venezuela,ART,No
2,2,Women,18-30,Pre-college,Honduras,Refugee,No
3,3,Women,18-30,Pre-college,Venezuela,Community,No
4,4,Women,18-30,Pre-college,República Dominicana,Community_familiar,No
...,...,...,...,...,...,...,...
1518,1803,Women,18-30,Elementary_school,España,Community,No
1519,1804,Women,18-30,Elementary_school,España,Community,No
1520,1805,Women,18-30,Elementary_school,Marruecos,Community,No
1521,1806,Women,18-30,Elementary_school,Marruecos,Community,No


Our target is quite unbalanced

In [4]:
print(df['Working'].value_counts())


No     1417
Yes     106
Name: Working, dtype: int64


In [5]:
print(df['Nationality'].value_counts())


Ecuador                            201
Venezuela                          183
España                             165
Perú                               141
Colombia                           136
Bolivia                            100
Honduras                            88
República Dominicana                83
Marruecos                           55
El Salvador                         41
Nigeria                             30
Nicaragua                           29
Paraguay                            25
Cuba                                22
Argentina                           22
Filipinas                           18
Guatemala                           18
Senegal                             15
Portugal                            14
Brasil                              12
Ucrania                              9
Sierra Leona                         9
Bangladesh                           8
Malí                                 8
Argelia                              8
China                    

Separate target and variables, we leave 'nationality' out because we need to one hot encode and there are to many countries.

In [6]:
X = df[['Gender','Age', 'Studies', 'Administrative_situation']]
y = df['Working']

One hot encoding, we transform the categories to binary columns

In [7]:
ce_OHE = ce.OneHotEncoder(cols=['Gender','Age', 'Studies', 'Administrative_situation'])

X = ce_OHE.fit_transform(X)

In [8]:
X

Unnamed: 0,Gender_1,Gender_2,Age_1,Age_2,Age_3,Age_4,Age_5,Studies_1,Studies_2,Studies_3,Studies_4,Studies_5,Studies_6,Studies_7,Administrative_situation_1,Administrative_situation_2,Administrative_situation_3,Administrative_situation_4,Administrative_situation_5,Administrative_situation_6
0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
3,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
1519,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
1520,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
1521,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0


Train/Test split

In [9]:
state = 1
test_size = 0.1
  
X_train, X_val, y_train, y_val = train_test_split(X, y,  
    test_size=test_size, random_state=state)

Now we try some learnig rates to search the best one in a GradientBoostingClassifier

In [10]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_val, y_val)))

Learning rate:  0.05
Accuracy score (training): 0.929
Accuracy score (validation): 0.941
Learning rate:  0.075
Accuracy score (training): 0.929
Accuracy score (validation): 0.941
Learning rate:  0.1
Accuracy score (training): 0.929
Accuracy score (validation): 0.941
Learning rate:  0.25
Accuracy score (training): 0.929
Accuracy score (validation): 0.941
Learning rate:  0.5
Accuracy score (training): 0.929
Accuracy score (validation): 0.941
Learning rate:  0.75
Accuracy score (training): 0.929
Accuracy score (validation): 0.941
Learning rate:  1
Accuracy score (training): 0.929
Accuracy score (validation): 0.941


Plot confusion matrix and visualize our prediction, we see the balance problem.

In [11]:
gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=1, max_features=2, max_depth=2, random_state=0)
gb_clf2.fit(X_train, y_train)
predictions = gb_clf2.predict(X_val)

print("Confusion Matrix:")
print(confusion_matrix(y_val, predictions))

print("Classification Report")
print(classification_report(y_val, predictions))

Confusion Matrix:
[[144   0]
 [  9   0]]
Classification Report
              precision    recall  f1-score   support

          No       0.94      1.00      0.97       144
         Yes       0.00      0.00      0.00         9

    accuracy                           0.94       153
   macro avg       0.47      0.50      0.48       153
weighted avg       0.89      0.94      0.91       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


We try a xgboost but we expect the results are going to be the same.
For this classifier we need to change Yes/No for 1/0

In [12]:
y_train = y_train.replace(['Yes','No'],[1,0])
y_val = y_val.replace(['Yes','No'],[1,0])
y_train = y_train.astype("int")
y_val = y_val.astype("int")

In [13]:
from xgboost import XGBClassifier
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [14]:
score = xgb_clf.score(X_val, y_val)
print(score)

0.9411764705882353


We get the same accuracy as the GradientBoostingClassifier.

Print the prediction to see our problem.

In [15]:
y_pred = xgb_clf.predict(X_val)
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0]


One more time we see the balance problem, its not the model its our data.

In future notebooks we will use libraries to change our data and try to solve this problem better.