In [1]:
from imblearn.over_sampling import SMOTE,SMOTENC
from imblearn.under_sampling import NearMiss,RandomUnderSampler
import os
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, RandomizedSearchCV, learning_curve
from sklearn.preprocessing import MinMaxScaler, RobustScaler, OneHotEncoder,StandardScaler
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score, roc_auc_score, log_loss
from xgboost import XGBClassifier

In [2]:
in_folder = r'C:\Users\eduar\OneDrive\Área de Trabalho\EXPERIMENTOS DATA SCIENCE E GIS\CASE KG\infolder\optimization'

In [3]:
target = 'PROSTITUTION'

In [4]:
X_train = pd.read_csv(os.path.join(in_folder,"X_train.csv"))
X_test = pd.read_csv(os.path.join(in_folder,"X_test.csv"))
y_train = pd.read_csv(os.path.join(in_folder,"y_train.csv"))
y_test = pd.read_csv(os.path.join(in_folder,"y_test.csv"))

In [5]:
def dropar_coluna(df):
    lista_drop = [x for x in df.columns if x not in cat_cols and x not in num_cols]
    df.drop(lista_drop,axis=1,inplace = True)

In [6]:
cat_cols = ["DayOfWeek","AV","Block","crossing","PdDistrict","ST","cluster","night_time",'late_night', 'evening']
num_cols = ["X","Y",'dist_police',"Month","Year","hour",'dist_bar','dist_nightclub']

In [7]:
dropar_coluna(X_train)
dropar_coluna(X_test)

In [8]:
#Transformers
pipe_cat_features = (
    'onehot_encoder',
    OneHotEncoder(handle_unknown='ignore'),
    cat_cols
)


pipe_num_features = (
    'MinMaxScaler',
    MinMaxScaler(),
    num_cols
)

In [9]:
# Creating the transformers list
transformers = [pipe_cat_features, pipe_num_features]
pipe_transformers = ColumnTransformer(transformers)

In [10]:
over = SMOTE(random_state=123,n_jobs = 6,sampling_strategy=0.15)
under = RandomUnderSampler(sampling_strategy=0.15)

In [11]:
# A parameter grid for XGBoost
params = {
        'model__min_child_weight': [1, 5, 10],
        'model__gamma': [0.5, 1, 1.5, 2, 5],
        'model__subsample': [0.6, 0.8, 1.0],
        'model__colsample_bytree': [0.6, 0.8, 1.0],
        'model__max_depth': [3, 4, 5,6,7,8,9,10,11],
        'model__eta' : [0.02,0.15,0.2]
        
        }


In [12]:
from imblearn.pipeline import Pipeline
pipeline = Pipeline(steps=[('pre_processor', pipe_transformers),('o', over),("u",under),('model', XGBClassifier(**params))])

In [13]:
xgb = XGBClassifier(predictor = "gpu_predictor",tree_method="gpu_hist", n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1,random_state=123)

In [14]:
folds = 5
param_comb = 200

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 123)

random_search = RandomizedSearchCV(pipeline, param_distributions=params, n_iter=param_comb,
                                   scoring='f1', n_jobs=6, cv=skf.split(X_train,y_train),
                                   verbose=1, random_state=123 )


random_search.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Parameters: { "model__colsample_bytree", "model__eta", "model__gamma", "model__max_depth", "model__min_child_weight", "model__subsample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x000002582C0CAAC0>,
                   estimator=Pipeline(steps=[('pre_processor',
                                              ColumnTransformer(transformers=[('onehot_encoder',
                                                                               OneHotEncoder(handle_unknown='ignore'),
                                                                               ['DayOfWeek',
                                                                                'AV',
                                                                                'Block',
                                                                                'crossing',
                                                                                'PdDistrict',
                                                                                'ST',
                                                                                'cluster',
         

In [15]:
print(random_search.best_estimator_)

Pipeline(steps=[('pre_processor',
                 ColumnTransformer(transformers=[('onehot_encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['DayOfWeek', 'AV', 'Block',
                                                   'crossing', 'PdDistrict',
                                                   'ST', 'cluster',
                                                   'night_time', 'late_night',
                                                   'evening']),
                                                 ('MinMaxScaler',
                                                  MinMaxScaler(),
                                                  ['X', 'Y', 'dist_police',
                                                   'Month', 'Year', 'hour',
                                                   'dist_bar',
                                                   'dist_nightclub'])])),
                ('o'

In [16]:
print(random_search.best_score_)

0.6630795271184154


In [17]:
print(random_search.best_params_)

{'model__subsample': 1.0, 'model__min_child_weight': 5, 'model__max_depth': 10, 'model__gamma': 2, 'model__eta': 0.2, 'model__colsample_bytree': 1.0}


In [18]:
over = SMOTE(random_state=123,n_jobs = 6,sampling_strategy=0.15)
under = RandomUnderSampler(sampling_strategy=0.15)


#Creating a single pipeline with XGboost classifier
parametros_otimizados = {
 'tree_method': 'gpu_hist',
 'predictor': 'gpu_predictor',
 'max_depth': 11,
 'eta': 0.2,
 'objective': 'binary:logistic',
 'min_child_weight': 5,
 'random_state': 123,'gamma': 1,'colsample_bytree': 1.0,'subsample': 0.8,
 'eval_metric': 'auc','sampling_method': 'gradient_based','booster': 'dart'}

#pipe_xg = create_pipe(XGBClassifier(**parametros_otimizados), pipe_transformers)
pipe = Pipeline(steps=[('pre_processor', pipe_transformers),('o', over),("u",under),('model', XGBClassifier(**parametros_otimizados))])

xgb_clf = pipe 
xgb_clf.fit(X_train, y_train)
score = xgb_clf.score(X_test, y_test)
print(score)

#Predicting the test dataset
y_pred = xgb_clf.predict(X_test)

#Print the classification report
results_log = classification_report(y_test, y_pred)
print(results_log)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.9925982403363819
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    262262
         1.0       0.62      0.67      0.65      2675

    accuracy                           0.99    264937
   macro avg       0.81      0.83      0.82    264937
weighted avg       0.99      0.99      0.99    264937

