In [1]:
import os
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, RandomizedSearchCV, learning_curve
from sklearn.preprocessing import MinMaxScaler, RobustScaler, OneHotEncoder,StandardScaler
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score, roc_auc_score, log_loss
from xgboost import XGBClassifier

In [2]:
target = 'PROSTITUTION'

In [3]:
out_folder = r'C:\Users\eduar\OneDrive\Área de Trabalho\EXPERIMENTOS DATA SCIENCE E GIS\CASE KG\infolder\optimization'

In [4]:
#Reading feature engineering result.
df_train = pd.read_csv(r"C:\Users\eduar\OneDrive\Área de Trabalho\EXPERIMENTOS DATA SCIENCE E GIS\CASE KG\infolder\pre_processing\df_train.csv")
df_test= pd.read_csv(r"C:\Users\eduar\OneDrive\Área de Trabalho\EXPERIMENTOS DATA SCIENCE E GIS\CASE KG\infolder\pre_processing\df_test.csv")

In [5]:
X_train, y_train = df_train.drop(target, axis=1), df_train[target]
X_test, y_test = df_test.drop(target, axis=1), df_test[target]

# PRE-PROCESSING

Dropping useles columns.

In [6]:
def dropar_coluna(df):
    lista_drop = [x for x in df.columns if x not in cat_cols and x not in num_cols]
    df.drop(lista_drop,axis=1,inplace = True)

In [7]:
cat_cols = ["DayOfWeek","AV","Block","crossing","PdDistrict","ST","cluster","night_time",'late_night', 'evening']
num_cols = ["X","Y",'dist_police',"Month","Year","hour",'dist_bar','dist_nightclub']

In [8]:
dropar_coluna(X_train)
dropar_coluna(X_test)

To solve this classification problem, I decided to use the XGboost algorithm.

In [9]:
# Configure the parameters
param = {'booster': 'dart', 
         'tree_method': 'gpu_hist',
         'predictor': 'gpu_predictor',
         'max_depth': 7, 
         'eta': 0.15, 
         'objective': 'binary:logistic', 
         'eval_metric': 'auc', 
         'num_round': 500,
         'feature_selector':'greedy', 
         'sampling_method': 'gradient_based',
         'random_state':123
        }

Creating a pipeline

In [10]:
#Transformers
pipe_cat_features = (
    'onehot_encoder',
    OneHotEncoder(handle_unknown='ignore'),
    cat_cols
)


pipe_num_features = (
    'MinMaxScaler',
    MinMaxScaler(),
    num_cols
)



In [11]:
# Creating the transformers list
transformers = [pipe_cat_features, pipe_num_features]
pipe_transformers = ColumnTransformer(transformers)

#Creating the pipe.
pipe = Pipeline(
        steps=[
            ('pre_processor', pipe_transformers),
            ('model', XGBClassifier(**param))])

In [15]:
#Creating our model.
xgb_clf = pipe 
xgb_clf.fit(X_train, y_train.ravel())
score = xgb_clf.score(X_test, y_test.ravel())

#Predicting the test dataset
y_pred = xgb_clf.predict(X_test)

#Print the classification report
results_log = classification_report(y_test, y_pred)
print(results_log)



Parameters: { "feature_selector", "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    262262
         1.0       0.71      0.55      0.62      2675

    accuracy                           0.99    264937
   macro avg       0.85      0.77      0.81    264937
weighted avg       0.99      0.99      0.99    264937



In [16]:
#creating a text file to save the results log.
text_file = open(r"C:\Users\eduar\OneDrive\Área de Trabalho\EXPERIMENTOS DATA SCIENCE E GIS\CASE KG\logs\PRE_PROCESSING", "w")

log = text_file.write(results_log)
#close file
text_file.close()

#verifying if the file is empty
print(log)

326


Saving our progress

In [17]:
X_train.to_csv(os.path.join(out_folder,"X_train.csv"),index=False)
X_test.to_csv(os.path.join(out_folder,"X_test.csv"),index=False)
y_train.to_csv(os.path.join(out_folder,"y_train.csv"),index=False)
y_test.to_csv(os.path.join(out_folder,"y_test.csv"),index=False)