In [1]:
import os, sys
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import shutil
import collections
from pathlib import Path

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Loading packages

In [3]:
import sys
from pathlib import Path

here_path = Path().resolve()
repo_path = here_path.parents[1]
sys.path.append(str(repo_path))

In [4]:
from py.utils import verifyDir, verifyFile, verifyType

In [5]:
from py.config import Config

cfg = Config()

np.random.seed(cfg.RANDOM_STATE)
cfg.DATA_PATH, cfg.MODEL_PATH

('/media/felipe/DATA19/datasets/', '/media/felipe/DATA19/models/')

In [6]:
QSCORE_PATH=f"{cfg.DATA_PATH}pp2/{cfg.SCORING_METHOD}/{cfg.PLACE_LEVEL}/"
IMAGES_PATH = f"{cfg.DATA_PATH}pp2/images/"

FEATURES_PATH = f"{cfg.DATA_PATH}pp2/segmentations/{cfg.DATASET_SEG_NAME}/{cfg.MODEL_SEG_NAME}/"
MODEL_PATH = f"{cfg.MODEL_PATH}pp2/{cfg.ML_TYPE}/{cfg.SCORING_METHOD}/"

In [7]:
verifyDir(MODEL_PATH)

### Loading data

In [8]:
features_df = pd.read_csv(f"{FEATURES_PATH}segmentations.csv", sep=";", low_memory=False)
features_df = features_df.loc[:, (features_df != 0).any(axis=0)].copy()
feature_names = features_df.columns.tolist()
feature_names.remove("image_id")
features_df

Unnamed: 0,wall,building,sky,floor,tree,ceiling,road,windowpane,grass,sidewalk,...,tank,trade_name,pot,bicycle,sculpture,traffic_light,ashcan,pier,flag,image_id
0,0.000000,1.574167,42.730833,0.0,5.031667,0.0,36.835833,0.0,0.033333,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0,513d677cfdc9f035870040af
1,0.232500,5.849167,8.832500,0.0,37.846667,0.0,31.366667,0.0,7.345000,3.849167,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0,513d6c8ffdc9f03587004fcc
2,0.125833,21.213333,13.651667,0.0,15.863333,0.0,25.864167,0.0,0.789167,11.385833,...,0.0,0.0,1.377500,0.0,0.0,0.0,0.000,0.0,0.0,513d6b38fdc9f03587004c82
3,0.000000,0.015833,41.790833,0.0,16.505833,0.0,32.465000,0.0,0.000000,3.591667,...,0.0,0.0,0.193333,0.0,0.0,0.0,0.205,0.0,0.0,513d6bb6fdc9f03587004db0
4,0.199167,0.106667,6.445000,0.0,46.457500,0.0,13.627500,0.0,18.599167,9.519167,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0,513d67a8fdc9f03587004125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296,0.000000,12.243333,37.321667,0.0,5.678333,0.0,31.678333,0.0,0.000000,0.475000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0,513d6ba6fdc9f03587004d8e
1297,0.000000,38.336667,19.234167,0.0,0.928333,0.0,38.718333,0.0,1.423333,1.359167,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0,513d686ffdc9f03587004385
1298,0.000000,44.598333,15.456667,0.0,1.174167,0.0,23.995000,0.0,0.000000,7.303333,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0,513d6a2afdc9f0358700499f
1299,17.496667,0.057500,38.990833,0.0,7.129167,0.0,34.227500,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0,513d6845fdc9f035870042f7


In [9]:
%%time
data_df = pd.read_csv(f"{QSCORE_PATH}scores.csv", sep=";", low_memory=False)
data_df["image_path"] = f"{IMAGES_PATH}" + data_df["image_path"]
data_df.sort_values(by=[cfg.PERCEPTION_METRIC], ascending=False, inplace=True)
data_df = pd.merge(data_df, features_df, how="inner", on=["image_id"])
data_df

CPU times: user 166 ms, sys: 45 ms, total: 211 ms
Wall time: 211 ms


Unnamed: 0,image_id,lat,long,city,country,continent,safety,beautiful,wealthy,lively,...,minibike,tank,trade_name,pot,bicycle,sculpture,traffic_light,ashcan,pier,flag
0,513d677cfdc9f035870040af,42.370774,-71.126977,Boston,USA,North America,8.583389,5.333333,6.055556,5.029020,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0
1,513d6c8ffdc9f03587004fcc,42.385246,-71.173652,Boston,USA,North America,8.222581,6.075926,5.277778,5.432792,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0
2,513d6b38fdc9f03587004c82,42.353972,-71.063971,Boston,USA,North America,8.016417,7.222222,7.500000,3.333333,...,0.0,0.0,0.0,1.377500,0.0,0.0,0.0,0.000,0.0,0.0
3,513d6bb6fdc9f03587004db0,42.315775,-71.034048,Boston,USA,North America,7.945597,5.303030,3.888889,7.810280,...,0.0,0.0,0.0,0.193333,0.0,0.0,0.0,0.205,0.0,0.0
4,513d67a8fdc9f03587004125,42.301764,-71.114991,Boston,USA,North America,7.918534,5.277778,6.333333,3.948302,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296,513d6ba6fdc9f03587004d8e,42.412917,-71.106858,Boston,USA,North America,1.764706,3.845679,2.777778,3.918651,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0
1297,513d6a2afdc9f0358700499f,42.374378,-71.119751,Boston,USA,North America,1.666667,5.611111,6.833333,6.494829,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0
1298,513d686ffdc9f03587004385,42.384908,-70.982451,Boston,USA,North America,1.666667,3.333333,1.666667,3.522727,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0
1299,513d6845fdc9f035870042f7,42.312479,-71.052465,Boston,USA,North America,1.615313,2.148148,2.962963,3.802910,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0


In [10]:
%%time
from py.models.datasets import PlacePulse

pp = PlacePulse(data_df)
pp.DataPreparation(delta=cfg.DELTA, emotion=cfg.PERCEPTION_METRIC)
pp.TaskPreparation(task_type=cfg.ML_TASK)
pp.DataSplit()

print(f"Train samples: {len(pp.train_df)}")
print(f"Test samples: {len(pp.test_df)}")

Applying delta 0.42.
Top max: 8.583389450056117, min:5.280795156526829, size: 546
Bot max: 4.855792355792356, min:0.8333333333333334, size: 546
Train samples: 819
Test samples: 273
CPU times: user 647 ms, sys: 84.2 ms, total: 731 ms
Wall time: 735 ms


In [11]:
label_map = dict(zip(pp.data_df["target"], pp.data_df["label"]))
labels = list(label_map.values())
classes = list(label_map.keys())
label_map

{1: 'safety', 0: 'not safety'}

In [12]:
X_train = np.array(pp.train_df[feature_names].values.tolist())
X_test = np.array(pp.test_df[feature_names].values.tolist())
y_train = np.array(pp.train_df["target"].tolist())
y_test = np.array(pp.test_df["target"].tolist())
X_train.shape, X_test.shape

((819, 65), (273, 65))

In [13]:
collections.Counter(y_train), collections.Counter(y_test)

(Counter({1: 410, 0: 409}), Counter({0: 137, 1: 136}))

### GridSearch

In [14]:
from py.models.classification.linear import LinearClassifier
from py.models.classification.ensemble import EnsembleClassifier

model_search = LinearClassifier() if cfg.ML_TYPE=="linear" else EnsembleClassifier()
model_search.model_zoo()

Model zoo: ['decision_tree', 'random_forest', 'gradient_boosting', 'adaboost', 'extra_trees', 'hist_gradient_boosting', 'bagging', 'xgboost', 'lightgbm', 'catboost'] 



#### Train

In [15]:
%%time
results = model_search.fit_all(X_train, y_train)


Fitting decision_tree...
Fitting 5 folds for each of 256 candidates, totalling 1280 fits

Fitting random_forest...
Fitting 5 folds for each of 512 candidates, totalling 2560 fits

Fitting gradient_boosting...
Fitting 5 folds for each of 768 candidates, totalling 3840 fits

Fitting adaboost...
Fitting 5 folds for each of 25 candidates, totalling 125 fits

Fitting extra_trees...
Fitting 5 folds for each of 1024 candidates, totalling 5120 fits

Fitting hist_gradient_boosting...
Fitting 5 folds for each of 288 candidates, totalling 1440 fits

Fitting bagging...
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Fitting xgboost...
Fitting 5 folds for each of 243 candidates, totalling 1215 fits

Fitting lightgbm...
Fitting 5 folds for each of 486 candidates, totalling 2430 fits





Fitting catboost...
Fitting 5 folds for each of 162 candidates, totalling 810 fits
CPU times: user 32.4 s, sys: 4.01 s, total: 36.4 s
Wall time: 7min 23s


In [16]:
model_search.print_results()


GRID SEARCH RESULTS SUMMARY

EXTRA_TREES:
  Best CV Score: 0.6373
  Best Parameters:
    classifier__bootstrap: True
    classifier__class_weight: balanced
    classifier__max_depth: 10
    classifier__max_features: sqrt
    classifier__min_samples_leaf: 3
    classifier__min_samples_split: 3
    classifier__n_estimators: 200

GRADIENT_BOOSTING:
  Best CV Score: 0.6336
  Best Parameters:
    classifier__learning_rate: 0.01
    classifier__max_depth: 3
    classifier__max_features: sqrt
    classifier__min_samples_leaf: 3
    classifier__min_samples_split: 3
    classifier__n_estimators: 100
    classifier__subsample: 0.8

RANDOM_FOREST:
  Best CV Score: 0.6325
  Best Parameters:
    classifier__bootstrap: True
    classifier__class_weight: None
    classifier__max_depth: None
    classifier__max_features: sqrt
    classifier__min_samples_leaf: 3
    classifier__min_samples_split: 3
    classifier__n_estimators: 200

ADABOOST:
  Best CV Score: 0.6300
  Best Parameters:
    classifier__

In [17]:
summary_df = model_search.get_results_summary()
summary_df

Unnamed: 0,model,best_score,best_params
4,extra_trees,0.637308,"{'classifier__bootstrap': True, 'classifier__c..."
2,gradient_boosting,0.63365,"{'classifier__learning_rate': 0.01, 'classifie..."
1,random_forest,0.63249,"{'classifier__bootstrap': True, 'classifier__c..."
3,adaboost,0.629976,"{'classifier__learning_rate': 0.5, 'classifier..."
9,catboost,0.623954,"{'classifier__auto_class_weights': 'Balanced',..."
6,bagging,0.622779,"{'classifier__bootstrap': False, 'classifier__..."
5,hist_gradient_boosting,0.62028,"{'classifier__class_weight': 'balanced', 'clas..."
7,xgboost,0.610494,"{'classifier__colsample_bytree': 0.7, 'classif..."
8,lightgbm,0.605616,"{'classifier__class_weight': 'balanced', 'clas..."
0,decision_tree,0.569015,"{'classifier__max_depth': 10, 'classifier__max..."


#### Inference

In [18]:
from sklearn.metrics import classification_report

In [19]:
y_pred = model_search.predict(X_test) # model_name='logistic_regression')

In [20]:
clf_lr_report = classification_report(y_test, y_pred, output_dict=True)

print(classification_report(y_test, y_pred, target_names=labels))

              precision    recall  f1-score   support

      safety       0.66      0.58      0.62       137
  not safety       0.62      0.70      0.66       136

    accuracy                           0.64       273
   macro avg       0.64      0.64      0.64       273
weighted avg       0.64      0.64      0.64       273



In [21]:
confusion_matrix = pd.crosstab(
    [ label_map[k] for k in y_test ],
    [ label_map[k] for k in y_pred ],
    rownames=['Real'],
    colnames=['Prediction']
)
confusion_matrix

Prediction,not safety,safety
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
not safety,80,57
safety,41,95


#### Saving

In [22]:
# Save grid search
model_search.save(f'{MODEL_PATH}{cfg.DATASET_SEG_NAME}_{cfg.MODEL_SEG_NAME}_model_search.pkl')

Instance saved to /media/felipe/DATA19/models/pp2/ensemble/Qscores/ADE20k_PSP_ResNet101_model_search.pkl using pickle


In [23]:
# Save just the best trained model
model_search.save_best_model_only(f'{MODEL_PATH}{model_search.get_best_model_name()}_{cfg.DATASET_SEG_NAME}_{cfg.MODEL_SEG_NAME}_best_model.pkl')

Best model (extra_trees) saved to /media/felipe/DATA19/models/pp2/ensemble/Qscores/extra_trees_ADE20k_PSP_ResNet101_best_model.pkl


#### Loading

In [24]:
loaded_search = LinearClassifier.load(f'{MODEL_PATH}{cfg.DATASET_SEG_NAME}_{cfg.MODEL_SEG_NAME}_model_search.pkl')
y_pred = loaded_search.predict(X_test)
confusion_matrix = pd.crosstab(
    [ label_map[k] for k in y_test ],
    [ label_map[k] for k in y_pred ],
    rownames=['Real'],
    colnames=['Prediction']
)
confusion_matrix

Instance loaded from /media/felipe/DATA19/models/pp2/ensemble/Qscores/ADE20k_PSP_ResNet101_model_search.pkl using pickle


Prediction,not safety,safety
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
not safety,80,57
safety,41,95


In [25]:
# Load and use the best model
best_model = LinearClassifier.load_model(f'{MODEL_PATH}{model_search.get_best_model_name()}_{cfg.DATASET_SEG_NAME}_{cfg.MODEL_SEG_NAME}_best_model.pkl')
y_pred = best_model.predict(X_test)
confusion_matrix = pd.crosstab(
    [ label_map[k] for k in y_test ],
    [ label_map[k] for k in y_pred ],
    rownames=['Real'],
    colnames=['Prediction']
)
confusion_matrix

Model loaded from /media/felipe/DATA19/models/pp2/ensemble/Qscores/extra_trees_ADE20k_PSP_ResNet101_best_model.pkl


Prediction,not safety,safety
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
not safety,80,57
safety,41,95
