In [1]:
import os, sys
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import shutil
import collections
from pathlib import Path

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Loading packages

In [3]:
import sys
from pathlib import Path

here_path = Path().resolve()
repo_path = here_path.parents[1]
sys.path.append(str(repo_path))

In [4]:
from py.utils import verifyDir, verifyFile, verifyType

In [5]:
import graphviz
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
from py.config import Config

cfg = Config()

np.random.seed(cfg.RANDOM_STATE)
cfg.DATA_PATH, cfg.MODEL_PATH

('/media/felipe/DATA19/datasets/', '/media/felipe/DATA19/models/')

In [7]:
RANDOM_STATE=42
CATEGORY="safety"
N_JOBS=5
DELTA=0.45
GEO_LEVEL="city"
NAME_LEVEL="Rio De Janeiro"
SEGMENT_MODEL="OneFormer_Swin_large/2025-03-29_10:33:12/"

In [8]:
QSCORE_PATH=f"{cfg.DATA_PATH}pp1/Qscores/"
IMAGES_PATH = f"{cfg.DATA_PATH}pp1/images/"
FEATURES_PATH = f"{cfg.DATA_PATH}pp1/features/{cfg.YEAR_STUDIED}/"
MODEL_PATH = f"{cfg.MODEL_PATH}pp1/{cfg.YEAR_STUDIED}/models/"
EXPLAIN_PATH = f"{cfg.MODEL_PATH}pp1/{cfg.YEAR_STUDIED}/explanations/"

In [9]:
verifyDir(MODEL_PATH)
verifyDir(EXPLAIN_PATH)

### Loading data

In [10]:
%%time
import pickle

with open(f"{FEATURES_PATH}features.pkl", "rb") as f:
    features_dict = pickle.load(f)
del features_dict["sift"]

CPU times: user 262 ms, sys: 533 ms, total: 795 ms
Wall time: 795 ms


In [11]:
features_df = pd.DataFrame(data=features_dict)
features_df["image_id"] = features_df["image_id"].apply(str)
features_df

Unnamed: 0,image_id,image_path,gist,fisher
0,4340,/media/felipe/DATA19/datasets/pp1/images/2011/...,"[0.0008650855893544305, -0.0016413003778488725...","[0.009395621333424064, 0.007253973775243361, -..."
1,4196,/media/felipe/DATA19/datasets/pp1/images/2011/...,"[-0.0007681568709080227, 0.0014784010874829717...","[0.006656489039275545, 0.01332182310770414, 0...."
2,4189,/media/felipe/DATA19/datasets/pp1/images/2011/...,"[0.0003149138732036082, 0.00019596856103487522...","[0.006104844073772749, -0.008694585662540609, ..."
3,3527,/media/felipe/DATA19/datasets/pp1/images/2011/...,"[0.001540033237028629, 0.0020671068933780044, ...","[0.0057307966478143705, 0.012152723374981512, ..."
4,4727,/media/felipe/DATA19/datasets/pp1/images/2011/...,"[0.0005254321209228526, -0.0003138953361233465...","[-0.006714747711235776, -0.013377853242195396,..."
...,...,...,...,...
2451,3520,/media/felipe/DATA19/datasets/pp1/images/2011/...,"[0.0009518040725277828, 0.0007872933647441974,...","[0.018740677427759696, 0.017024552002813317, -..."
2452,4725,/media/felipe/DATA19/datasets/pp1/images/2011/...,"[0.0007986571357105439, 0.0007759619659164433,...","[0.0010412029276445124, -0.007840056165476398,..."
2453,4904,/media/felipe/DATA19/datasets/pp1/images/2011/...,"[-0.001921776299813869, 0.0029886991437241177,...","[-0.010865904188049917, -0.0024485266854510097..."
2454,4162,/media/felipe/DATA19/datasets/pp1/images/2011/...,"[0.0004141868588518098, 0.0006710074009996883,...","[0.004714490565311111, 0.006047252483389254, 0..."


In [12]:
%%time
data_df = pd.read_csv(f"{QSCORE_PATH}scores.csv", sep=";", low_memory=False)
data_df["image_path"] = f"{IMAGES_PATH}{cfg.YEAR_STUDIED}/" + data_df["image_path"]
data_df["image_id"] = data_df["image_id"].apply(str)
data_df.sort_values(by=[cfg.PRECEPTION_METRIC], ascending=False, inplace=True)
data_df

CPU times: user 3.89 ms, sys: 861 μs, total: 4.75 ms
Wall time: 4.41 ms


Unnamed: 0,image_id,safety,safety_err,uniquely,uniquely_err,wealthy,wealthy_err,lat,long,city,Heading,Pitch,image_path,continent,country
1130,4340,8.350865,0.621166,4.528970,0.668607,7.245005,0.577930,42.3354,-71.1407,Boston,85.0,-3.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
891,4196,8.327766,0.556107,5.022737,0.833578,7.498167,0.656081,40.6780,-73.9703,New York,329.0,1.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
1117,4189,8.327241,0.602638,7.108436,0.738665,7.594670,0.634063,40.6948,-73.9712,New York,33.0,11.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
370,3527,8.245561,0.456143,7.760654,0.836348,8.142057,0.693277,40.6795,-73.9731,New York,324.0,3.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
303,4727,8.236001,0.510115,7.601103,0.631219,9.164302,0.627555,42.3417,-71.0792,Boston,105.0,6.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,3520,0.762055,0.460119,2.167670,0.572258,1.766562,0.470312,40.6307,-74.0047,New York,346.0,-1.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
301,4725,0.560845,0.582538,2.764210,0.757413,0.966390,0.523086,42.3812,-71.0890,Boston,15.0,-3.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
2276,4904,0.517857,0.527475,2.454082,0.681900,1.576652,0.381375,42.3888,-71.0872,Boston,8.0,-2.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
1188,4162,0.465477,0.462276,1.970528,0.661715,1.627882,0.514609,40.7096,-73.9278,New York,28.0,4.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA


In [13]:
data_df = pd.merge(data_df, features_df, how="inner", on=["image_id", "image_path"])
data_df

Unnamed: 0,image_id,safety,safety_err,uniquely,uniquely_err,wealthy,wealthy_err,lat,long,city,Heading,Pitch,image_path,continent,country,gist,fisher
0,4340,8.350865,0.621166,4.528970,0.668607,7.245005,0.577930,42.3354,-71.1407,Boston,85.0,-3.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA,"[0.0008650855893544305, -0.0016413003778488725...","[0.009395621333424064, 0.007253973775243361, -..."
1,4196,8.327766,0.556107,5.022737,0.833578,7.498167,0.656081,40.6780,-73.9703,New York,329.0,1.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA,"[-0.0007681568709080227, 0.0014784010874829717...","[0.006656489039275545, 0.01332182310770414, 0...."
2,4189,8.327241,0.602638,7.108436,0.738665,7.594670,0.634063,40.6948,-73.9712,New York,33.0,11.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA,"[0.0003149138732036082, 0.00019596856103487522...","[0.006104844073772749, -0.008694585662540609, ..."
3,3527,8.245561,0.456143,7.760654,0.836348,8.142057,0.693277,40.6795,-73.9731,New York,324.0,3.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA,"[0.001540033237028629, 0.0020671068933780044, ...","[0.0057307966478143705, 0.012152723374981512, ..."
4,4727,8.236001,0.510115,7.601103,0.631219,9.164302,0.627555,42.3417,-71.0792,Boston,105.0,6.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA,"[0.0005254321209228526, -0.0003138953361233465...","[-0.006714747711235776, -0.013377853242195396,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2451,3520,0.762055,0.460119,2.167670,0.572258,1.766562,0.470312,40.6307,-74.0047,New York,346.0,-1.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA,"[0.0009518040725277828, 0.0007872933647441974,...","[0.018740677427759696, 0.017024552002813317, -..."
2452,4725,0.560845,0.582538,2.764210,0.757413,0.966390,0.523086,42.3812,-71.0890,Boston,15.0,-3.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA,"[0.0007986571357105439, 0.0007759619659164433,...","[0.0010412029276445124, -0.007840056165476398,..."
2453,4904,0.517857,0.527475,2.454082,0.681900,1.576652,0.381375,42.3888,-71.0872,Boston,8.0,-2.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA,"[-0.001921776299813869, 0.0029886991437241177,...","[-0.010865904188049917, -0.0024485266854510097..."
2454,4162,0.465477,0.462276,1.970528,0.661715,1.627882,0.514609,40.7096,-73.9278,New York,28.0,4.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA,"[0.0004141868588518098, 0.0006710074009996883,...","[0.004714490565311111, 0.006047252483389254, 0..."


##### Selecting samples

In [14]:
top_n = int(len(data_df) * cfg.DELTA)
top_df = data_df.nlargest(top_n, cfg.PRECEPTION_METRIC)
top_df["labels"] = "safe"
bot_df = data_df.nsmallest(top_n, cfg.PRECEPTION_METRIC)
bot_df["labels"] = "unsafe"
print(f"Top max: {top_df[cfg.PRECEPTION_METRIC].max()}, min:{top_df[cfg.PRECEPTION_METRIC].min()}\nBot max: {bot_df[cfg.PRECEPTION_METRIC].max()}, min:{bot_df[cfg.PRECEPTION_METRIC].min()}")

Top max: 8.35086458651, min:5.02335188938
Bot max: 4.54807929686, min:0.211543520683


In [15]:
delta_df = pd.concat([top_df, bot_df], ignore_index=True)
delta_df.sort_values(by=[cfg.PRECEPTION_METRIC], inplace=True,ascending=False)
delta_df["classes"] = delta_df[cfg.PRECEPTION_METRIC].apply(lambda x: 1 if x > 5 else 0)
delta_df["labels"].value_counts()

labels
safe      1031
unsafe    1031
Name: count, dtype: int64

In [16]:
label_map = dict(zip(delta_df["classes"], delta_df["labels"]))
labels = list(label_map.values())
classes = list(label_map.keys())
label_map

{1: 'safe', 0: 'unsafe'}

#### Train Test split

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
train_df, test_df = train_test_split(delta_df, 
                                    stratify=delta_df["classes"],
                                    test_size = 0.25, 
                                    random_state = cfg.RANDOM_STATE)

In [19]:
X_train = np.array(train_df["gist"].tolist())
X_test = np.array(test_df["gist"].tolist())
y_train = np.array(train_df["classes"].tolist())
y_test = np.array(test_df["classes"].tolist())
X_train.shape, X_test.shape

((1546, 256), (516, 256))

In [20]:
collections.Counter(y_train), collections.Counter(y_test)

(Counter({np.int64(1): 773, np.int64(0): 773}),
 Counter({np.int64(0): 258, np.int64(1): 258}))

### GridSearch

In [21]:
from py.models.classification.linear import LinearClassifier

In [22]:
model_search = LinearClassifier()

#### Train

In [23]:
%%time
results = model_search.fit_all(X_train, y_train)


Fitting logistic_regression...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

Fitting ridge...
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Fitting linear_svc...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

Fitting svm...
Fitting 5 folds for each of 140 candidates, totalling 700 fits
CPU times: user 4 s, sys: 195 ms, total: 4.2 s
Wall time: 28.1 s


In [24]:
model_search.print_results()


GRID SEARCH RESULTS SUMMARY

SVM:
  Best CV Score: 0.6450
  Best Parameters:
    classifier__C: 1.0
    classifier__class_weight: balanced
    classifier__gamma: scale
    classifier__kernel: rbf

LOGISTIC_REGRESSION:
  Best CV Score: 0.5596
  Best Parameters:
    classifier__C: 0.01
    classifier__class_weight: None
    classifier__penalty: l2

LINEAR_SVC:
  Best CV Score: 0.5557
  Best Parameters:
    classifier__C: 0.01
    classifier__class_weight: balanced
    classifier__loss: hinge
    classifier__penalty: l2

RIDGE:
  Best CV Score: 0.5498
  Best Parameters:
    classifier__alpha: 100.0
    classifier__class_weight: balanced

BEST OVERALL MODEL: SVM
Best Score: 0.6450


In [25]:
summary_df = model_search.get_results_summary()
summary_df

Unnamed: 0,model,best_score,best_params
3,svm,0.644977,"{'classifier__C': np.float64(1.0), 'classifier..."
0,logistic_regression,0.559581,"{'classifier__C': np.float64(0.01), 'classifie..."
2,linear_svc,0.555681,"{'classifier__C': np.float64(0.01), 'classifie..."
1,ridge,0.549837,"{'classifier__alpha': np.float64(100.0), 'clas..."


#### Inference

In [26]:
from sklearn.metrics import classification_report

In [27]:
y_pred = model_search.predict(X_test) # model_name='logistic_regression')

In [28]:
clf_lr_report = classification_report(y_test, y_pred, output_dict=True)

print(classification_report(y_test, y_pred, target_names=labels))

              precision    recall  f1-score   support

        safe       0.62      0.65      0.64       258
      unsafe       0.63      0.60      0.62       258

    accuracy                           0.63       516
   macro avg       0.63      0.63      0.63       516
weighted avg       0.63      0.63      0.63       516



In [29]:
confusion_matrix = pd.crosstab(
    [ label_map[k] for k in y_test ],
    [ label_map[k] for k in y_pred ],
    rownames=['Real'],
    colnames=['Prediction']
)
confusion_matrix

Prediction,safe,unsafe
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
safe,155,103
unsafe,90,168


#### Saving

In [30]:
# Save grid search
model_search.save(f'{MODEL_PATH}model_search.pkl')

Instance saved to /media/felipe/DATA19/models/pp1/2011/models/model_search.pkl using pickle


In [31]:
# Save just the best trained model
model_search.save_best_model_only(f'{MODEL_PATH}best_model.pkl')

Best model (svm) saved to /media/felipe/DATA19/models/pp1/2011/models/best_model.pkl


#### Loading

In [36]:
loaded_search = LinearClassifier.load(f'{MODEL_PATH}model_search.pkl')
y_pred = loaded_search.predict(X_test)
confusion_matrix = pd.crosstab(
    [ label_map[k] for k in y_test ],
    [ label_map[k] for k in y_pred ],
    rownames=['Real'],
    colnames=['Prediction']
)
confusion_matrix

Instance loaded from /media/felipe/DATA19/models/pp1/2011/models/model_search.pkl using pickle


Prediction,safe,unsafe
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
safe,155,103
unsafe,90,168


In [37]:
# Load and use the best model
best_model = LinearClassifier.load_model(f'{MODEL_PATH}best_model.pkl')
y_pred = best_model.predict(X_test)
confusion_matrix = pd.crosstab(
    [ label_map[k] for k in y_test ],
    [ label_map[k] for k in y_pred ],
    rownames=['Real'],
    colnames=['Prediction']
)
confusion_matrix

Model loaded from /media/felipe/DATA19/models/pp1/2011/models/best_model.pkl


Prediction,safe,unsafe
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
safe,155,103
unsafe,90,168
