In [1]:
import os, sys
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import shutil
import collections
from pathlib import Path

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Loading packages

In [3]:
import sys
from pathlib import Path

here_path = Path().resolve()
repo_path = here_path.parents[1]
sys.path.append(str(repo_path))

In [4]:
from py.utils import verifyDir, verifyFile, verifyType

In [5]:
import graphviz
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
from py.config import Config

cfg = Config()

np.random.seed(cfg.RANDOM_STATE)
cfg.DATA_PATH, cfg.MODEL_PATH

('/media/felipe/DATA19/datasets/', '/media/felipe/DATA19/models/')

In [7]:
QSCORE_PATH=f"{cfg.DATA_PATH}pp2/Qscores/"
IMAGES_PATH = f"{cfg.DATA_PATH}pp2/images/"
FEATURES_PATH = f"{cfg.MODEL_PATH}pp2/features/cnn/"
MODEL_PATH = f"{cfg.MODEL_PATH}pp2/linear/"

In [8]:
verifyDir(MODEL_PATH)

### Loading data

In [9]:
%%time
import pickle

with open(f"{FEATURES_PATH}{cfg.MODEL_FEATURE_NAME}_features.pkl", "rb") as f:
    features_dict = pickle.load(f)

CPU times: user 18.3 ms, sys: 6.99 ms, total: 25.3 ms
Wall time: 25.1 ms


In [10]:
features_train_df = pd.DataFrame(data=features_dict["train"])
features_train_df["image_id"] = features_train_df["image_id"].apply(str)
features_train_df

Unnamed: 0,image_id,image_path,features,target,label
0,50f5eaf5fdc9f065f0007ee4,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.29300129413604736, 0.0, 0.0,...",0,not safety
1,50f5eb18fdc9f065f0007f6b,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.10629373788833618, 0.0, 0.0, 0.0, 0.0, 0.0,...",0,not safety
2,50f5ec11fdc9f065f0008699,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.11749027669429779,...",1,safety
3,50f5eb67fdc9f065f0008299,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3147827982902...",0,not safety
4,50f5ebcefdc9f065f00085a7,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1126988828182...",0,not safety
...,...,...,...,...,...
2290,50f5ec1bfdc9f065f0008757,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.3260393738746643, 0.18124359846115112,...",1,safety
2291,50f5eaccfdc9f065f0007d21,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7497653961181...",0,not safety
2292,50f5eb61fdc9f065f0008221,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.13098570704460144, 0.0, 0.16...",0,not safety
2293,50f5ea5cfdc9f065f0007ac7,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.48762276768684387, 0.0, 0.067462086677...",0,not safety


In [11]:
features_test_df = pd.DataFrame(data=features_dict["test"])
features_test_df["image_id"] = features_test_df["image_id"].apply(str)
features_test_df

Unnamed: 0,image_id,image_path,features,target,label
0,50f5ec1cfdc9f065f000876e,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.22204318642616272, 0.18190762400627136, 0.0...",0,not safety
1,50f5eb4bfdc9f065f00081da,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1607591509819...",0,not safety
2,50f5ec33fdc9f065f00087b8,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3678600788116...",0,not safety
3,50f5eaaafdc9f065f0007bd8,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.628642201423645, 0.0, 0.0, 0.0, 0.0, 0...",1,safety
4,50f5eaf3fdc9f065f0007eac,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.4599553644657135, ...",0,not safety
...,...,...,...,...,...
760,50f5eb4afdc9f065f00081c0,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.23565949499607086, 0.0, 0.0, 0.0, 0.0, 0.34...",0,not safety
761,50f5eb1afdc9f065f0007f96,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5528129339218...",0,not safety
762,50f5ebc9fdc9f065f000853b,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1389075517654...",0,not safety
763,50f5ec1efdc9f065f0008799,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4825475513935...",0,not safety


In [12]:
%%time
data_df = pd.read_csv(f"{QSCORE_PATH}scores.csv", sep=";", low_memory=False)
data_df["image_path"] = f"{IMAGES_PATH}" + data_df["image_path"]
data_df["image_id"] = data_df["image_id"].apply(str)
data_df.sort_values(by=[cfg.PERCEPTION_METRIC], ascending=False, inplace=True)
data_df

CPU times: user 154 ms, sys: 30.9 ms, total: 185 ms
Wall time: 185 ms


Unnamed: 0,image_id,lat,long,city,country,continent,safety,beautiful,wealthy,lively,boring,depressing,image_path
98091,51414746fdc9f04926006a00,44.961377,-93.271491,Minneapolis,USA,North America,8.780423,7.500000,7.777778,6.278770,5.000000,3.333333,/media/felipe/DATA19/datasets/pp2/images/Minne...
42815,513d677cfdc9f035870040af,42.370774,-71.126977,Boston,USA,North America,8.583389,5.333333,6.055556,5.029020,3.333333,9.166667,/media/felipe/DATA19/datasets/pp2/images/Bosto...
50565,513d7c38fdc9f03587006e0f,33.805683,-84.293833,Atlanta,USA,North America,8.505291,6.342593,8.452381,2.291667,0.000000,3.333333,/media/felipe/DATA19/datasets/pp2/images/Atlan...
16840,50f5642cfdc9f065f00060f0,47.583294,-122.287884,Seattle,USA,North America,8.478006,7.166667,7.500000,5.538597,3.333333,2.777778,/media/felipe/DATA19/datasets/pp2/images/Seatt...
32864,513cc19efdc9f035870014bd,29.751635,-95.466232,Houston,USA,North America,8.440027,7.592593,6.653439,4.321429,2.708333,1.666667,/media/felipe/DATA19/datasets/pp2/images/Houst...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1768,50f42c0bfdc9f065f0001786,52.242431,20.898414,Warsaw,Poland,Europe,0.392157,5.899471,2.833333,4.872958,5.740741,3.333333,/media/felipe/DATA19/datasets/pp2/images/Warsa...
67419,513e6bb5fdc9f0358700c081,35.646141,139.812366,Tokyo,Japan,Asia,0.277778,3.333333,3.333333,4.538332,5.277778,7.438272,/media/felipe/DATA19/datasets/pp2/images/Tokyo...
15585,50f562dafdc9f065f0005ae6,1.292158,103.808340,Singapore,Singapore,Asia,0.256410,4.305556,3.333333,6.666667,5.333333,4.444444,/media/felipe/DATA19/datasets/pp2/images/Singa...
68591,513e6f20fdc9f0358700c51f,35.743776,139.773877,Tokyo,Japan,Asia,0.196078,5.092593,3.080808,6.603175,3.055556,2.500000,/media/felipe/DATA19/datasets/pp2/images/Tokyo...


In [13]:
train_df = pd.merge(data_df, features_train_df, how="inner", on=["image_id", "image_path"])
train_df

Unnamed: 0,image_id,lat,long,city,country,continent,safety,beautiful,wealthy,lively,boring,depressing,image_path,features,target,label
0,50f5eaecfdc9f065f0007e32,-22.901318,-43.178843,Rio De Janeiro,Brasil,South America,8.074111,7.222222,6.521164,5.259259,2.444444,5.000000,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.18660153448581696,...",1,safety
1,50f5eb4bfdc9f065f00081e0,-22.924839,-43.387912,Rio De Janeiro,Brasil,South America,7.990805,3.015873,3.333333,3.767806,2.500000,2.261905,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.3834789991378784, 0.0, 0.0, 0.0, 0.0, 0.049...",1,safety
2,50f5eb44fdc9f065f0008149,-22.946271,-43.371826,Rio De Janeiro,Brasil,South America,7.845118,5.000000,3.333333,3.347885,4.796296,5.555556,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6077491641044...",1,safety
3,50f5eaa3fdc9f065f0007b62,-22.848260,-43.265251,Rio De Janeiro,Brasil,South America,7.784127,2.222222,3.333333,4.777778,0.000000,4.500000,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.6598587036132812, 0.0, 0.0483124256134...",1,safety
4,50f5ebaafdc9f065f0008430,-22.911389,-43.194286,Rio De Janeiro,Brasil,South America,7.758611,5.836508,5.000000,5.695847,4.509259,3.333333,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.2719610333442688, ...",1,safety
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2290,50f5eb26fdc9f065f0008065,-22.912840,-43.252074,Rio De Janeiro,Brasil,South America,0.833333,3.574074,2.666667,2.893366,5.000000,7.296296,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.39527323842048645, 0.0, 0.0, 0.0, 0.0,...",0,not safety
2291,50f5ebcefdc9f065f0008595,-22.875385,-43.278050,Rio De Janeiro,Brasil,South America,0.833333,2.190476,2.500000,1.929293,6.388889,6.314815,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.023813217878341675, 0.0, 0.0, 0.0, 0.3...",0,not safety
2292,50f5eac8fdc9f065f0007ccc,-22.775293,-43.391274,Rio De Janeiro,Brasil,South America,0.833333,2.500000,1.666667,5.274365,2.055556,7.619048,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.11960434913635254, 0.0, 0.0, 0.0, 0.0, 0.0,...",0,not safety
2293,50f5eb27fdc9f065f000807f,-22.853284,-43.319558,Rio De Janeiro,Brasil,South America,0.512821,3.333333,2.222222,4.174539,5.592593,6.333333,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.029851414263248444, 0.0, 0.0, 0.0, 0.0...",0,not safety


In [14]:
test_df = pd.merge(data_df, features_test_df, how="inner", on=["image_id", "image_path"])
test_df

Unnamed: 0,image_id,lat,long,city,country,continent,safety,beautiful,wealthy,lively,boring,depressing,image_path,features,target,label
0,50f5eb65fdc9f065f000826f,-22.829954,-43.374070,Rio De Janeiro,Brasil,South America,7.920455,3.740079,3.117284,3.333333,2.222222,5.000000,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.02342948317527771, 0.0, 0.0, 0.091920152306...",1,safety
1,50f5ec0cfdc9f065f000863e,-22.884331,-43.274540,Rio De Janeiro,Brasil,South America,7.514245,5.637566,5.333333,4.514906,3.333333,4.393939,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.5245117545127869, ...",1,safety
2,50f5eb44fdc9f065f0008156,-22.942290,-43.178092,Rio De Janeiro,Brasil,South America,7.446637,4.696970,2.833333,4.700214,4.777778,3.333333,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.30544620752334595, 0.0, 0.0, 0.0,...",1,safety
3,50f5ec39fdc9f065f0008834,-22.835107,-43.378105,Rio De Janeiro,Brasil,South America,7.424242,2.407407,2.083333,3.117976,4.861111,7.777778,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.516967236995697, 0.0, 0.0, 0.0, 0.0939...",1,safety
4,50f5eba4fdc9f065f00083ca,-22.911988,-43.181605,Rio De Janeiro,Brasil,South America,7.333333,5.555556,6.717172,6.377116,3.333333,3.333333,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.32800471782684326, 0.342740535736084, 0.0, ...",1,safety
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
760,50f5eb1cfdc9f065f0007fbe,-22.896153,-43.421212,Rio De Janeiro,Brasil,South America,1.346154,5.000000,1.833333,4.877405,7.671958,0.000000,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.014338985085487366, 0.021999284625053406, 0...",0,not safety
761,50f5ebcffdc9f065f00085ac,-22.820750,-43.403187,Rio De Janeiro,Brasil,South America,1.282051,3.730159,3.789683,5.090239,7.083333,5.833333,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.677...",0,not safety
762,50f5eae9fdc9f065f0007ded,-22.895573,-43.355747,Rio De Janeiro,Brasil,South America,1.250000,4.166667,4.444444,4.944444,5.277778,2.777778,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3025422096252...",0,not safety
763,50f5eb6cfdc9f065f00082ff,-22.809465,-43.384539,Rio De Janeiro,Brasil,South America,1.188811,2.222222,2.888889,2.331349,1.904762,8.333333,/media/felipe/DATA19/datasets/pp2/images/Rio D...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2720453143119...",0,not safety


In [15]:
print(f"Train samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")

Train samples: 2295
Test samples: 765


In [16]:
label_map = dict(zip(train_df["target"], train_df["label"]))
labels = list(label_map.values())
classes = list(label_map.keys())
label_map

{1: 'safety', 0: 'not safety'}

In [17]:
X_train = np.array(train_df["features"].tolist())
X_test = np.array(test_df["features"].tolist())
y_train = np.array(train_df["target"].tolist())
y_test = np.array(test_df["target"].tolist())
X_train.shape, X_test.shape

((2295, 256), (765, 256))

In [18]:
collections.Counter(y_train), collections.Counter(y_test)

(Counter({np.int64(0): 1813, np.int64(1): 482}),
 Counter({np.int64(0): 605, np.int64(1): 160}))

### GridSearch

In [19]:
from py.models.classification.linear import LinearClassifier

In [20]:
model_search = LinearClassifier()

#### Train

In [21]:
%%time
results = model_search.fit_all(X_train, y_train)


Fitting logistic_regression...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

Fitting ridge...
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Fitting linear_svc...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

Fitting svm...
Fitting 5 folds for each of 140 candidates, totalling 700 fits
CPU times: user 10.3 s, sys: 146 ms, total: 10.5 s
Wall time: 19.5 s


In [22]:
model_search.print_results()


GRID SEARCH RESULTS SUMMARY

LOGISTIC_REGRESSION:
  Best CV Score: 0.9821
  Best Parameters:
    classifier__C: 0.1
    classifier__class_weight: balanced
    classifier__penalty: l2

SVM:
  Best CV Score: 0.9808
  Best Parameters:
    classifier__C: 0.01
    classifier__class_weight: None
    classifier__gamma: scale
    classifier__kernel: linear

LINEAR_SVC:
  Best CV Score: 0.9788
  Best Parameters:
    classifier__C: 0.01
    classifier__class_weight: None
    classifier__loss: squared_hinge
    classifier__penalty: l2

RIDGE:
  Best CV Score: 0.9775
  Best Parameters:
    classifier__alpha: 10.0
    classifier__class_weight: balanced

BEST OVERALL MODEL: LOGISTIC_REGRESSION
Best Score: 0.9821


In [23]:
summary_df = model_search.get_results_summary()
summary_df

Unnamed: 0,model,best_score,best_params
0,logistic_regression,0.982097,"{'classifier__C': np.float64(0.1), 'classifier..."
3,svm,0.980821,"{'classifier__C': np.float64(0.01), 'classifie..."
2,linear_svc,0.978831,"{'classifier__C': np.float64(0.01), 'classifie..."
1,ridge,0.97746,"{'classifier__alpha': np.float64(10.0), 'class..."


#### Inference

In [24]:
from sklearn.metrics import classification_report

In [25]:
y_pred = model_search.predict(X_test) # model_name='logistic_regression')

In [26]:
clf_lr_report = classification_report(y_test, y_pred, output_dict=True)

print(classification_report(y_test, y_pred, target_names=labels))

              precision    recall  f1-score   support

      safety       0.81      0.88      0.85       605
  not safety       0.34      0.23      0.27       160

    accuracy                           0.75       765
   macro avg       0.57      0.55      0.56       765
weighted avg       0.71      0.75      0.73       765



In [27]:
confusion_matrix = pd.crosstab(
    [ label_map[k] for k in y_test ],
    [ label_map[k] for k in y_pred ],
    rownames=['Real'],
    colnames=['Prediction']
)
confusion_matrix

Prediction,not safety,safety
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
not safety,534,71
safety,124,36


#### Saving

In [28]:
# Save grid search
model_search.save(f'{MODEL_PATH}model_search.pkl')

Instance saved to /media/felipe/DATA19/models/pp2/linear/model_search.pkl using pickle


In [29]:
# Save just the best trained model
model_search.save_best_model_only(f'{MODEL_PATH}{model_search.get_best_model_name()}_best_model.pkl')

Best model (logistic_regression) saved to /media/felipe/DATA19/models/pp2/linear/logistic_regression_best_model.pkl


#### Loading

In [30]:
loaded_search = LinearClassifier.load(f'{MODEL_PATH}model_search.pkl')
y_pred = loaded_search.predict(X_test)
confusion_matrix = pd.crosstab(
    [ label_map[k] for k in y_test ],
    [ label_map[k] for k in y_pred ],
    rownames=['Real'],
    colnames=['Prediction']
)
confusion_matrix

Instance loaded from /media/felipe/DATA19/models/pp2/linear/model_search.pkl using pickle


Prediction,not safety,safety
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
not safety,534,71
safety,124,36


In [31]:
# Load and use the best model
best_model = LinearClassifier.load_model(f'{MODEL_PATH}{model_search.get_best_model_name()}_best_model.pkl')
y_pred = best_model.predict(X_test)
confusion_matrix = pd.crosstab(
    [ label_map[k] for k in y_test ],
    [ label_map[k] for k in y_pred ],
    rownames=['Real'],
    colnames=['Prediction']
)
confusion_matrix

Model loaded from /media/felipe/DATA19/models/pp2/linear/logistic_regression_best_model.pkl


Prediction,not safety,safety
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
not safety,534,71
safety,124,36
