In [None]:
import os, sys
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import shutil
import collections
from pathlib import Path

In [None]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# libraries used further down; narrow the filter if those become relevant.
warnings.filterwarnings('ignore')

### Loading packages

In [None]:
import sys
from pathlib import Path

# The repository root is taken to be the second parent of the current working
# directory (i.e. the notebook is expected to run two levels below the root).
here_path = Path().resolve()
repo_path = here_path.parents[1]

# Make the project's `py` package importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry each time.
if str(repo_path) not in sys.path:
    sys.path.append(str(repo_path))

In [None]:
from py.utils import verifyDir, verifyFile, verifyType

In [None]:
from py.config import Config

# Single settings object; every path and experiment option below is read from it.
cfg = Config()

# Seed NumPy's legacy global RNG with the configured random state.
# NOTE(review): only NumPy's global generator is seeded here; whether the
# models trained below draw from it is not visible in this notebook -- confirm.
np.random.seed(cfg.RANDOM_STATE)
# Echo the two root locations used to build the paths in the next cell.
cfg.DATA_PATH, cfg.MODEL_PATH

In [None]:
# Input locations (under cfg.DATA_PATH): perception scores and raw images.
QSCORE_PATH   = f"{cfg.DATA_PATH}pp2/{cfg.SCORING_METHOD}/{cfg.PLACE_LEVEL}/"
IMAGES_PATH   = f"{cfg.DATA_PATH}pp2/images/"

# Artifact locations (under cfg.MODEL_PATH): extracted features are read from
# FEATURES_PATH, trained linear models are written to MODEL_PATH.
FEATURES_PATH = f"{cfg.MODEL_PATH}pp2/features/{cfg.SCORING_METHOD}/{cfg.PLACE_LEVEL}/"
MODEL_PATH    = f"{cfg.MODEL_PATH}pp2/linear/{cfg.SCORING_METHOD}/{cfg.PLACE_LEVEL}/"

In [None]:
# Check the model output directory before anything is saved into it.
# NOTE(review): presumably verifyDir creates the directory when it is missing --
# confirm against py.utils.
verifyDir(MODEL_PATH)

### Loading data

In [None]:
%%time
import pickle

# Load the pre-extracted features for the configured feature extractor.
# The resulting object is indexed with "train" and "test" in the next cells.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load feature files produced by this project's own pipeline.
with open(f"{FEATURES_PATH}{cfg.MODEL_FEATURE_NAME}_features.pkl", "rb") as f:
    features_dict = pickle.load(f)

In [None]:
# Training-split features as a frame; image ids are converted to strings so
# they compare equal to the string ids of the scores table when merging.
features_train_df = (
    pd.DataFrame(data=features_dict["train"])
    .assign(image_id=lambda frame: frame["image_id"].apply(str))
)
features_train_df

In [None]:
# Test-split features as a frame; image ids are converted to strings so they
# compare equal to the string ids of the scores table when merging.
features_test_df = (
    pd.DataFrame(data=features_dict["test"])
    .assign(image_id=lambda frame: frame["image_id"].apply(str))
)
features_test_df

In [None]:
%%time
data_df = pd.read_csv(f"{QSCORE_PATH}scores.csv", sep=";", low_memory=False)
data_df["image_path"] = f"{IMAGES_PATH}" + data_df["image_path"]
data_df["image_id"] = data_df["image_id"].apply(str)
data_df.sort_values(by=[cfg.PERCEPTION_METRIC], ascending=False, inplace=True)
data_df

In [None]:
# Keep only the scored images that also have training features; rows are
# matched on both the image id and the full image path.
train_df = data_df.merge(
    features_train_df,
    how="inner",
    on=["image_id", "image_path"],
)
train_df

In [None]:
# Keep only the scored images that also have test features; rows are matched
# on both the image id and the full image path.
test_df = data_df.merge(
    features_test_df,
    how="inner",
    on=["image_id", "image_path"],
)
test_df

In [None]:
# Report how many rows survived the inner joins for each split.
print(
    f"Train samples: {len(train_df)}",
    f"Test samples: {len(test_df)}",
    sep="\n",
)

In [None]:
# Map each class id ("target") to its display name ("label").
#
# The pairs are sorted by class id so that `classes` and `labels` come out in
# ascending class order. Without the sort the order follows the row order of
# train_df (which derives from the score-sorted data_df), and `labels` would
# not line up with scikit-learn's sorted class order when it is later passed
# as `target_names`. The sort is stable, so if a target ever occurred with
# more than one label the last occurrence still wins, exactly as before.
label_map = dict(
    sorted(zip(train_df["target"], train_df["label"]), key=lambda pair: pair[0])
)
labels = list(label_map.values())
classes = list(label_map.keys())
label_map

In [None]:
# Training split: feature matrix and class-id vector.
X_train = np.asarray(train_df["features"].tolist())
y_train = np.asarray(train_df["target"].tolist())

# Test split: feature matrix and class-id vector.
X_test = np.asarray(test_df["features"].tolist())
y_test = np.asarray(test_df["target"].tolist())

# Show both matrix shapes as a quick sanity check.
X_train.shape, X_test.shape

In [None]:
# Per-class sample counts for the train and test targets (class balance check).
tuple(collections.Counter(split_targets) for split_targets in (y_train, y_test))

### GridSearch

In [None]:
from py.models.classification.linear import LinearClassifier

In [None]:
# Project wrapper used below to fit (`fit_all`), summarise, predict with, and
# persist the linear classifiers; constructed with its default settings.
model_search = LinearClassifier()

#### Train

In [None]:
%%time
# Fit on the training split and keep whatever fit_all returns for inspection.
# NOTE(review): presumably this runs the grid search over every candidate
# linear model -- confirm in py.models.classification.linear.
results = model_search.fit_all(X_train, y_train)

In [None]:
# Print the search results collected by fit_all.
model_search.print_results()

In [None]:
# Results summary from the search object, displayed below.
# NOTE(review): the `_df` name suggests a DataFrame; the return type is not
# visible from this notebook.
summary_df = model_search.get_results_summary()
summary_df

#### Inference

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Predict class ids for the test split. No model is named here, so the search
# object's default choice is used.
# NOTE(review): presumably that default is the best model, and a specific one
# can be selected with e.g. model_name='logistic_regression' -- confirm.
y_pred = model_search.predict(X_test)

In [None]:
# Machine-readable report, keyed by class id, kept for later inspection.
clf_lr_report = classification_report(y_test, y_pred, output_dict=True)

# Human-readable report. `target_names` is matched to classes purely by
# position, so the class order is passed explicitly and the names are derived
# from that same order. This guarantees every row carries the name of its own
# class regardless of the insertion order of `label_map`.
report_classes = sorted(label_map)
report_names = [label_map[class_id] for class_id in report_classes]
print(
    classification_report(
        y_test,
        y_pred,
        labels=report_classes,
        target_names=report_names,
    )
)

In [None]:
# Confusion matrix of the in-memory search object: true class names as rows,
# predicted class names as columns.
real_names = [label_map[class_id] for class_id in y_test]
predicted_names = [label_map[class_id] for class_id in y_pred]
confusion_matrix = pd.crosstab(
    index=real_names,
    columns=predicted_names,
    rownames=['Real'],
    colnames=['Prediction'],
)
confusion_matrix

#### Saving

In [None]:
# Save grid search
# Persist the search object under MODEL_PATH, named after the feature
# extractor. The "Loading" section reads this same file back.
model_search.save(f'{MODEL_PATH}{cfg.MODEL_FEATURE_NAME}_model_search.pkl')

In [None]:
# Save just the best trained model
# The file name combines the best model's name with the feature extractor
# name; the last cell rebuilds the same name to load it back.
model_search.save_best_model_only(f'{MODEL_PATH}{model_search.get_best_model_name()}_{cfg.MODEL_FEATURE_NAME}_best_model.pkl')

#### Loading

In [None]:
# Round-trip check: reload the saved search object, predict on the test split
# again and rebuild the confusion matrix for comparison with the one above.
# Note that `y_pred` and `confusion_matrix` are overwritten here.
search_file = f'{MODEL_PATH}{cfg.MODEL_FEATURE_NAME}_model_search.pkl'
loaded_search = LinearClassifier.load(search_file)
y_pred = loaded_search.predict(X_test)

real_names = [label_map[class_id] for class_id in y_test]
predicted_names = [label_map[class_id] for class_id in y_pred]
confusion_matrix = pd.crosstab(
    index=real_names,
    columns=predicted_names,
    rownames=['Real'],
    colnames=['Prediction'],
)
confusion_matrix

In [None]:
# Load and use the best model
# The file name is rebuilt from the in-memory search object's best-model name,
# so this cell needs `model_search` from the training section to be defined.
# Note that `y_pred` and `confusion_matrix` are overwritten here.
best_model_file = f'{MODEL_PATH}{model_search.get_best_model_name()}_{cfg.MODEL_FEATURE_NAME}_best_model.pkl'
best_model = LinearClassifier.load_model(best_model_file)
y_pred = best_model.predict(X_test)

real_names = [label_map[class_id] for class_id in y_test]
predicted_names = [label_map[class_id] for class_id in y_pred]
confusion_matrix = pd.crosstab(
    index=real_names,
    columns=predicted_names,
    rownames=['Real'],
    colnames=['Prediction'],
)
confusion_matrix