# Imports

In [None]:
import os
import numpy as np
import pandas as pd
import cv2
import matplotlib.pylab as plt
import plotly.graph_objects as go
import plotly.express as px
from sklearn.feature_selection import mutual_info_classif
import warnings


# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation warnings from the libraries imported above.
warnings.filterwarnings("ignore")

# Config, utils

In [None]:
# Folders used by the notebook, all resolved against the current working directory:
# the data root plus its train/test sub-folders.
PATHS = {
    name: os.path.join(os.getcwd(), *parts)
    for name, parts in (
        ('data', ('data',)),
        ('train', ('data', 'train')),
        ('test', ('data', 'test')),
    )
}


def load_datafile_path(file: str) -> str:
    """Return the path of ``file`` inside the data folder (``PATHS['data']``)."""
    data_dir = PATHS['data']
    return os.path.join(data_dir, file)


def load_train_image_path(file: str) -> str:
    """Return the path of ``file`` inside the train folder (``PATHS['train']``)."""
    train_dir = PATHS['train']
    return os.path.join(train_dir, file)


def load_test_image_path(file: str) -> str:
    """Return the path of ``file`` inside the test folder (``PATHS['test']``)."""
    test_dir = PATHS['test']
    return os.path.join(test_dir, file)

In [None]:
# List the entries of the data folder to see which files are available.
os.listdir(PATHS['data'])

# Recognize train data
1. The labels are unevenly distributed across the whole dataset, i.e. the data is imbalanced.
    - label 0 accounts for 50% of the labels
    - label 2 accounts for 38%
    - label 1 accounts for 11%
2. Label 1 appears only after 2012; before that there are only labels 0 and 2.
    - maybe two models are needed, one for each time range?
3. Year, Latitude, Longitude are very weakly correlated with label.

In [None]:
# Read the training table from its Feather file in the data folder and display it.
train_info = pd.read_feather(path=load_datafile_path('train.ftr'))
train_info

In [None]:
# Percentage of rows carrying each label, relative to the total row count.
train_info['label'].value_counts().div(len(train_info)).mul(100).round(2)

In [None]:
# Percentage of rows falling in each year, relative to the total row count.
train_info['year'].value_counts().div(len(train_info)).mul(100).round(2)

In [None]:
# Duplicate the label column: one copy is the grouping key, the other is counted.
train_info['label_copy'] = train_info['label']

# Number of rows per (year, label) pair; the count ends up in 'label_copy'.
# The label is cast to str before plotting — presumably so Plotly colours it
# as discrete categories.
df_graph = (
    train_info
    .groupby(['year', 'label'])['label_copy']
    .count()
    .reset_index()
    .astype({'label': str})
)

# Bar chart of label counts per year, coloured by label.
fig = px.bar(
    df_graph,
    x='year',
    y='label_copy',
    color='label',
    title='Labels distribution in time',
)
fig.show(renderer='notebook')

In [None]:
# Coordinates plus label; the label is cast to str before plotting —
# presumably so Plotly colours it as discrete categories.
df_graph = train_info[['latitude', 'longitude', 'label']].astype({'label': str})

# Scatter of all samples on a longitude (x) / latitude (y) plane, coloured by label.
fig = px.scatter(
    df_graph,
    x='longitude',
    y='latitude',
    color='label',
    title='Labels distribution lat x long',
    width=700,
    height=600,
)
fig.show(renderer='notebook')

In [None]:
# Coordinates, year and label, ordered by year so the facets appear chronologically.
# The label is cast to str before plotting — presumably so Plotly colours it
# as discrete categories.
df_graph = (
    train_info[['latitude', 'longitude', 'year', 'label']]
    .astype({'label': str})
    .sort_values('year')
)

# One scatter panel per year, four panels per row.
# Longitude goes on x and latitude on y, matching the un-faceted map of the
# same data; the axes were previously swapped, which transposed every panel.
fig = px.scatter(
    df_graph,
    x='longitude',
    y='latitude',
    color='label',
    facet_col='year',
    facet_col_wrap=4,
)
fig.update_layout(height=1000)
fig.show()

In [None]:
# Candidate features whose dependence on the label is measured.
cols = ['year', 'latitude', 'longitude']
X = train_info[cols].to_numpy()
y = train_info['label'].to_numpy()

# Mutual information between each feature and the label.
# A fixed random_state makes the scores reproducible: the estimator uses random
# numbers internally, so without a seed the values can change between runs.
mi = mutual_info_classif(X, y, random_state=0)

# Feature name -> score rounded to two decimals.
mutual_dict = {feature: round(m, 2) for feature, m in zip(cols, mi)}

In [None]:
# Display the rounded mutual-information score of each feature.
mutual_dict

In [None]:
# Spearman rank-correlation table of the label and the features, rounded to
# two decimals, with the self-correlation diagonal replaced by '-'.
# Only the diagonal is blanked: comparing values with 1 would also hide any
# off-diagonal coefficient that rounds to 1.0. Using mask() also avoids the
# deprecated DataFrame.applymap.
(
    train_info[['label'] + cols]
    .corr('spearman')
    .round(2)
    .astype(object)  # object dtype so the '-' placeholder can sit next to floats
    .pipe(lambda corr: corr.mask(np.eye(len(corr), dtype=bool), '-'))
)

# Recognize test data
1. The year distribution is similar to that of the train data.

In [None]:
# Read the test table from its Feather file in the data folder and display it.
test_info = pd.read_feather(path=load_datafile_path('test.ftr'))
test_info

In [None]:
# Number of test rows per year as a two-column frame.
df_graph = test_info.value_counts('year').reset_index()
# Name the counts column 'count' when reset_index() labelled it 0.
# NOTE(review): if the installed pandas already names it 'count', this is a no-op — confirm.
df_graph = df_graph.rename(columns={0: 'count'})

# Boolean flag: True for years strictly before 2012, False otherwise.
df_graph['before_2012'] = np.select(
    [df_graph['year'] < 2012],
    [True],
    default=False,
)

# Bar chart of row counts per year, coloured by the before/after-2012 flag.
fig = px.bar(df_graph, x='year', y='count', color='before_2012')
fig.show()