In [1]:
#!c1.8
import os
import pickle
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
import sklearn
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate
from sklearn.compose import make_column_transformer

import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" theme for any figures drawn in this notebook.
sns.set_theme(style="whitegrid")

In [2]:
#!c1.8
DATA_DIR = "data"

# Train split: the first CSV column becomes the index and the index name is
# removed by rename_axis(index=None).
covid_data_train = pd.read_csv(
    os.path.join(DATA_DIR, "covid_data_train.csv"), sep=",", index_col=0
).rename_axis(index=None)

# Test split: the first column is kept as a regular "Unnamed: 0" column so it
# can be set aside in `answer` before being removed from the features.
covid_data_test = pd.read_csv(os.path.join(DATA_DIR, "covid_data_test.csv"), sep=",")

# Explicit copy: a new column is assigned into `answer` later in the notebook,
# and writing into a column subset of another frame raises pandas'
# SettingWithCopyWarning.
answer = covid_data_test[["Unnamed: 0"]].copy()
covid_data_test = covid_data_test.drop(columns=["Unnamed: 0"])

In [3]:
#!c1.8
def filter_is_not_nan_targat(dataframe, target_name):
    """Return the rows of ``dataframe`` whose ``target_name`` value is not missing."""
    has_target = dataframe[target_name].notna()
    return dataframe.loc[has_target]

In [4]:
#!c1.8
def drop_defect_rows(dataset, drop=False, unique=False):
    """Remove or merge rows that are flagged as defective by city name.

    Exact duplicate rows are always removed first. The input frame is never
    modified; a new frame is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``name``, ``district`` and ``inf_rate``.
    drop : bool, default False
        If True, every row whose ``name`` is in the hard-coded ``drops`` list
        is removed. If False, the rows of each listed city are collapsed into
        a single row: the first row of that city with ``inf_rate`` replaced by
        the mean ``inf_rate`` over all of its rows. The merged row is appended
        at the end and the index is reset. Cities that do not occur in
        ``dataset`` are skipped.
    unique : bool, default False
        If True, rows matching any hard-coded ``(name, district)`` pair in
        ``uniques`` are removed.

    Returns
    -------
    pandas.DataFrame
        The filtered frame.
    """
    drops = [
        "Алушта", "Евпатория", "Керчь"
    ]

    uniques = [
        ("Белогорск", "Южный"), ("Благовещенск", "Приволжский"),
        ("Гурьевск", "Сибирский"), ("Заречный", "Приволжский"),
        ("Киров", "Приволжский"), ("Красноармейск", "Приволжский"),
        ("Краснослободск", "Приволжский")
    ]

    # drop_duplicates returns a new frame, so the caller's data stays untouched.
    dataset = dataset.drop_duplicates()

    if drop:
        dataset = dataset[~dataset["name"].isin(drops)]
    else:
        for city in drops:
            is_city = dataset["name"] == city
            if not is_city.any():
                # Nothing to merge; indexing the first row would fail here.
                continue
            # iloc[[0]] keeps a one-row DataFrame so column dtypes survive.
            merged = dataset.loc[is_city].iloc[[0]].copy()
            merged["inf_rate"] = dataset.loc[is_city, "inf_rate"].mean()
            # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
            dataset = pd.concat([dataset.loc[~is_city], merged], ignore_index=True)

    if unique:
        for name, district in uniques:
            keep = (dataset["name"] != name) | (dataset["district"] != district)
            dataset = dataset[keep]

    return dataset

In [5]:
#!c1.8
# Keep only rows with a known target, then remove the rows flagged as defective.
covid_data_train = (
    covid_data_train
    .pipe(filter_is_not_nan_targat, "inf_rate")
    .pipe(drop_defect_rows, drop=True, unique=True)
)

In [6]:
#!c1.8
# Remove the `name` and `region_x` columns from both splits.
covid_data_train = covid_data_train.drop(["name", "region_x"], axis="columns")
covid_data_test = covid_data_test.drop(["name", "region_x"], axis="columns")

In [7]:
#!c1.8
# Cast the `has_metro` flag to int in both splits.
covid_data_train = covid_data_train.astype({"has_metro": int})
covid_data_test = covid_data_test.astype({"has_metro": int})

In [8]:
#!c1.8
# Number of missing values per feature column (target excluded), largest first.
nans_count_train = (
    covid_data_train
    .drop(columns=["inf_rate"])
    .isna()
    .sum()
    .sort_values(ascending=False)
)
nans_count_test = (
    covid_data_test
    .drop(columns=["inf_rate"])
    .isna()
    .sum()
    .sort_values(ascending=False)
)

In [9]:
#!c1.8
# Columns that contain at least one missing value in each split.
columns_is_nan_train = nans_count_train.loc[nans_count_train.gt(0)].index
columns_is_nan_test = nans_count_test.loc[nans_count_test.gt(0)].index

# A column is removed from BOTH splits if it has gaps in either of them.
columns_is_nan = np.union1d(columns_is_nan_train, columns_is_nan_test)

clean_covid_data_train = covid_data_train.drop(columns=columns_is_nan)
clean_covid_data_test = covid_data_test.drop(columns=columns_is_nan)

In [10]:
#!c1.8
# Column roles: target, categorical features, and everything else as numeric.
target_column = ["inf_rate"]
categories_columns = ["district", "subject", "has_metro"]
num_columns = list(
    clean_covid_data_train.columns.drop([*categories_columns, *target_column])
)

# Обучение модели

In [11]:
#!c1.8
def predict_mean_subject_inf_rate(model, scaler, covid_data_train, covid_data_test, num_columns, target_column):
    """Fit a subject-level model and predict the target for every subject.

    Both frames are aggregated to one row per ``subject`` (column means).
    ``scaler`` is fitted on the aggregated train features and applied to the
    aggregated test features; ``model`` is then fitted on the aggregated train
    data. Both ``scaler`` and ``model`` are fitted in place.

    Parameters
    ----------
    model : estimator with ``fit(X, y)`` and ``predict(X)``
    scaler : transformer with ``fit_transform(X)`` and ``transform(X)``
    covid_data_train : pandas.DataFrame
        Must contain ``subject``, every column in ``num_columns`` and the
        target column.
    covid_data_test : pandas.DataFrame
        Must contain ``subject`` and every column in ``num_columns``. The
        target column is not required.
    num_columns : list of str
        Numeric feature columns to aggregate.
    target_column : list of str or str
        Name of the target column.

    Returns
    -------
    (pandas.Series, pandas.Series)
        Predictions for the train and test subjects, indexed by subject and
        named ``"inf_rate_subject"``.
    """
    feature_columns = list(num_columns)
    target_columns = [target_column] if isinstance(target_column, str) else list(target_column)

    # groupby().mean() gives per-column means for each subject. The previous
    # .apply(np.mean) depended on np.mean(DataFrame) reducing column-wise,
    # which pandas 2.x changed to a reduction over both axes.
    train_means = covid_data_train.groupby('subject')[feature_columns + target_columns].mean()
    # Only the features are needed for the test split.
    test_means = covid_data_test.groupby('subject')[feature_columns].mean()

    y_train = train_means[target_columns]
    X_train = pd.DataFrame(
        scaler.fit_transform(train_means[feature_columns]),
        index=train_means.index,
        columns=feature_columns,
    )
    X_test = pd.DataFrame(
        scaler.transform(test_means),
        index=test_means.index,
        columns=feature_columns,
    )

    model.fit(X_train, y_train)
    # ravel(): predict may return an (n, 1) array because y_train is a frame.
    y_pred_train = pd.Series(data=np.ravel(model.predict(X_train)), index=X_train.index, name="inf_rate_subject")
    y_pred_test = pd.Series(data=np.ravel(model.predict(X_test)), index=X_test.index, name="inf_rate_subject")

    return y_pred_train, y_pred_test

In [12]:
#!c1.8
# Scaler and ridge regressor used for the per-subject model.
sub_scaler = StandardScaler()
subject_inf_rate_model = Ridge(alpha=0.05)

In [13]:
#!c1.8
# Subject-level predictions, later joined to the rows as an extra feature.
subject_inf_rate_train, subject_inf_rate_test = predict_mean_subject_inf_rate(
    model=subject_inf_rate_model,
    scaler=sub_scaler,
    covid_data_train=covid_data_train,
    covid_data_test=clean_covid_data_test,
    num_columns=num_columns,
    target_column=target_column,
)

In [14]:
#!c1.8
target_train = clean_covid_data_train[target_column]

# Features without the target, plus the subject-level prediction matched to
# each row through its `subject` value.
data_train = (
    clean_covid_data_train
    .drop(columns=target_column)
    .join(subject_inf_rate_train, on='subject')
)
data_test = (
    clean_covid_data_test
    .drop(columns=target_column)
    .join(subject_inf_rate_test, on='subject')
)

In [15]:
#!c1.8
def transform_cat_features(dataset_input, transformer):
    """Replace ``district`` and ``subject`` with the transformer's encoded columns.

    Parameters
    ----------
    dataset_input : pandas.DataFrame
        Must contain the ``district`` and ``subject`` columns. It is not
        modified.
    transformer : fitted transformer
        Must provide ``transform(X)`` and ``get_feature_names_out()``. The
        output of ``transform`` may be a sparse matrix or a dense array.

    Returns
    -------
    pandas.DataFrame
        The remaining input columns followed by the encoded columns, on the
        same index as ``dataset_input``.
    """
    encoded = transformer.transform(dataset_input)
    # The encoded output can be sparse or already dense (ColumnTransformer
    # switches on density), so only densify when the object supports it.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    new_columns = [str(column) for column in transformer.get_feature_names_out()]

    # Build all encoded columns in one frame; inserting them one at a time
    # fragments the DataFrame when there are many categories.
    encoded_frame = pd.DataFrame(
        np.asarray(encoded),
        index=dataset_input.index,
        columns=new_columns,
    )
    remaining = dataset_input.drop(columns=['district', 'subject'])

    return pd.concat([remaining, encoded_frame], axis=1)

In [16]:
#!c1.8
# All (district, subject) rows from both splits, train rows first.
district_subject_dataset = pd.concat(
    [data_train[['district', 'subject']], data_test[['district', 'subject']]]
)

In [17]:
#!c1.8
# Split each frame into its categorical part and the remaining columns.
X_cat_train, X_num_train = (
    data_train[categories_columns],
    data_train.drop(columns=categories_columns),
)
X_cat_test, X_num_test = (
    data_test[categories_columns],
    data_test.drop(columns=categories_columns),
)

y_train = target_train.to_numpy()

In [18]:
#!c1.8
main_scaler = StandardScaler()

# The one-hot encoder is fitted on the categories of both splits combined.
one_hot_columns = make_column_transformer((OneHotEncoder(), ['district', 'subject']))
transformer = one_hot_columns.fit(district_subject_dataset)

# Train: the scaler is fitted here and reused unchanged for the test split.
X_cat_train = transform_cat_features(X_cat_train, transformer=transformer)
X_num_train = main_scaler.fit_transform(X_num_train)
X_train = np.concatenate([X_num_train, X_cat_train.to_numpy()], axis=1)

# Test: same column layout, numeric block first and categorical block second.
X_cat_test = transform_cat_features(X_cat_test, transformer=transformer)
X_num_test = main_scaler.transform(X_num_test)
X_test = np.concatenate([X_num_test, X_cat_test.to_numpy()], axis=1)

In [19]:
#!c1.8
main_model = Ridge(alpha=0.01)

# 30-fold cross-validation; the scorer returns negated MAE, so flip the sign.
cv_score_lin = cross_validate(
    main_model,
    X_train,
    y_train,
    cv=30,
    scoring="neg_mean_absolute_error",
)
mean_abs_error = np.mean(-cv_score_lin['test_score'])
print(f"{mean_abs_error:.6f}")

0.001611


In [20]:
#!c1.8
# Fit on the full train set, then limit the test predictions to [0.5, 5].
main_model.fit(X_train, y_train)
raw_predictions = main_model.predict(X_test)
y_test = raw_predictions.clip(0.5, 5)

In [21]:
#!c1.8
# Range of the clipped test predictions.
np.min(y_test), np.max(y_test)

(0.5, 5.0)

In [22]:
#!c1.8
# Range of the train target, for comparison with the prediction range.
np.min(y_train), np.max(y_train)

(0.6931471805599453, 4.718498871295094)

In [23]:
#!c1.8
# assign() builds a new frame instead of writing into `answer` in place, so
# the cell is safe to re-run and does not raise SettingWithCopyWarning.
# ravel() flattens the predictions, which are 2-D because the model was fitted
# on a one-column target array.
answer = answer.assign(inf_rate=np.ravel(y_test))
answer.to_csv("answer.csv", index=False)

In [24]:
#!c1.8
# Persist every fitted object to its own pickle file in the working directory.
artifacts = {
    'main_model.pickle': main_model,
    'main_scaler.pickle': main_scaler,
    'transformer.pickle': transformer,
    'subject_inf_rate_model.pickle': subject_inf_rate_model,
    'sub_scaler.pickle': sub_scaler,
}

for path, artifact in artifacts.items():
    with open(path, 'wb') as file:
        pickle.dump(artifact, file)