In [None]:
import warnings
warnings.filterwarnings('ignore')

## Data loading

In [None]:
import pandas as pd
import numpy as np

# Folder holding the competition CSV files (the path points at a mounted Drive folder).
DATA_PATH = '/content/drive/MyDrive/work/Цифровой прорыв/data/'
# Full path of the training table; DATA_PATH already ends with a slash.
DATASET = f'{DATA_PATH}covid_data_train.csv'
df = pd.read_csv(DATASET)

## Drop bad values

In [None]:
# Keep only the rows that actually have a target value.
df = df.loc[df['inf_rate'].notna()]

In [None]:
# Index labels of the rows removed as "bad values". The notebook does not record
# why these particular rows were chosen -- TODO: document the selection criterion.
BAD_ROW_LABELS = [
    362, 363, 364,
    40, 39, 38,
    372, 371, 370,
    283, 282, 281,
    120, 119, 118,
    243, 242, 241,
    68,
    230,
]

# A single non-inplace drop keeps the cell idempotent: with errors='ignore',
# re-running it after the rows are already gone is a no-op instead of a KeyError.
df = df.drop(index=BAD_ROW_LABELS, errors='ignore')

## Fillna and Normalize

In [None]:
# Columns whose missing values are replaced with 0 by clean_df.
fillna_zero_cols_list = [
    'ivl_per_100k', 'ivl_number',
    'ekmo_per_100k', 'ekmo_number',
    'epirank_avia', 'epirank_bus', 'epirank_train',
    'epirank_avia_cat', 'epirank_bus_cat', 'epirank_train_cat',
]

# Columns whose missing values are replaced with the column median by clean_df:
# one tuberculosis column per year 1992..2017, followed by the rating columns.
fillna_median_cols_list = [
    f'num_patients_tubercul_{year}' for year in range(1992, 2018)
] + [
    'cleanness',
    'public_services',
    'neighbourhood',
    'children_places',
    'sport_and_outdoor',
    'shops_and_malls',
    'public_transport',
    'security',
    'life_costs',
    'life_quality_place_rating',
    'ecology',
]

# Columns removed entirely by clean_df.
trash_cols_list = ["subject", "has_metro"]

In [None]:
def clean_df(df, fillna_zero_cols_list, fillna_median_cols_list, trash_cols_list):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Args:
        df: DataFrame to clean.
        fillna_zero_cols_list: columns whose missing values become 0.
        fillna_median_cols_list: columns whose missing values become that
            column's median, computed on the frame passed in.
        trash_cols_list: columns to remove from the result.

    Returns:
        A new DataFrame with the fills applied and the trash columns dropped.

    Raises:
        KeyError: if any listed column is absent from ``df``.
    """
    # Work on a copy so the fills below never leak into the caller's frame
    # (and never write into a slice of another DataFrame).
    cleaned = df.copy()

    for col in fillna_zero_cols_list:
        cleaned[col] = cleaned[col].fillna(0)

    for col in fillna_median_cols_list:
        cleaned[col] = cleaned[col].fillna(cleaned[col].median())

    # drop the trash columns
    return cleaned.drop(columns=trash_cols_list)

In [None]:
# Apply the NaN fills and drop the unused columns on the training table.
df = clean_df(
    df,
    fillna_zero_cols_list=fillna_zero_cols_list,
    fillna_median_cols_list=fillna_median_cols_list,
    trash_cols_list=trash_cols_list,
)

## Load test

In [None]:
# Same data folder as the training table; DATASET is re-pointed at the test file.
DATA_PATH = '/content/drive/MyDrive/work/Цифровой прорыв/data/'
DATASET = f'{DATA_PATH}covid_data_test.csv'

# Read the test table and run it through the same cleaning as the training table.
# Medians used for filling are taken from the frame being cleaned.
df_test = clean_df(
    pd.read_csv(DATASET),
    fillna_zero_cols_list,
    fillna_median_cols_list,
    trash_cols_list,
)

## Feature grouping


In [None]:
TARGET_COL = 'inf_rate'
TARGET = [TARGET_COL]

# Float features
LAT_LNG = ['lat', 'lng']
URBAN_RURAL = ['urban', 'rural']  # number of urban and rural residents
HAS_METRO = ['has_metro']
POPULATION = ['population', 'whole_population']
DENSITY = ['density']  # population density

# Ratings
RATINGS = [
    'cleanness', 'public_services', 'neighbourhood',
    'children_places', 'sport_and_outdoor', 'shops_and_malls',
    'public_transport', 'security', 'life_costs',
]

# Mechanical ventilation (IVL) and ECMO: per-100k and absolute columns
VENTILATOR = [
    f'{device}_{measure}'
    for device in ('ivl', 'ekmo')
    for measure in ('per_100k', 'number')
]

# Weather: min / max / std / median of every measured quantity
WEATHER = [
    f'{quantity}_{stat}'
    for quantity in ('avg_temp', 'humidity', 'pressure', 'wind_speed_ms')
    for stat in ('min', 'max', 'std', 'median')
]

# Number of residents by age group: five-year bins 50-54 .. 90-94, urban then rural
RESIDENTS_AGE = [
    f'{area}_{low}-{low + 4}_years'
    for area in ('urban', 'rural')
    for low in range(50, 95, 5)
]

# Employment of the population by age group
POPULATION_WORK = [
    f'work_ratio_{span}_years'
    for span in ('15-72', '55-64', '15-24', '15-64', '25-54')
]

# Historical tuberculosis patient counts, one column per year 1992..2017
HISTORICAL_TUBERCUL = [
    f'num_patients_tubercul_{year}' for year in range(1992, 2018)
]

# Economic activity: 2017 volume of services by kind
ECONOMIC_VALUE = [
    f'volume_serv_{kind}_2017'
    for kind in (
        'household', 'chargeable', 'transport', 'post', 'accommodation',
        'telecom', 'others', 'veterinary', 'housing', 'education',
        'medicine', 'disabled', 'culture', 'sport', 'hotels',
        'tourism', 'sanatorium',
    )
]

# Number of phones, split by rural and urban areas
NUM_PHONES = ['num_phones_rural_2018', 'num_phones_urban_2019']

# Bus passenger turnover on regular routes (thousand passenger-kilometres)
BAS_TRAVEL = ['bus_march_travel_18', 'bus_april_travel_18']

# Environmental safety rating, epirank index: raw values first, then the *_cat variants
ENVIROMENTAL_SAFETY = [
    f'epirank_{mode}{suffix}'
    for suffix in ('', '_cat')
    for mode in ('avia', 'bus', 'train')
]


# Cat features
DISTRICT = ['district']  # district
SUBJECT = ['subject']  # region (federal subject)
TOWN = ['name']  # town
REGION = ['region_x']

In [None]:
added_features = ['income_passengers']

# Model inputs: every numeric group, in this fixed order.
# DISTRICT, REGION and added_features are deliberately left out.
float_features = [
    *LAT_LNG, *URBAN_RURAL, *POPULATION, *DENSITY,
    *RATINGS, *VENTILATOR, *WEATHER, *RESIDENTS_AGE, *POPULATION_WORK,
    *HISTORICAL_TUBERCUL, *ECONOMIC_VALUE, *NUM_PHONES, *BAS_TRAVEL,
    *ENVIROMENTAL_SAFETY,
]

## Validation


In [None]:
from sklearn.model_selection import train_test_split, KFold

# Feature matrix and target taken from the cleaned training table.
X = df.loc[:, float_features]
y = df.loc[:, TARGET_COL]

# Set aside 5% of the rows as a final hold-out; X / y keep the remaining 95%
# and are what the cross-validation below iterates over.
X, X_final_test, y, y_final_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=432,
)

In [None]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.kernel_ridge import KernelRidge
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler

In [None]:

from sklearn.metrics import mean_absolute_error





def run_and_check_models(X_train, X_val, y_train, y_val, random_state=None):
    """Fit a panel of candidate regressors and report each one's validation MAE.

    Missing feature values are replaced with 0 and the features are min-max
    scaled before training; the scaler is fitted on ``X_train`` only and then
    applied to ``X_val``.

    Args:
        X_train: training features (must support ``.fillna``, e.g. a DataFrame).
        X_val: validation features (same requirement as ``X_train``).
        y_train: training target.
        y_val: validation target.
        random_state: optional seed passed to the random-forest and
            decision-tree models so repeated calls give the same scores.
            The default ``None`` leaves those models unseeded.

    Returns:
        dict mapping model name to its mean absolute error on the validation
        data. The extra key ``'final_model'`` holds the MAE of the average of
        an unlimited-depth decision tree and a 1-nearest-neighbour (p=1) model.
    """
    candidates = {
        'lasso': Lasso(alpha=0.005),
        'knn5': KNeighborsRegressor(n_neighbors=5),
        'knn3': KNeighborsRegressor(n_neighbors=3),
        'knn2': KNeighborsRegressor(n_neighbors=2),
        'knn7': KNeighborsRegressor(n_neighbors=7),
        'knn1': KNeighborsRegressor(n_neighbors=1),
        'knn1_p1': KNeighborsRegressor(n_neighbors=1, p=1),
        'knn1_p3': KNeighborsRegressor(n_neighbors=1, p=3),
        'knn1_p4': KNeighborsRegressor(n_neighbors=1, p=4),
        'rf': RandomForestRegressor(random_state=random_state),
        # NOTE: the key says 5 but the depth is 4; the key is kept so the
        # result columns stay the same.
        'rf5': RandomForestRegressor(max_depth=4, random_state=random_state),
        'tree9': DecisionTreeRegressor(max_depth=9, random_state=random_state),
        'tree7': DecisionTreeRegressor(max_depth=7, random_state=random_state),
        'tree': DecisionTreeRegressor(max_depth=None, random_state=random_state),
        'tree16': DecisionTreeRegressor(max_depth=16, random_state=random_state),
    }

    # Scale with statistics of the training part only, then reuse them on validation.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train.fillna(0))
    X_val = scaler.transform(X_val.fillna(0))

    run_results = {}
    for model_name, model in candidates.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        run_results[model_name] = mean_absolute_error(y_val, preds)

    # Ensemble used for the submission: plain average of a full-depth tree and 1-NN (p=1).
    final_model1 = DecisionTreeRegressor(max_depth=None, random_state=random_state)
    final_model2 = KNeighborsRegressor(n_neighbors=1, p=1)
    final_model1.fit(X_train, y_train)
    final_model2.fit(X_train, y_train)
    preds1 = final_model1.predict(X_val)
    preds2 = final_model2.predict(X_val).reshape(len(X_val))
    preds = (preds1 + preds2) / 2
    run_results['final_model'] = mean_absolute_error(y_val, preds)

    return run_results

In [None]:
NUM_KFOLD_VALIDATIONS = 3
N_SPLITS = 10

# Repeated K-fold: NUM_KFOLD_VALIDATIONS shuffled passes, each seeded with its
# own pass number, each producing N_SPLITS train/validation partitions.
# Every partition contributes one row of per-model MAE values.
fold_scores = []
for random_state in range(NUM_KFOLD_VALIDATIONS):
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=random_state)
    for train_index, test_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        fold_scores.append(run_and_check_models(X_train, X_val, y_train, y_val))

# One row per fold, one column per model.
results = pd.DataFrame(fold_scores)

In [None]:
# Average MAE of every model over all folds of all passes.
results.mean()

lasso          0.320318
knn5           0.168931
knn3           0.095217
knn2           0.056388
knn7           0.221536
knn1           0.034601
knn1_p1        0.021086
knn1_p3        0.043308
knn1_p4        0.055824
rf             0.055663
rf5            0.200800
tree9          0.033550
tree7          0.065484
tree           0.034201
tree16         0.031898
final_model    0.024049
dtype: float64

## Final check MAE

In [None]:
# Train on the cross-validation part and score every model on the 5% hold-out.
run_and_check_models(
    X_train=X,
    X_val=X_final_test,
    y_train=y,
    y_val=y_final_test,
)

{'final_model': 0.046706154905320346,
 'knn1': 0.0467061549053203,
 'knn1_p1': 0.0467061549053203,
 'knn1_p3': 0.12902526601298306,
 'knn1_p4': 0.12902526601298306,
 'knn2': 0.08251752149523159,
 'knn3': 0.0901343597884431,
 'knn5': 0.15975723924493984,
 'knn7': 0.21483030146994656,
 'lasso': 0.38737993215999017,
 'rf': 0.08363131359313727,
 'rf5': 0.22805078285151503,
 'tree': 0.04670615490532052,
 'tree16': 0.04670615490532052,
 'tree7': 0.09320578452080512,
 'tree9': 0.04670615490532047}

## Final test

In [None]:
np.random.seed(42)
from sklearn.model_selection import train_test_split, KFold

# Refit on every labelled row and predict the competition test set.
# Missing values are filled with 0 exactly as in run_and_check_models, so the
# submitted model sees the same preprocessing that was validated (and the
# estimators never receive NaN).
X = df[float_features].fillna(0)
X_test = df_test[float_features].fillna(0)
y = df[TARGET_COL]

# Scaler statistics come from the training rows only.
final_scaler = MinMaxScaler()
X = final_scaler.fit_transform(X)
X_test = final_scaler.transform(X_test)

# Same ensemble as 'final_model' in validation: full-depth tree + 1-NN (p=1), averaged.
final_model1 = DecisionTreeRegressor(max_depth=None, random_state=42)
final_model2 = KNeighborsRegressor(n_neighbors=1, p=1)
final_model1.fit(X, y)
final_model2.fit(X, y)
preds1 = final_model1.predict(X_test)
preds2 = final_model2.predict(X_test).reshape(len(X_test))
preds = (preds1 + preds2) / 2

# Store the prediction next to the test rows; the export cell reads it from here.
df_test['inf_rate'] = preds


In [None]:
# Write the row id and predicted inf_rate to the submission file, without the index.
df_test.loc[:, ['Unnamed: 0', 'inf_rate']].to_csv('pygeeks.csv', index=False)

In [None]:
# Show the two submitted columns as a sanity check.
df_test.loc[:, ['Unnamed: 0', 'inf_rate']]

Unnamed: 0.1,Unnamed: 0,inf_rate
0,451,2.708050
1,452,0.693147
2,453,0.693147
3,454,1.609438
4,455,1.386294
...,...,...
415,899,1.791759
416,900,2.564949
417,901,0.693147
418,902,1.386294
