In [1]:
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import train_test_split
pd.options.display.max_columns = None
%matplotlib inline
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from pyod.models.abod import ABOD
from pyod.models.hbos import HBOS
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns 
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier
import itertools
import pickle, gzip
import glob
from sklearn.preprocessing import StandardScaler
from tsfresh.feature_extraction import extract_features
from sklearn.cluster import DBSCAN
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='once')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))
train = pd.read_csv('data/train.csv', index_col=0)
test = pd.read_csv('data/test.csv', index_col=0)
train['isTrain'] = True
test['isTrain'] = False
#data = train.append(test, sort=False)
data = pd.read_csv('data/data_with_geo.csv')
data = data.drop('Unnamed: 0', 1)

In [3]:
counts = data.groupby('address_rus')['id'].count().reset_index().rename(columns={'id':'count'})
counts_cities = data.groupby('geo_city')['id'].count().reset_index().rename(columns={'id':'count_cities'})
counts_regions = data.groupby('region')['id'].count().reset_index().rename(columns={'id':'count_regions'})
data = pd.merge(data, counts, how='left', on='address_rus')
data = pd.merge(data, counts_cities, how='left', on='geo_city')
data = pd.merge(data, counts_regions, how='left', on='region')

In [4]:
R = 6373.0 # радиус земли в километрах

def distance(x,y):
    """
    Параметры
    ----------
    x : tuple, широта и долгота первой геокоординаты 
    y : tuple, широта и долгота второй геокоординаты 
    
    Результат
    ----------
    result : дистанция в километрах между двумя геокоординатами
    """
    lat_a, long_a, lat_b, long_b = map(radians, [*x,*y])    
    dlon = long_b - long_a
    dlat = lat_b - lat_a
    a = sin(dlat/2)**2 + cos(lat_a) * cos(lat_b) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

def bearing_array(x,y):
    lat1, lng1, lat2, lng2 = map(radians, [*x,*y])
    AVG_EARTH_RADIUS = 6371  # in km
    lng_delta_rad = lng2 - lng1
    #lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

In [5]:
knc = KNeighborsClassifier(metric=distance)
dots = data[['lat','long']].dropna()
knc.fit(X=dots , y=np.ones(dots.shape[0]))
distances, indexes = knc.kneighbors(X=dots,n_neighbors=6)
knc = KNeighborsClassifier(metric=bearing_array)
knc.fit(X=dots , y=np.ones(dots.shape[0]))
distancesbear, indexesbear = knc.kneighbors(X=dots,n_neighbors=6)

for i in range(1,6):
    dots['distance_%s'%i] = distances[:,i]
    dots['indexes_%s'%i] = indexes[:,i]
    dots['distance_bear%s'%i] = distancesbear[:,i]
    dots['indexes_bear%s'%i] = indexesbear[:,i]
    dots['log_distance_%s'%i] = np.log(dots['distance_%s'%i]+1)

In [6]:
from sklearn.decomposition import PCA

dots['mean'] = dots.iloc[:,dots.columns.str.contains('distance')].mean(axis=1)
dots['median'] = dots.iloc[:,dots.columns.str.contains('distance')].median(axis=1)
dots['std'] = dots.iloc[:,dots.columns.str.contains('distance')].std(axis=1)
dots_pca = data[['lat','long']].dropna()
pca = PCA(n_components=1)
dots['coords_pca'] = pca.fit_transform(dots_pca)

data = pd.concat([data,dots.drop(['lat', 'long'], 1)], axis=1)

data['city'] = data[~data.address_rus.isnull()].address_rus.apply(lambda x: x.split(',')[2]) 
rare_cities = data.city.value_counts()[(data.city.value_counts() < 20) ==True].index
data['city_type'] = data.city.apply(lambda x: 'RARE' if x in rare_cities else x)
data['street'] = data[~data.address_rus.isnull()].address_rus.apply(lambda x: x.split(',')[0]) 
data['street'] = data['city'].astype(str) + '_' + data['street'].astype('str')
data['city_rank'] = data.city.rank().fillna(-1)
data['city_type_rank'] = data.city_type.rank().fillna(-1)

In [7]:
from sklearn import cluster, mixture              
def cluster_model(newdata, data, model_name, input_param):
    ds = data
    params = input_param
    if str.lower(model_name) == 'kmeans':                                
        cluster_obj = cluster.KMeans(n_clusters=params['n_clusters'])
    if str.lower(model_name) == str.lower('MiniBatchKMeans'):            
        cluster_obj = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    if str.lower(model_name) == str.lower('SpectralClustering'):         
        cluster_obj = cluster.SpectralClustering(n_clusters=params['n_clusters'])
    if str.lower(model_name) == str.lower('MeanShift'):                  
        cluster_obj = cluster.MeanShift(bandwidth=params['bandwidth'])
    if str.lower(model_name) == str.lower('DBSCAN'):                     
        cluster_obj = cluster.DBSCAN(eps=params['eps'])
    if str.lower(model_name) == str.lower('AffinityPropagation'):        
        cluster_obj = cluster.AffinityPropagation(damping=params['damping'], 
                                                  preference=params['preference'])
        cluster_obj.fit(ds)
    if str.lower(model_name) == str.lower('Birch'):                      
        cluster_obj = cluster.Birch(n_clusters=input_param['n_clusters'])
    if str.lower(model_name) == str.lower('GaussianMixture'):            
        cluster_obj = mixture.GaussianMixture(n_components=params['n_clusters'], 
                                              covariance_type='full')
        cluster_obj.fit(ds)
    
    
    if str.lower(model_name) in ['affinitypropagation', 'gaussianmixture']:
        model_result = cluster_obj.predict(ds)
    else:
        model_result = cluster_obj.fit_predict(ds)
    
    newdata[model_name] = pd.DataFrame(model_result)
    
    return(newdata)

cluster_list = ["KMeans", "MiniBatchKMeans", "DBSCAN", "Birch", "MeanShift"]
                #"GaussianMixture",
                #"SpectralClustering", 
                #"AffinityPropagation"]
input_param = {'n_clusters':140, 'bandwidth':0.1, "damping":0.9, "eps":1, 'min_samples':3,
               "preference":-200}
pca_comp = data[['lat','long']].fillna(0)
for i in cluster_list:
    data = cluster_model(data, pca_comp, i, input_param)
# newdata - основной датасет
# pca_comp - датасет для класьеризации

In [8]:
dots = data[['lat','long']].dropna()
db = DBSCAN(eps=1, min_samples=2).fit(dots)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 94
Estimated number of noise points: 42


In [9]:
data_no_nan = data.dropna(subset=['lat', 'long'])
data_no_nan['dbcluster'] = labels
data = pd.merge(data, data_no_nan[['id', 'dbcluster']], how='left', on='id')
data['dbcluster'] = np.where(data.lat.isnull(), -999, data.dbcluster)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
X_forest = data.drop(['address', 'address_rus', 'isTrain', 'target', 'geo_city', 'region', 
                      'city_rank', 'city_type', 'city', 'city_type_rank', 'street'], 1)
X_forest.fillna(0, inplace=True)
alg = IForest(n_estimators=int(np.round(np.sqrt(X_forest.shape[0]),0)) * 20)

if_mdlLst = alg.fit(X_forest)

if_y_pred = if_mdlLst.predict_proba(X_forest)[:, 1]

In [15]:
data['propab_isolation'] = if_y_pred

aggs = {
    'distance_1': ['min', 'max', 'mean', 'median', 'std','skew'],
    'distance_2': ['min', 'max', 'mean', 'median', 'std','skew'],
    'distance_3': ['min', 'max', 'mean', 'median', 'std','skew'],
    'distance_4': ['min', 'max', 'mean', 'median', 'std','skew'],
    'distance_4': ['min', 'max', 'mean', 'median', 'std','skew'],
    'propab_isolation': ['min', 'max', 'mean', 'median', 'std','skew'],
}
agg_df = data.groupby('atm_group').agg(aggs)
new_columns = [
    k + '_' + agg for k in aggs.keys() for agg in aggs[k]
]
agg_df.columns = new_columns
data  = pd.merge(data, agg_df, left_on='atm_group',  right_on='atm_group', how='left') 
fcp = {
        'distance_1': {
            'longest_strike_above_mean': None,
            'longest_strike_below_mean': None,
            'mean_change': None,
            'mean_abs_change': None,
            'absolute_sum_of_changes': None,
            'count_above_mean': None,
            'mean_second_derivative_central': None,
            'kurtosis': None,
            'length': None,
        },
        'distance_2': {
            'longest_strike_above_mean': None,
            'longest_strike_below_mean': None,
            'mean_change': None,
            'mean_abs_change': None,
            'absolute_sum_of_changes': None,
            'count_above_mean': None,
            'mean_second_derivative_central': None,
            'kurtosis': None,
            'length': None},
    'distance_3': {
            'longest_strike_above_mean': None,
            'longest_strike_below_mean': None,
            'mean_change': None,
            'mean_abs_change': None,
            'absolute_sum_of_changes': None,
            'count_above_mean': None,
            'mean_second_derivative_central': None,
            'kurtosis': None,
            'length': None},
    'distance_4': {
            'longest_strike_above_mean': None,
            'longest_strike_below_mean': None,
            'mean_change': None,
            'mean_abs_change': None,
            'absolute_sum_of_changes': None,
            'count_above_mean': None,
            'mean_second_derivative_central': None,
            'kurtosis': None,
            'length': None},
    
}



data.fillna(0, inplace=True)
agg_df_ts = extract_features(data, column_id='atm_group', column_value = 'distance_1', default_fc_parameters = fcp['distance_1'], n_jobs=4)
agg_df_2 = extract_features(data, column_id='atm_group', column_value = 'distance_2', default_fc_parameters = fcp['distance_2'], n_jobs=4)
agg_df_3 = extract_features(data, column_id='atm_group', column_value = 'distance_3', default_fc_parameters = fcp['distance_3'], n_jobs=4)
agg_df_4 = extract_features(data, column_id='atm_group', column_value = 'distance_4', default_fc_parameters = fcp['distance_4'], n_jobs=4)

agg_df_ts = agg_df_ts.reset_index()
agg_df_2 = agg_df_2.reset_index()
agg_df_3 = agg_df_3.reset_index()
agg_df_4 = agg_df_4.reset_index()

data  = pd.merge(data, agg_df_ts, left_on='atm_group',  right_on='id', how='left')
data  = pd.merge(data, agg_df_2, left_on='atm_group',  right_on='id', how='left')
data  = pd.merge(data, agg_df_3, left_on='atm_group',  right_on='id', how='left')
data  = pd.merge(data, agg_df_4, left_on='atm_group',  right_on='id', how='left')



Feature Extraction: 100%|██████████| 7/7 [00:00<00:00, 505.02it/s]
Feature Extraction: 100%|██████████| 7/7 [00:00<00:00, 469.79it/s]
Feature Extraction: 100%|██████████| 7/7 [00:00<00:00, 430.55it/s]
Feature Extraction: 100%|██████████| 7/7 [00:00<00:00, 509.72it/s]


In [16]:
newdata = data.copy()
newdata.to_csv("data/full_data.csv", index = False)

In [17]:
newdata.head()

Unnamed: 0,id_x,atm_group,address,address_rus,lat,long,target,isTrain,city_lat,city_long,geo_city,region,diff_from_centre,count,count_cities,count_regions,distance_1,indexes_1,distance_bear1,indexes_bear1,log_distance_1,distance_2,indexes_2,distance_bear2,indexes_bear2,log_distance_2,distance_3,indexes_3,distance_bear3,indexes_bear3,log_distance_3,distance_4,indexes_4,distance_bear4,indexes_bear4,log_distance_4,distance_5,indexes_5,distance_bear5,indexes_bear5,log_distance_5,mean,median,std,coords_pca,city,city_type,street,city_rank,city_type_rank,KMeans,MiniBatchKMeans,DBSCAN,Birch,MeanShift,dbcluster,propab_isolation,distance_1_min,distance_1_max,distance_1_mean,distance_1_median,distance_1_std,distance_1_skew,distance_2_min,distance_2_max,distance_2_mean,distance_2_median,distance_2_std,distance_2_skew,distance_3_min,distance_3_max,distance_3_mean,distance_3_median,distance_3_std,distance_3_skew,distance_4_min,distance_4_max,distance_4_mean,distance_4_median,distance_4_std,distance_4_skew,propab_isolation_min,propab_isolation_max,propab_isolation_mean,propab_isolation_median,propab_isolation_std,propab_isolation_skew,id_y,distance_1__absolute_sum_of_changes,distance_1__count_above_mean,distance_1__kurtosis,distance_1__length,distance_1__longest_strike_above_mean,distance_1__longest_strike_below_mean,distance_1__mean_abs_change,distance_1__mean_change,distance_1__mean_second_derivative_central,id_x.1,distance_2__absolute_sum_of_changes,distance_2__count_above_mean,distance_2__kurtosis,distance_2__length,distance_2__longest_strike_above_mean,distance_2__longest_strike_below_mean,distance_2__mean_abs_change,distance_2__mean_change,distance_2__mean_second_derivative_central,id_y.1,distance_3__absolute_sum_of_changes,distance_3__count_above_mean,distance_3__kurtosis,distance_3__length,distance_3__longest_strike_above_mean,distance_3__longest_strike_below_mean,distance_3__mean_abs_change,distance_3__mean_change,distance_3__mean_second_derivative_central,id,distance_4__absolute_sum_of_changes,distance_4__count_above_mean,distance_4__kurtosis,distance_4__length,distance_4__longest_strike_above_mean,distance_4__longest_strike_below_mean,distance_4__mean_abs_change,distance_4__mean_change,distance_4__mean_second_derivative_central
0,8526.0,32.0,"EMELYANOVA,34 Y-SAKHALINSK","улица А.О. Емельянова, 34, Южно-Сахалинск, Сах...",46.940995,142.738319,0.0115,True,46.95407,142.73603,Yuzhno-Sakhalinsk,Sakhalin,84.555515,1.0,52.0,62.0,0.171318,8286.0,-31.996624,8071.0,0.15813,0.406053,5775.0,-31.995913,4182.0,0.340786,0.447444,3575.0,-31.990134,5607.0,0.369799,0.46143,2802.0,-31.982927,871.0,0.379416,0.558967,8335.0,-31.98092,7955.0,0.444024,-10.413943,0.340786,15.791868,4.767821e+18,Южно-Сахалинск,Южно-Сахалинск,Южно-Сахалинск_улица А.О. Емельянова,7322.0,5272.0,0,12,0,77,37,0.0,0.346887,0.0,6.29425,0.559075,0.221691,1.038131,3.832853,0.0,6.835024,0.818977,0.447444,1.209491,3.380069,0.08853,16.229077,1.323493,0.638045,2.415692,4.705997,0.141586,31.300282,1.91201,0.727554,4.569964,5.331258,0.062163,0.611065,0.271232,0.308873,0.14026,0.458076,32.0,45.98677,12.0,17.923743,62.0,2.0,14.0,0.753881,-0.002808,-0.006965,32.0,60.297248,15.0,13.140742,62.0,2.0,13.0,0.988479,-0.006657,-0.010902,32.0,100.123898,17.0,26.956353,62.0,2.0,13.0,1.641375,-0.00351,-0.011297,32.0,158.3821,7.0,32.933815,62.0,2.0,21.0,2.596428,0.011343,-0.003517
1,8532.0,32.0,"KOMSOMOLSKAYA,259B Y.SAKHALINSK","Комсомольская улица, 259, Южно-Сахалинск, Саха...",46.937353,142.753348,0.02971,True,46.95407,142.73603,Yuzhno-Sakhalinsk,Sakhalin,152.145602,1.0,52.0,62.0,0.243277,1647.0,-31.988475,8071.0,0.217751,0.649501,416.0,-31.987762,4182.0,0.500473,0.720783,7889.0,-31.981983,5607.0,0.542779,0.721148,6023.0,-31.974777,871.0,0.542992,0.903549,4770.0,-31.972774,7955.0,0.64372,-10.28132,0.500473,15.883565,1.5849e+19,Южно-Сахалинск,Южно-Сахалинск,Южно-Сахалинск_Комсомольская улица,7322.0,5272.0,0,12,0,77,37,0.0,0.360368,0.0,6.29425,0.559075,0.221691,1.038131,3.832853,0.0,6.835024,0.818977,0.447444,1.209491,3.380069,0.08853,16.229077,1.323493,0.638045,2.415692,4.705997,0.141586,31.300282,1.91201,0.727554,4.569964,5.331258,0.062163,0.611065,0.271232,0.308873,0.14026,0.458076,32.0,45.98677,12.0,17.923743,62.0,2.0,14.0,0.753881,-0.002808,-0.006965,32.0,60.297248,15.0,13.140742,62.0,2.0,13.0,0.988479,-0.006657,-0.010902,32.0,100.123898,17.0,26.956353,62.0,2.0,13.0,1.641375,-0.00351,-0.011297,32.0,158.3821,7.0,32.933815,62.0,2.0,21.0,2.596428,0.011343,-0.003517
2,8533.0,32.0,"KOMMUN. PR., 32 YUZHNO SAKHAL","Коммунистический проспект, Южно-Сахалинск, Сах...",46.959413,142.741113,0.00954,True,46.95407,142.73603,Yuzhno-Sakhalinsk,Sakhalin,46.688184,1.0,52.0,62.0,0.221691,4028.0,-31.999341,8071.0,0.200236,0.221691,2397.0,-31.998627,4182.0,0.200236,0.30445,3521.0,-31.992846,5607.0,0.265781,0.339491,1648.0,-31.98564,871.0,0.292289,0.495823,6738.0,-31.98364,7955.0,0.402676,-10.467715,0.221691,15.754367,1.584846e+19,Сахалинская область,RARE,Сахалинская область_Коммунистический проспект,5703.5,6926.5,0,12,0,77,37,0.0,0.342544,0.0,6.29425,0.559075,0.221691,1.038131,3.832853,0.0,6.835024,0.818977,0.447444,1.209491,3.380069,0.08853,16.229077,1.323493,0.638045,2.415692,4.705997,0.141586,31.300282,1.91201,0.727554,4.569964,5.331258,0.062163,0.611065,0.271232,0.308873,0.14026,0.458076,32.0,45.98677,12.0,17.923743,62.0,2.0,14.0,0.753881,-0.002808,-0.006965,32.0,60.297248,15.0,13.140742,62.0,2.0,13.0,0.988479,-0.006657,-0.010902,32.0,100.123898,17.0,26.956353,62.0,2.0,13.0,1.641375,-0.00351,-0.011297,32.0,158.3821,7.0,32.933815,62.0,2.0,21.0,2.596428,0.011343,-0.003517
3,8684.0,32.0,"LENINGRADSKIY PR.,76A MOSCOW","Ленинградский проспект, 76А, Москва, Россия, 1...",55.805827,37.515146,-0.094035,True,55.8,37.51667,Sokol,Moscow,37.814604,1.0,17.0,636.0,0.371956,6172.0,-89.919208,4559.0,0.316237,0.6427,7507.0,-89.610644,5741.0,0.496341,0.701862,6198.0,-89.542079,4182.0,0.531723,0.701862,6852.0,-89.534729,5607.0,0.531723,0.820951,4767.0,-89.482737,8071.0,0.599359,-29.491646,0.496341,44.008171,-5.213109e+18,Москва,Москва,Москва_Ленинградский проспект,3282.5,2212.5,40,121,1,136,0,1.0,0.102683,0.0,6.29425,0.559075,0.221691,1.038131,3.832853,0.0,6.835024,0.818977,0.447444,1.209491,3.380069,0.08853,16.229077,1.323493,0.638045,2.415692,4.705997,0.141586,31.300282,1.91201,0.727554,4.569964,5.331258,0.062163,0.611065,0.271232,0.308873,0.14026,0.458076,32.0,45.98677,12.0,17.923743,62.0,2.0,14.0,0.753881,-0.002808,-0.006965,32.0,60.297248,15.0,13.140742,62.0,2.0,13.0,0.988479,-0.006657,-0.010902,32.0,100.123898,17.0,26.956353,62.0,2.0,13.0,1.641375,-0.00351,-0.011297,32.0,158.3821,7.0,32.933815,62.0,2.0,21.0,2.596428,0.011343,-0.003517
4,37.0,32.0,"GVARDEYSKAYA PL., 2 NORILSK","Гвардейская площадь, 2, Норильск, Красноярский...",69.343541,88.211228,0.079277,True,69.3535,88.2027,Norilsk,Krasnoyarskiy,82.607129,1.0,24.0,370.0,0.199516,4569.0,-81.20216,8071.0,0.181918,0.367687,3417.0,-81.200029,4182.0,0.313121,0.394887,3390.0,-81.189925,5607.0,0.332813,0.610316,2196.0,-81.178211,871.0,0.47643,0.878445,1013.0,-81.176263,7955.0,0.630444,-26.770734,0.313121,39.8307,5.344311e+18,Норильск,Норильск,Норильск_Гвардейская площадь,4385.5,3152.5,21,40,2,10,74,2.0,0.290895,0.0,6.29425,0.559075,0.221691,1.038131,3.832853,0.0,6.835024,0.818977,0.447444,1.209491,3.380069,0.08853,16.229077,1.323493,0.638045,2.415692,4.705997,0.141586,31.300282,1.91201,0.727554,4.569964,5.331258,0.062163,0.611065,0.271232,0.308873,0.14026,0.458076,32.0,45.98677,12.0,17.923743,62.0,2.0,14.0,0.753881,-0.002808,-0.006965,32.0,60.297248,15.0,13.140742,62.0,2.0,13.0,0.988479,-0.006657,-0.010902,32.0,100.123898,17.0,26.956353,62.0,2.0,13.0,1.641375,-0.00351,-0.011297,32.0,158.3821,7.0,32.933815,62.0,2.0,21.0,2.596428,0.011343,-0.003517
