# Install environment

In [1]:
RANDOM_STATE = 42
COLAB = False # если на колабе то тру соответственно 

In [2]:
if COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    
    !pip install catboost
    !pip install sktime
    !pip install tqdm

In [3]:
import numpy as np
import pandas as pd

from collections import Counter
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sktime.transformations.panel.rocket import MiniRocket
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import RidgeClassifier

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

# Model code

In [4]:
import datetime 
from math import cos, asin, sqrt, pi
from tracks.tracks_preprocessing import Tracks_preprocessing
from nlp.nlp_model import get_model
from tsfresh import select_features

# TODO: добавить фичи по превышению скорости, добавить фичи из нлп гены(он там все настроил уже по идее), натреинровать, затестить

# formula using pythogorian theorem
# (as distances are not large, we can approximate earth rounding)
def get_distance(lat1,lon1,lat2,lon2):
    delta_lat = pow(lat2-lat1,2)
    delta_lon = pow(lon2-lon1,2)
    return np.round(sqrt(delta_lat+delta_lon)*100, 3)    

def get_speed(lat1, lon1, lat2, lon2, dt1: str, dt2: str) -> float:
    distance = get_distance(lat1, lon1, lat2, lon2).tolist()
    format = "%Y-%m-%d %H:%M:%S"
    dt1=datetime.datetime.strptime(dt1, format)
    dt2=datetime.datetime.strptime(dt2, format)
    time = (dt2-dt1).total_seconds()/3600 # convert timedelta into hours
    if time==0:
        return 0
    return distance/time


class Model:
    def __init__(self):        
        self.model = None
        self.model_tracks = None

        self.counter_words = {}
    
    def count_words(self, x):
        return len(x.split(" "))

    def check_sentence(self, sentence, words_type):
        words_count = 0
        for word in sentence.split(" "):
            word = word.lower().replace(',', '').replace('.', '')

            if (word not in list(self.counter_words.keys()) or len(self.counter_words[word]) == 2): continue

            if (words_type == self.counter_words[word][2]): 
                words_count += 1
        return words_count
        
    def NLP_features(self, X: pd.DataFrame ,y: pd.Series):
        # гена там короче все настроил надо проверить тока(получить фичи и засунуть к остальным, натренировать катбуст)
        return get_model()

    def add_features(self, X):
        comment_phrases = list(X.comment.value_counts().index[: 5]) + ["---"]
        
        X["is_comment"] = (~np.isin(X.comment, comment_phrases)).astype(int)
        X['dttm'] = pd.to_datetime(X.dttm)
        X['hour'] = X.dttm.apply(lambda x: x.hour)
        X['traff_jam'] = ((X.hour > 6) & (X.hour < 10)) | ((X.hour > 17) & (X.hour < 23))
        X['traff_jam'] = X.traff_jam.astype(int)
        X['weekday'] = X.dttm.apply(lambda x: x.weekday())
        X['holiday'] = (X.weekday >= 5).astype(int)
        X["count_words"] = [-1] * X.shape[0]
        X.loc[X.is_comment == True, "count_words"] = X[X.is_comment == True].comment.apply(lambda x: self.count_words(x))
        X["speed"] = X.distance / (X.duration / 60)
        X['agg_words'] = X.comment.apply(lambda x: self.check_sentence(x, "aggressive"))
        X['normal_words'] = X.comment.apply(lambda x: self.check_sentence(x, "normal"))
        X['distance_thresh'] = ((X.distance > 5) & (X.distance < 20)).astype(int)
        
        return X
    
    def gen_speed(self, tracks):
        tracks['speed'] = np.zeros(tracks.shape[0])
        for i in tqdm(range(1, len(tracks))):
            tracks.iloc[i, tracks.columns.get_loc('speed')] = get_speed(tracks.iloc[i-1, tracks.columns.get_loc('lat_')], tracks.iloc[i-1, tracks.columns.get_loc('lon_')],
                                        tracks.iloc[i, tracks.columns.get_loc('lat_')], tracks.iloc[i, tracks.columns.get_loc('lon_')], tracks.iloc[i-1, tracks.columns.get_loc('dt')], tracks.iloc[i, tracks.columns.get_loc('dt')])
        return tracks
    
    def estimate(self, X, y):
        return roc_auc_score(y, self.model.predict_proba(X).T[1])
    
    def train_test_split_(self, X, y, test_size, X_ss=None, y_ss=None, random_state=RANDOM_STATE):
        assert X.shape[0] == y.shape[0]
        if (X_ss is not None):
            X_ss_full, y_ss_full = self.label_shuffle(X, y, X_ss, y_ss, random_state = random_state)
            
            len_train = len(X_ss_full) - round(len(X_ss_full) * test_size)
            
            x_train = X_ss_full[: len_train]
            x_train.drop('ss', axis = 1, inplace = True)
            
            x_test = X_ss_full.iloc[len_train + 1:]
            x_test = x_test[x_test.ss == 0]
            x_test.drop('ss', axis = 1, inplace = True)
            
            y_train = y_ss_full[: len_train]
            y_train.drop('ss', axis = 1, inplace = True)
            
            y_test = y_ss_full.iloc[len_train + 1:]
            y_test = y_test[y_test.ss == 0]
            y_test.drop('ss', axis = 1, inplace = True)
            
            return (x_train, x_test, y_train, y_test)
        
        len_train = len(X) - round(len(X) * test_size)
        
        X = X.sample(frac=1, random_state=random_state)
        y = y.sample(frac=1, random_state=random_state)
        
        return (X[: len_train], X[len_train :], y[: len_train], y[len_train :])
    
    def train(self, X_train, X_test, y_train, y_test, categorical_feature, random_state=RANDOM_STATE):
        print(f"Train size: {X_train.shape}")
        print(f"Test size: {X_test.shape}")
        print(f'TRAIN HEAD: \n {X_train.head()}')

        print('y_TRAIN')
        print(y_train.head())
        print('X_TRAIN')
        print(X_train.head())
        non_aggressive = len(y_train)-sum(y_train)
        aggressive = sum(y_train)
        class_weights = (1, int(non_aggressive/aggressive))

        self.model = CatBoostClassifier(iterations=4000,
                           depth=3,
                           silent=False,
                           loss_function='Logloss',
                           class_weights=class_weights,
                           random_state=random_state)
        #self.model.select_features(X_train, y_train,(X_test, y_test), features_for_select = '')
        self.model.fit(X_train, y_train, cat_features=categorical_feature, save_snapshot=True,)
        
        print(y_test.shape)

        return self.estimate(X_test, y_test)
    
    def label_shuffle(self, X, y, X_ss, y_ss, random_state=RANDOM_STATE):
        X_ss['ss'] = 1
        y_ss = y_ss.to_frame()
        y_ss['ss'] = 1

        X['ss'] = 0
        y['ss'] = 0

        X_ss_full = pd.concat([X, X_ss]).sample(frac=1, random_state=random_state)
        y_ss_full = pd.concat([y, y_ss]).sample(frac=1, random_state=random_state)
        
        return (X_ss_full, y_ss_full)
    
    def train_cross_validation(self, X, y, k, categorical_features, X_ss=None, y_ss=None, random_state=RANDOM_STATE):
        chunk_size = len(X) / k
        chunks_size = [(i*chunk_size, i*chunk_size + chunk_size) for i in range(k)]
        
        result_score = []
        
        print(f"Part size: {chunk_size}")
        
        if (X_ss is not None):
            X_ss_full, y_ss_full = self.label_shuffle(X, y, X_ss, y_ss, random_state = random_state)
            
            for chunkIndex in range(len(chunks_size)):
                x_test = X_ss_full[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                y_test = y_ss_full[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                
                x_train = X_ss_full.drop(x_test.index, axis = 0)
                y_train = y_ss_full.drop(y_test.index, axis = 0)
                
                x_test = x_test[x_test.ss == 0]
                y_test = y_test[y_test.ss == 0]
                
                x_train.drop('ss', axis = 1, inplace = True)
                y_train.drop('ss', axis = 1, inplace = True)
                x_test.drop('ss', axis = 1, inplace = True)
                y_test.drop('ss', axis = 1, inplace = True)
                
                score = self.train(x_train, x_test, y_train, y_test, categorical_features)
                
                print(f"Chunk {chunkIndex}; Score: {score}")
                
                result_score.append((chunks_size[chunkIndex], score))
        else:            
            for chunkIndex in range(len(chunks_size)):
                x_test = X[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                y_test = y[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                
                x_train = X.drop(x_test.index, axis = 0)
                y_train = y.drop(y_test.index, axis = 0)
                
                score = self.train(x_train, x_test, y_train, y_test, categorical_features)
                
                print(f"Chunk {chunkIndex}; Score: {score}")
                
                result_score.append((chunks_size[chunkIndex], score))
            
        print(f"Mean score: {sum(list(map(lambda x: x[1], result_score))) / k}")
        
        return result_score
    
    def fit_ss(self, X, y, numeric_features, categorial_features, X_ss, y_ss, cross_validation=False, random_state=RANDOM_STATE):
        self.counter_words = {}
        
        X_ = X
        y_ = y
        
        self.NLP_preprocess(pd.concat([X_, X_ss]), pd.concat([y_, y_ss]))
        X_ = self.add_features(X_)[numeric_features + categorical_features]
        
        X_ss = self.add_features(X_ss)[numeric_features + categorical_features]
        
        if (not cross_validation):
            X_train, X_test, y_train, y_test = self.train_test_split_(X_, y_, test_size=0.2, X_ss=X_ss, y_ss=y_ss, random_state=random_state)
            return self.train(X_train, X_test, y_train, y_test, categorical_features)
        else:
            return self.train_cross_validation(X_, y_, 5, categorical_features, X_ss=X_ss, y_ss=y_ss, random_state=random_state)
        
    def fit(self, X, numeric_features, categorical_features, tracks = None, cross_validation=False, random_state=RANDOM_STATE):
        if tracks is not None:
            preprocessing = Tracks_preprocessing() 
            tracks_train, tracks_y_train = preprocessing.preprocess(tracks)
        
        self.counter_words = {}
        
        X_ = X.set_index('order_id')
        y_ = tracks_y_train
        X_ = self.add_features(X_)[numeric_features + categorical_features]

        features = self.NLP_features(X_, y_)
        X_ = X_.merge(features, on='order_id', how='left')
        print(X_.head())

        print(f"Table data matrix shape: {X_.shape}")
        print(f"Tracks data matrix shape: {tracks_train.shape}")
        # у датасета в виде индекса ордер айди, колонки - фичи заказа
        res_matrix = tracks_train.merge(X_, right_index=True,left_index=True)
        print('RES MATRIX')
        print(res_matrix)
        res_matrix.fillna(0, inplace=True)
        #res_matrix = select_features(res_matrix, y_)
        #res_matrix.drop(['order_id'], axis=1, inplace=True)

        print(f"Result matrix shape: {res_matrix.shape}")

        if (not cross_validation):
            X_train, X_test, y_train, y_test = self.train_test_split_(res_matrix, y_, test_size=0.2, random_state=random_state)
            return self.train(X_train, X_test, y_train, y_test, categorical_features)
        else:
            return self.train_cross_validation(X_, y_, 5, categorical_features, random_state=random_state)

    def predict_proba(self, X, add_feat=True, tracks=None):
        if (add_feat): X = self.add_features(X)
        if tracks is not None:
            # TODO: из инфы про расположение получить инфу про знаки и лимиты, получить фичи по скоростям из feature_extraction
            ...
        
        X = self.add_features(X)[numeric_features + categorical_features + ["order_id"]]

        res_matrix = X.merge(tracks_train, how='left', on='order_id')
        res_matrix.fillna(0, inplace=True)
        res_matrix.drop(['order_id'], axis=1, inplace=True)

        return self.model.predict_proba(res_matrix).T[1]
    
    def predict(self, X, add_feat=True, tracks=None):
        if (add_feat): X = self.add_features(X)
        if tracks is not None:
            tracks_train, order_ids = self.tracks_transform(tracks, labled=False)
            tracks_train['order_id'] = order_ids
            tracks_train = tracks_train.drop_duplicates('order_id', keep='last')
        
        X = self.add_features(X)[numeric_features + categorical_features + ["order_id"]]

        res_matrix = X.merge(tracks_train, how='left', on='order_id')
        res_matrix.fillna(0, inplace=True)
        res_matrix.drop(['order_id'], axis=1, inplace=True)

        return self.model.predict(res_matrix)
    
    def predict_thresh(self, X, thresh_above, thresh_below):
        y_unlab_full = self.predict_proba(X)
        
        y_unlab = pd.Series([-1 for i in range(len(X))])
        
        print("Thresh above: {}".format(sum(y_unlab_full >= thresh_above) / len(y_unlab_full)))
        print("Thresh below: {}".format(sum(y_unlab_full <= thresh_below) / len(y_unlab_full)))
        
        y_unlab.iloc[np.where(y_unlab_full >= thresh_above)] = 1
        y_unlab.iloc[np.where(y_unlab_full <= thresh_below)] = 0
        
        return y_unlab


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/porosenok/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Train on labled data

In [5]:
if COLAB:
    path = '/content/drive/MyDrive/aiijc_transport_simpleteam/'
else:
    path = './'

train_labled = pd.read_csv(path + 'data/base_files/labled_train_data.csv', index_col=0, sep='\t', comment='#')
tracks_labled = pd.read_csv(path+"data/labled_train_tracks_speed.csv", index_col=0, sep=',', comment='#')
tracks_unlabled = pd.read_csv(path +"data/unlabled_train_tracks_speed.csv", index_col=0, sep=',', comment='#')

X_ = train_labled
y_ = train_labled.iloc[:, -1:]

X_['client_rate_ride'] = X_['client_rate_ride'].fillna(X_['client_rate_ride'].mean())
X_['client_rides_cnt'] = X_['client_rides_cnt'].fillna(X_['client_rides_cnt'].mean())
X_['driver_rides_cnt'] = X_['driver_rides_cnt'].fillna(X_['driver_rides_cnt'].mean())

In [6]:
numeric_features = ['distance', 'arrived_distance', 'arrived_duration', 'duration', 'driver_rides_cnt', 'client_rides_cnt', 'client_rate_ride', 'count_words']

categorical_features = ['mark', 'is_comment', 'hour', 'weekday', 'agg_words', 'normal_words']
model_supervised = Model()

model_supervised.fit(X_, numeric_features, categorical_features, tracks=tracks_labled)

                                  speed__fft_coefficient__attr_"abs"__coeff_3  \
001662da857b5a39bb402aacf3145f86                                   175.189443   
001a8da3b83fe77dd0f3a406acb414c2                                    18.445105   
001eaa75ad97eff199d914f67b8e32ef                                   112.883308   
001f7deb75b27d8ef5a68204a0bd86a2                                   156.402613   
002946d95ca1bc6954858b63c202bbb1                                   382.243667   

                                  speed__absolute_sum_of_changes  \
001662da857b5a39bb402aacf3145f86                      651.486763   
001a8da3b83fe77dd0f3a406acb414c2                      307.701308   
001eaa75ad97eff199d914f67b8e32ef                      797.382464   
001f7deb75b27d8ef5a68204a0bd86a2                      303.249206   
002946d95ca1bc6954858b63c202bbb1                     2241.758557   

                                  speed__cid_ce__normalize_False  \
001662da857b5a39bb402aacf3145f86    

TypeError: fit() got an unexpected keyword argument 'categorical'

In [None]:
predictions = model_supervised.predict(X_, add_feat=True, tracks=tracks_labled)

Preprocessing tracks data...
    Begin shape: (395687, 7)


100%|██████████| 9000/9000 [03:51<00:00, 38.90it/s]


    Shape after preprocessing: (9257, 1)
Training MiniRocket...
MiniRocket transforming...
    Shape before transform: (9257, 1)


In [None]:
predictions.sum()

23

In [None]:
model_cv = Model()

model_cv.fit(X_, y_, numeric_features, categorical_features, cross_validation = True)

# Semi-supervised train

## Preprocessing

In [None]:
X_unlab = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/unlabled_train_data.csv', index_col=0, sep='\t', comment='#')
comments_unlabeled = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/unlabled_train_comments.csv', index_col=0, sep='\t', comment='#')
tracks_unlabeled = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/unlabled_train_tracks.csv', index_col=0, sep='\t', comment='#')

X_unlab['client_rate_ride'] = X_unlab['client_rate_ride'].fillna(X_unlab['client_rate_ride'].mean())
X_unlab['client_rides_cnt'] = X_unlab['client_rides_cnt'].fillna(X_unlab['client_rides_cnt'].mean())
X_unlab['driver_rides_cnt'] = X_unlab['driver_rides_cnt'].fillna(X_unlab['driver_rides_cnt'].mean())

In [None]:
sum(X_unlab.comment.isna()) / len(X_unlab.comment)

0.64866629360194

In [None]:
np.where(X_unlab.comment.isna())[0]

array([ 2762,  3239,  3574, ..., 10719, 10720, 10721])

In [None]:
for nanIndex in np.where(X_unlab.comment.isna())[0]:
    obj_comment = comments_unlabeled.loc[nanIndex]
    
    if (len(obj_comment) != 0):
        X_unlab.comment.iloc[nanIndex] = obj_comment.comment

In [None]:
sum(X_unlab.comment.isna()) / len(X_unlab.comment)

0.00018653236336504383

In [None]:
X_unlab.comment.iloc[np.where(X_unlab.comment.isna())[0]] = "---"

## Prediction&filling

In [None]:
y_unlab = model_supervised.predict_thresh(X_unlab, 0.99, 0.001)

Thresh above: 0.003357582540570789
Thresh below: 0.0


In [None]:
y_unlab.name = "is_aggressive"

In [None]:
y_unlab.value_counts()

-1    10686
 1       36
Name: is_aggressive, dtype: int64

In [None]:
X_unlab_lab = X_unlab.iloc[np.where(y_unlab != -1)]
y_unlab_lab = y_unlab.iloc[np.where(y_unlab != -1)]

In [None]:
X_unlab_lab.comment

7        1)Водитель играл в «шашки» на дороге и игрался...
202              2 раза списали деньги, верните пожалуйста
351      засыпал за рулём,  съезжал с дороги, приходило...
551               резкие повороты. водитель резко тормозил
568      Водитель смотрит кино на втором телефоне, проп...
705      Вел медленно по бордовым дорогам, сказал «пешк...
1317     Водитель резко тормозил, обгонял, кричал на др...
1378                             Водитель вел себя грубо. 
1634     Водитель постоянно громко разговаривал на своё...
1667     Водитель в возрасте, несколько раз отвечал на ...
1707     Водитель 2 раза поругался с другими таксистами...
1975     Водитель опасно вел автомобиль. Было ощущение ...
2221     Водитель явно засыпал за рулём, постоянно зева...
2261      Приехала другая машина вместо указанной в заказе
2302                       Водитель неадекватно себя вёл, 
2346     Проехал на красный свет дважды на перекрестках...
2472                      водит опасно и очень неаккурат

In [None]:
X_unlab_lab.is_comment.value_counts()

1    36
Name: is_comment, dtype: int64

## Train

In [None]:
model_semisupervised = Model()

model_semisupervised.fit_ss(X_, y_, numeric_features, categorical_features, X_unlab_lab, y_unlab_lab)

Train size: (7229, 14)
Test size: (1798, 14)


0.8488724212067151

In [None]:
model_ss_cv = Model()

model_ss_cv.fit_ss(X_, y_, numeric_features, categorical_features, X_unlab_lab, y_unlab_lab, cross_validation=True)

Part size: 1800.0
Train size: (7224, 14)
Test size: (1791, 14)
Chunk 0; Score: 0.7396306537625914
Train size: (7228, 14)
Test size: (1797, 14)
Chunk 1; Score: 0.742049078955933
Train size: (7224, 14)
Test size: (1794, 14)
Chunk 2; Score: 0.7889097744360902
Train size: (7225, 14)
Test size: (1790, 14)
Chunk 3; Score: 0.7370420937809273
Train size: (7226, 14)
Test size: (1792, 14)
Chunk 4; Score: 0.85435199720914
Mean score: 0.7723967196289363


[((0.0, 1800.0), 0.7396306537625914),
 ((1800.0, 3600.0), 0.742049078955933),
 ((3600.0, 5400.0), 0.7889097744360902),
 ((5400.0, 7200.0), 0.7370420937809273),
 ((7200.0, 9000.0), 0.85435199720914)]

# Prediction

In [None]:
X_test = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/labled_test_data.csv', index_col=0, sep='\t', comment='#')
tracks_test = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/labled_test_tracks.csv', index_col=0, sep='\t', comment='#')

In [None]:
tracks_test.groupby('order_id').size()

order_id
000d9cf4365ad8be9b559951d0d945c7     12
00287e34dd884a2a69c80346541d2aef     64
00307c7812842b1159781c2c6375944a     41
0061e7abbe5544c40781ba2816b3e026     61
0074b0c828084e05c28035487ad2a130     82
                                   ... 
ff209045501b1f25e8729a96a215a3d2     97
ff4c5997ed87ff37a3c215bab2c0916e     49
ff6873cfaccafec937bbed29e317d3e2     91
ff9745e14cda84a4550b528a8d9aa4de    103
ffd2c55165c42430793423c93211bd46     53
Length: 1272, dtype: int64

In [None]:
result_series = pd.Series(model_supervised.predict(X_test, add_feat=True, tracks=tracks_test))

Preprocessing tracks data...
    Begin shape: (71432, 5)


100%|██████████| 1272/1272 [00:09<00:00, 133.88it/s]


    Shape after preprocessing: (1966, 1)
Training MiniRocket...
MiniRocket transforming...
    Shape before transform: (1966, 1)


In [None]:
result_series.sum()

40

In [None]:
result_series = pd.Series(model_supervised.predict(X_test))
# result_series = pd.Series(model_semisupervised.predict(X_test))
# result_series = pd.Series(np.zeros(X_test.shape[0]))

In [None]:
result_series.shape[0], tracks_result.shape[0]

(1272, 1966)

In [None]:
abs(tracks_result - result_series)

ValueError: ignored

In [None]:
result_series.name = 'is_aggressive'

In [None]:
result_series.sum()

40

In [None]:
result_series.to_csv("result.csv")