# Install environment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install catboost
!pip install sktime
!pip install tqdm

Collecting catboost
  Downloading catboost-0.26.1-cp37-none-manylinux1_x86_64.whl (67.4 MB)
[K     |████████████████████████████████| 67.4 MB 64 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26.1
Collecting sktime
  Downloading sktime-0.7.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.4 MB/s 
Collecting scikit-learn>=0.24.0
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.1 MB/s 
Collecting statsmodels>=0.12.1
  Downloading statsmodels-0.12.2-cp37-cp37m-manylinux1_x86_64.whl (9.5 MB)
[K     |████████████████████████████████| 9.5 MB 59.8 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, statsmodels, scikit-learn, sktime
  Attempting uninstall: statsmodels
    Found existing installation: statsmodels 0.10.

In [3]:
import numpy as np
import pandas as pd

from collections import Counter
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sktime.transformations.panel.rocket import MiniRocket
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import RidgeClassifier

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")



In [4]:
RANDOM_STATE = 42

# Speed tracks

## Dowload and add <code>aggressive</code> column

In [None]:
speed_tracks = pd.read_csv("/content/drive/MyDrive/aiijc_transport_simpleteam/data/labled_train_tracks_speed.csv")

In [None]:
speed_tracks

In [None]:
speed_tracks.speed.mean()

In [None]:
speed_tracks.groupby('order_id').speed.mean()

In [None]:
(speed_tracks.speed == 0).sum() / speed_tracks.shape[0]

In [None]:
from tqdm.notebook import tqdm

speed_tracks['is_aggressive'] = np.zeros(speed_tracks.shape[0])

for obj in tqdm(train_labeled.values):
    speed_tracks["is_aggressive"].loc[(speed_tracks.order_id == obj[0])] = obj[-1]

In [None]:
speed_tracks.to_csv("labled_train_tracks_speed.csv")
!cp labled_train_tracks_speed.csv /content/drive/MyDrive/aiijc_transport_simpleteam/data/

## Tracks model

In [None]:
tracks_labled = pd.read_csv("/content/drive/MyDrive/aiijc_transport_simpleteam/data/labled_train_tracks_speed.csv", index_col=0, sep=',', comment='#')

In [None]:
from functools import lru_cache

# @lru_cache(maxsize=None)
def split(arr, chunk_size = 15):
    result = []
    # get right length of arr so that it equally splits into chunks
    length = len(arr)
    split_length = length - (length%chunk_size)

    for i in range(split_length)[chunk_size::chunk_size]:
        result.append(arr[i-chunk_size:i])

    return np.array(result)

train_labels = []
X_train = []

for order in tracks_labled['order_id'].unique():
    order_df = tracks_labled[tracks_labled['order_id']==order]
    order_df.loc[0,'speed']=0
    
    chunk_size = 15
    if order_df.shape[0]<chunk_size:
        continue

    splitted_arrs = split(order_df.values, chunk_size)
    for arr in splitted_arrs:
        is_aggressive = arr[0][7]
        train_labels.append(is_aggressive)
        speed_series = []
        for row in arr:  
            # append only speed and dt values 
            speed_series.append(row[6])
        X_train.append(pd.Series(speed_series))

X_train = pd.DataFrame({'speed':X_train})
y_train = np.array(train_labels)

TypeError: ignored

In [None]:
X_train

Unnamed: 0,speed
0,0 0.000000 1 5.906441 2 15.69600...
1,0 7.432941 1 3.720000 2 8.04600...
2,0 -0.091778 1 3.325714 2 2.92800...
3,0 36.148235 1 173.160000 2 57.96...
4,0 0.024739 1 4.012500 2 9.07826...
...,...
15207,0 92.244706 1 104.805000 2 112.82...
15208,0 103.214118 1 107.100000 2 96.22...
15209,0 104.602500 1 89.301176 2 97.51...
15210,0 -0.035823 1 8.820000 2 0.86087...


In [None]:
y_train

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
from sklearn.model_selection import KFold  
from sklearn.pipeline import make_pipeline
from sktime.transformations.panel.rocket import MiniRocket
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifierCV
from sklearn.semi_supervised import SelfTrainingClassifier

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

rocket = MiniRocket()
rocket.fit(X_train, y_train)

X_train_transform = rocket.transform(X_train,y_train)

classifier = RidgeClassifierCV(alphas = np.logspace(-3, 3, 10), normalize = True)
classifier.fit(X_train_transform, y_train)

X_test_transform = rocket.transform(X_test)
classifier.score(X_test_transform, y_test)

0.9368525896414343

# Model code

In [403]:
import datetime 
from math import cos, asin, sqrt, pi

# Haversine formula for calculating distances between two points
def get_distance(lat1, lon1, lat2, lon2):
    r = 6371
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) *   np.sin(delta_lambda / 2)**2
    res = r * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))
    return np.round(res, 4)
    

def get_speed(lat1, lon1, lat2, lon2, dt1: str, dt2: str) -> float:
    distance = get_distance(lat1, lon1, lat2, lon2).tolist()
    format = "%Y-%m-%d %H:%M:%S"
    dt1=datetime.datetime.strptime(dt1, format)
    dt2=datetime.datetime.strptime(dt2, format)
    time = (dt2-dt1).total_seconds()/3600 # convert timedelta into hours
    if time==0:
        return 0
    return distance/time


class Model:
    def __init__(self):        
        self.model = None
        self.model_tracks = None
        self.model_rocket = None

        self.counter_words = {}

        self.TRACKS_CHUNK_SIZE = 20
        self.TRACKS_MULTIPLIER = 1
    
    def count_words(self, x):
        return len(x.split(" "))

    def check_sentence(self, sentence, words_type):
        words_count = 0
        for word in sentence.split(" "):
            word = word.lower().replace(',', '').replace('.', '')

            if (word not in list(self.counter_words.keys()) or len(self.counter_words[word]) == 2): continue

            if (words_type == self.counter_words[word][2]): 
                words_count += 1
        return words_count
        
    def NLP_preprocess(self, X ,y):
        dataset_joined = X.join(y)
        comment_phrases = list(dataset_joined.comment.value_counts().index[: 10])
        
        dataset_joined['is_comment'] = (~np.isin(dataset_joined.comment, comment_phrases)).astype(int)
        
        aggressive_comments = dataset_joined[(dataset_joined['is_comment'] == True) & (dataset_joined.is_aggressive == True)].comment.values
        normal_comments = dataset_joined[(dataset_joined['is_comment'] == True) & (dataset_joined.is_aggressive == False)].comment.values
        
        stop_words = ['на', 'по', 'с', 'в', 'что', 'и', 'а']

        for sentence in normal_comments:
            for word in sentence.split(" "):
                word = word.lower().replace(',', '').replace('.', '')
                if (word in stop_words): continue
                if (word in self.counter_words.keys()):
                    self.counter_words[word][0] += 1
                else: self.counter_words[word] = [1, 0]

        for sentence in aggressive_comments:
            for word in sentence.split(" "):
                word = word.lower().replace(',', '').replace('.', '')
                if (word in stop_words): continue
                if (word in self.counter_words.keys()):
                    self.counter_words[word][1] += 1
                else: self.counter_words[word] = [0, 1]
        
        
        count_all_words = np.array(list(map(lambda x: np.array(x), np.array(list(self.counter_words.items())).T[1]))).T
        
        count_normal_words = count_all_words[0].sum()
        count_aggressive_words = count_all_words[1].sum()

        for word_pair in list(self.counter_words.items()):
            if (word_pair[1][1] == 0 and word_pair[1][0] > 0):
                self.counter_words[word_pair[0]].append("normal")
                continue

            if (word_pair[1][0] == 0 and word_pair[1][1] > 0):
                self.counter_words[word_pair[0]].append("aggressive")
                continue

            ratio_aggressive = word_pair[1][1] / count_aggressive_words
            ratio_normal = word_pair[1][0] / count_normal_words

            if (ratio_aggressive / ratio_normal >= 3):
                self.counter_words[word_pair[0]].append("aggressive")
                continue

            if (ratio_normal / ratio_aggressive >= 3):
                self.counter_words[word_pair[0]].append("normal")
                continue

            self.counter_words[word_pair[0]].append("neutral")

    def add_features(self, X):
        comment_phrases = list(X.comment.value_counts().index[: 5]) + ["---"]
        
        X["is_comment"] = (~np.isin(X.comment, comment_phrases)).astype(int)
        X['dttm'] = pd.to_datetime(X.dttm)
        X['hour'] = X.dttm.apply(lambda x: x.hour)
        X['traff_jam'] = ((X.hour > 6) & (X.hour < 10)) | ((X.hour > 17) & (X.hour < 23))
        X['traff_jam'] = X.traff_jam.astype(int)
        X['weekday'] = X.dttm.apply(lambda x: x.weekday())
        X['holiday'] = (X.weekday >= 5).astype(int)
        X["count_words"] = [-1] * X.shape[0]
        X.loc[X.is_comment == True, "count_words"] = X[X.is_comment == True].comment.apply(lambda x: self.count_words(x))
        X["speed"] = X.distance / (X.duration / 60)
        X['agg_words'] = X.comment.apply(lambda x: self.check_sentence(x, "aggressive"))
        X['normal_words'] = X.comment.apply(lambda x: self.check_sentence(x, "normal"))
        X['distance_thresh'] = ((X.distance > 5) & (X.distance < 20)).astype(int)
        
        return X
    
    def gen_speed(self, tracks):
        tracks['speed'] = np.zeros(tracks.shape[0])
        for i in tqdm(range(1, len(tracks))):
            tracks.iloc[i, tracks.columns.get_loc('speed')] = get_speed(tracks.iloc[i-1, tracks.columns.get_loc('lat_')], tracks.iloc[i-1, tracks.columns.get_loc('lon_')],
                                        tracks.iloc[i, tracks.columns.get_loc('lat_')], tracks.iloc[i, tracks.columns.get_loc('lon_')], tracks.iloc[i-1, tracks.columns.get_loc('dt')], tracks.iloc[i, tracks.columns.get_loc('dt')])
        return tracks
    
    def estimate(self, X, y):
        return roc_auc_score(y, self.model.predict_proba(X).T[1])
    
    def train_test_split_(self, X, y, test_size, X_ss=None, y_ss=None, random_state=RANDOM_STATE):
        assert X.shape[0] == y.shape[0]
        if (X_ss is not None):
            X_ss_full, y_ss_full = self.label_shuffle(X, y, X_ss, y_ss, random_state = random_state)
            
            len_train = len(X_ss_full) - round(len(X_ss_full) * test_size)
            
            x_train = X_ss_full[: len_train]
            x_train.drop('ss', axis = 1, inplace = True)
            
            x_test = X_ss_full.iloc[len_train + 1:]
            x_test = x_test[x_test.ss == 0]
            x_test.drop('ss', axis = 1, inplace = True)
            
            y_train = y_ss_full[: len_train]
            y_train.drop('ss', axis = 1, inplace = True)
            
            y_test = y_ss_full.iloc[len_train + 1:]
            y_test = y_test[y_test.ss == 0]
            y_test.drop('ss', axis = 1, inplace = True)
            
            return (x_train, x_test, y_train, y_test)
        
        len_train = len(X) - round(len(X) * test_size)
        
        X = X.sample(frac=1, random_state=random_state)
        y = y.sample(frac=1, random_state=random_state)
        
        return (X[: len_train], X[len_train :], y[: len_train], y[len_train :])
    
    def train(self, X_train, X_test, y_train, y_test, categorical_feature, random_state=RANDOM_STATE):
        print(f"Train size: {X_train.shape}")
        print(f"Test size: {X_test.shape}")
        self.model = CatBoostClassifier(iterations=4000,
                           depth=2,
                           silent=False,
                           loss_function='Logloss',
                           class_weights=(1, 5),
                           random_state=random_state)

        self.model.fit(X_train, y_train, cat_features=categorical_features)
        
        print(y_test.shape)

        return self.estimate(X_test, y_test)
    
    def label_shuffle(self, X, y, X_ss, y_ss, random_state=RANDOM_STATE):
        X_ss['ss'] = 1
        y_ss = y_ss.to_frame()
        y_ss['ss'] = 1

        X['ss'] = 0
        y['ss'] = 0

        X_ss_full = pd.concat([X, X_ss]).sample(frac=1, random_state=random_state)
        y_ss_full = pd.concat([y, y_ss]).sample(frac=1, random_state=random_state)
        
        return (X_ss_full, y_ss_full)
    
    def train_cross_validation(self, X, y, k, categorical_features, X_ss=None, y_ss=None, random_state=RANDOM_STATE):
        chunk_size = len(X) / k
        chunks_size = [(i*chunk_size, i*chunk_size + chunk_size) for i in range(k)]
        
        result_score = []
        
        print(f"Part size: {chunk_size}")
        
        if (X_ss is not None):
            X_ss_full, y_ss_full = self.label_shuffle(X, y, X_ss, y_ss, random_state = random_state)
            
            for chunkIndex in range(len(chunks_size)):
                x_test = X_ss_full[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                y_test = y_ss_full[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                
                x_train = X_ss_full.drop(x_test.index, axis = 0)
                y_train = y_ss_full.drop(y_test.index, axis = 0)
                
                x_test = x_test[x_test.ss == 0]
                y_test = y_test[y_test.ss == 0]
                
                x_train.drop('ss', axis = 1, inplace = True)
                y_train.drop('ss', axis = 1, inplace = True)
                x_test.drop('ss', axis = 1, inplace = True)
                y_test.drop('ss', axis = 1, inplace = True)
                
                score = self.train(x_train, x_test, y_train, y_test, categorical_features)
                
                print(f"Chunk {chunkIndex}; Score: {score}")
                
                result_score.append((chunks_size[chunkIndex], score))
        else:            
            for chunkIndex in range(len(chunks_size)):
                x_test = X[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                y_test = y[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                
                x_train = X.drop(x_test.index, axis = 0)
                y_train = y.drop(y_test.index, axis = 0)
                
                score = self.train(x_train, x_test, y_train, y_test, categorical_features)
                
                print(f"Chunk {chunkIndex}; Score: {score}")
                
                result_score.append((chunks_size[chunkIndex], score))
            
        print(f"Mean score: {sum(list(map(lambda x: x[1], result_score))) / k}")
        
        return result_score
    
    def fit_ss(self, X, y, numeric_features, categorial_features, X_ss, y_ss, cross_validation=False, random_state=RANDOM_STATE):
        self.counter_words = {}
        
        X_ = X
        y_ = y
        
        self.NLP_preprocess(pd.concat([X_, X_ss]), pd.concat([y_, y_ss]))
        X_ = self.add_features(X_)[numeric_features + categorical_features]
        
        X_ss = self.add_features(X_ss)[numeric_features + categorical_features]
        
        if (not cross_validation):
            X_train, X_test, y_train, y_test = self.train_test_split_(X_, y_, test_size=0.2, X_ss=X_ss, y_ss=y_ss, random_state=random_state)
            return self.train(X_train, X_test, y_train, y_test, categorical_features)
        else:
            return self.train_cross_validation(X_, y_, 5, categorical_features, X_ss=X_ss, y_ss=y_ss, random_state=random_state)
        
    def fit(self, X, y, numeric_features, categorial_features, tracks = None, cross_validation=False, random_state=RANDOM_STATE):
        if tracks is not None:
            tracks_train, order_ids = self.tracks_transform(tracks)
            tracks_train['order_id'] = order_ids
            tracks_train = tracks_train.drop_duplicates('order_id', keep='last')
        
        self.counter_words = {}
        
        X_ = X
        y_ = y
        
        self.NLP_preprocess(X_, y_)
        X_ = self.add_features(X_)[numeric_features + categorical_features + ["order_id"]]

        print(f"Table data matrix shape: {X_.shape}")
        print(f"Tracks data matrix shape: {tracks_train.shape}")

        res_matrix = X_.merge(tracks_train, how='left', on='order_id')
        res_matrix.fillna(0, inplace=True)
        res_matrix.drop(['order_id'], axis=1, inplace=True)

        print(f"Result matrix shape: {res_matrix.shape}")

        if (not cross_validation):
            X_train, X_test, y_train, y_test = self.train_test_split_(res_matrix, y_, test_size=0.2, random_state=random_state)
            return self.train(X_train, X_test, y_train, y_test, categorical_features)
        else:
            return self.train_cross_validation(X_, y_, 5, categorical_features, random_state=random_state)

    def tracks_transform(self, tracks, labled=True):
        print("Preprocessing tracks data...")
        print(f"    Begin shape: {tracks.shape}")
        if labled:
            X, y, order_ids = self.tracks_preprocess(tracks, self.TRACKS_CHUNK_SIZE, self.TRACKS_MULTIPLIER)
        else:
            X, order_ids = self.tracks_preprocess(tracks, self.TRACKS_CHUNK_SIZE, self.TRACKS_MULTIPLIER, labled=False)

        print(f"    Shape after preprocessing: {X.shape}")

        print("Training MiniRocket...")
        self.model_rocket = MiniRocket()
        self.model_rocket.fit(X)

        print("MiniRocket transforming...")
        print(f"    Shape before transform: {X.shape}")
        X_train_transform = self.model_rocket.transform(X)

        return X_train_transform, order_ids

    def predict_proba(self, X, add_feat=True, tracks=None):
        if (add_feat): X = self.add_features(X)
        if tracks is not None:
            tracks_train, order_ids = self.tracks_transform(tracks, labled=False)
            tracks_train['order_id'] = order_ids
            tracks_train = tracks_train.drop_duplicates('order_id', keep='last')
        
        X = self.add_features(X)[numeric_features + categorical_features + ["order_id"]]

        res_matrix = X.merge(tracks_train, how='left', on='order_id')
        res_matrix.fillna(0, inplace=True)
        res_matrix.drop(['order_id'], axis=1, inplace=True)

        return self.model.predict_proba(res_matrix).T[1]
    
    def predict(self, X, add_feat=True, tracks=None):
        if (add_feat): X = self.add_features(X)
        if tracks is not None:
            tracks_train, order_ids = self.tracks_transform(tracks, labled=False)
            tracks_train['order_id'] = order_ids
            tracks_train = tracks_train.drop_duplicates('order_id', keep='last')
        
        X = self.add_features(X)[numeric_features + categorical_features + ["order_id"]]

        res_matrix = X.merge(tracks_train, how='left', on='order_id')
        res_matrix.fillna(0, inplace=True)
        res_matrix.drop(['order_id'], axis=1, inplace=True)

        return self.model.predict(res_matrix)
    
    def predict_thresh(self, X, thresh_above, thresh_below):
        y_unlab_full = self.predict_proba(X)
        
        y_unlab = pd.Series([-1 for i in range(len(X))])
        
        print("Thresh above: {}".format(sum(y_unlab_full >= thresh_above) / len(y_unlab_full)))
        print("Thresh below: {}".format(sum(y_unlab_full <= thresh_below) / len(y_unlab_full)))
        
        y_unlab.iloc[np.where(y_unlab_full >= thresh_above)] = 1
        y_unlab.iloc[np.where(y_unlab_full <= thresh_below)] = 0
        
        return y_unlab

    # undersampling method deletes some extra non aggressive values
    def undersampling(self, X, multiplier):
        aggressive_count = sum(X.is_aggressive==1)
        non_aggressive_ind = X[X.is_aggressive==0].index

        # number of aggressive and non-aggressive labels is the same
        random_indices = np.random.choice(non_aggressive_ind, int(aggressive_count*multiplier), replace=False)
        return pd.concat([X.loc[random_indices], X[X.is_aggressive==1]])

    def split(self, arr, chunk_size = 15):
        result = []
        #get right length of arr so that it equally splits into chunks
        length = len(arr)
        split_length = length - (length%chunk_size)
                
        for i in range(split_length)[chunk_size::chunk_size]:
            result.append(arr[i-chunk_size:i])

        return np.array(result)

    # make df, so that each row has whole order speeds time series
    def make_nested(self, tracks, chunk_size, multiplier, labled, drop_duplicates):
        orders = tracks.copy()
        if drop_duplicates:
            orders = tracks.drop_duplicates('order_id', keep='last')
        if labled and False:
            orders = self.undersampling(orders, multiplier)
        y_labels = []
        X_train = []
        order_ids = []
        for order in tqdm(orders['order_id']):
            order_df = tracks[tracks.order_id == order]
            order_df.loc[0, 'speed'] = 0

            if order_df.shape[0] < chunk_size:
                continue

            splitted_arrs = self.split(order_df.values, chunk_size)
            for arr in splitted_arrs:
                if labled:
                    is_aggressive = arr[0][6]
                    y_labels.append(is_aggressive)
                speed_series = []
                for row in arr:  
                    # append only speed and dt values 
                    speed_series.append(row[5])
                X_train.append(pd.Series(speed_series))
                order_ids.append(order)
        return X_train, y_labels, order_ids
    
    def tracks_preprocess(self, tracks, chunk_size, multiplier, labled=True, drop_duplicates=True):
        X_train, train_labels, order_ids = self.make_nested(tracks, chunk_size, multiplier, labled, drop_duplicates)

        X_train = pd.DataFrame({'speed': X_train})
        if not labled: return X_train, order_ids
        y_train = np.array(train_labels)

        return X_train, y_train, order_ids

# Train on labled data

In [404]:
train_labled = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/labled_train_data.csv', index_col=0, sep='\t', comment='#')
tracks_labled = pd.read_csv("/content/drive/MyDrive/aiijc_transport_simpleteam/data/labled_train_tracks_speed.csv", index_col=0, sep=',', comment='#')
tracks_unlabled = pd.read_csv("/content/drive/MyDrive/aiijc_transport_simpleteam/data/unlabled_train_tracks_speed.csv", index_col=0, sep=',', comment='#')

X_ = train_labled.iloc[:, :-1]
y_ = train_labled.iloc[:, -1:]

X_['client_rate_ride'] = X_['client_rate_ride'].fillna(X_['client_rate_ride'].mean())
X_['client_rides_cnt'] = X_['client_rides_cnt'].fillna(X_['client_rides_cnt'].mean())
X_['driver_rides_cnt'] = X_['driver_rides_cnt'].fillna(X_['driver_rides_cnt'].mean())

In [405]:
tracks_labled.drop(['Unnamed: 0.1'], axis=1, inplace=True)

In [408]:
numeric_features = ['distance', 'arrived_distance', 'arrived_duration', 'duration', 'driver_rides_cnt', 'client_rides_cnt', 'client_rate_ride', 'count_words']

categorical_features = ['mark', 'is_comment', 'hour', 'weekday', 'agg_words', 'normal_words']

In [409]:
model_supervised = Model()

model_supervised.fit(X_, y_, numeric_features, categorical_features, tracks=tracks_labled)

Preprocessing tracks data...
    Begin shape: (395687, 7)


100%|██████████| 9000/9000 [03:50<00:00, 39.03it/s]


    Shape after preprocessing: (9257, 1)
Training MiniRocket...
MiniRocket transforming...
    Shape before transform: (9257, 1)
Table data matrix shape: (9000, 15)
Tracks data matrix shape: (4011, 9997)
Result matrix shape: (9000, 10010)
Train size: (7200, 10010)
Test size: (1800, 10010)
Learning rate set to 0.006713
0:	learn: 0.6888725	total: 111ms	remaining: 7m 22s
1:	learn: 0.6846797	total: 207ms	remaining: 6m 53s
2:	learn: 0.6805516	total: 302ms	remaining: 6m 42s
3:	learn: 0.6764769	total: 401ms	remaining: 6m 40s
4:	learn: 0.6724992	total: 502ms	remaining: 6m 40s
5:	learn: 0.6685540	total: 596ms	remaining: 6m 36s
6:	learn: 0.6646836	total: 681ms	remaining: 6m 28s
7:	learn: 0.6609534	total: 776ms	remaining: 6m 27s
8:	learn: 0.6572017	total: 872ms	remaining: 6m 26s
9:	learn: 0.6534984	total: 965ms	remaining: 6m 25s
10:	learn: 0.6498648	total: 1.05s	remaining: 6m 22s
11:	learn: 0.6465924	total: 1.15s	remaining: 6m 22s
12:	learn: 0.6432795	total: 1.25s	remaining: 6m 21s
13:	learn: 0.6

0.7754214559386974

In [382]:
predictions = model_supervised.predict(X_, add_feat=True, tracks=tracks_labled)

Preprocessing tracks data...
    Begin shape: (395687, 7)


100%|██████████| 9000/9000 [03:51<00:00, 38.90it/s]


    Shape after preprocessing: (9257, 1)
Training MiniRocket...
MiniRocket transforming...
    Shape before transform: (9257, 1)


In [383]:
predictions.sum()

23

In [None]:
model_cv = Model()

model_cv.fit(X_, y_, numeric_features, categorical_features, cross_validation = True)

# Semi-supervised train

## Preprocessing

In [None]:
X_unlab = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/unlabled_train_data.csv', index_col=0, sep='\t', comment='#')
comments_unlabeled = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/unlabled_train_comments.csv', index_col=0, sep='\t', comment='#')
tracks_unlabeled = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/unlabled_train_tracks.csv', index_col=0, sep='\t', comment='#')

X_unlab['client_rate_ride'] = X_unlab['client_rate_ride'].fillna(X_unlab['client_rate_ride'].mean())
X_unlab['client_rides_cnt'] = X_unlab['client_rides_cnt'].fillna(X_unlab['client_rides_cnt'].mean())
X_unlab['driver_rides_cnt'] = X_unlab['driver_rides_cnt'].fillna(X_unlab['driver_rides_cnt'].mean())

In [None]:
sum(X_unlab.comment.isna()) / len(X_unlab.comment)

0.64866629360194

In [None]:
np.where(X_unlab.comment.isna())[0]

array([ 2762,  3239,  3574, ..., 10719, 10720, 10721])

In [None]:
for nanIndex in np.where(X_unlab.comment.isna())[0]:
    obj_comment = comments_unlabeled.loc[nanIndex]
    
    if (len(obj_comment) != 0):
        X_unlab.comment.iloc[nanIndex] = obj_comment.comment

In [None]:
sum(X_unlab.comment.isna()) / len(X_unlab.comment)

0.00018653236336504383

In [None]:
X_unlab.comment.iloc[np.where(X_unlab.comment.isna())[0]] = "---"

## Prediction&filling

In [None]:
y_unlab = model_supervised.predict_thresh(X_unlab, 0.99, 0.001)

Thresh above: 0.003357582540570789
Thresh below: 0.0


In [None]:
y_unlab.name = "is_aggressive"

In [None]:
y_unlab.value_counts()

-1    10686
 1       36
Name: is_aggressive, dtype: int64

In [None]:
X_unlab_lab = X_unlab.iloc[np.where(y_unlab != -1)]
y_unlab_lab = y_unlab.iloc[np.where(y_unlab != -1)]

In [None]:
X_unlab_lab.comment

7        1)Водитель играл в «шашки» на дороге и игрался...
202              2 раза списали деньги, верните пожалуйста
351      засыпал за рулём,  съезжал с дороги, приходило...
551               резкие повороты. водитель резко тормозил
568      Водитель смотрит кино на втором телефоне, проп...
705      Вел медленно по бордовым дорогам, сказал «пешк...
1317     Водитель резко тормозил, обгонял, кричал на др...
1378                             Водитель вел себя грубо. 
1634     Водитель постоянно громко разговаривал на своё...
1667     Водитель в возрасте, несколько раз отвечал на ...
1707     Водитель 2 раза поругался с другими таксистами...
1975     Водитель опасно вел автомобиль. Было ощущение ...
2221     Водитель явно засыпал за рулём, постоянно зева...
2261      Приехала другая машина вместо указанной в заказе
2302                       Водитель неадекватно себя вёл, 
2346     Проехал на красный свет дважды на перекрестках...
2472                      водит опасно и очень неаккурат

In [None]:
X_unlab_lab.is_comment.value_counts()

1    36
Name: is_comment, dtype: int64

## Train

In [None]:
model_semisupervised = Model()

model_semisupervised.fit_ss(X_, y_, numeric_features, categorical_features, X_unlab_lab, y_unlab_lab)

Train size: (7229, 14)
Test size: (1798, 14)


0.8488724212067151

In [None]:
model_ss_cv = Model()

model_ss_cv.fit_ss(X_, y_, numeric_features, categorical_features, X_unlab_lab, y_unlab_lab, cross_validation=True)

Part size: 1800.0
Train size: (7224, 14)
Test size: (1791, 14)
Chunk 0; Score: 0.7396306537625914
Train size: (7228, 14)
Test size: (1797, 14)
Chunk 1; Score: 0.742049078955933
Train size: (7224, 14)
Test size: (1794, 14)
Chunk 2; Score: 0.7889097744360902
Train size: (7225, 14)
Test size: (1790, 14)
Chunk 3; Score: 0.7370420937809273
Train size: (7226, 14)
Test size: (1792, 14)
Chunk 4; Score: 0.85435199720914
Mean score: 0.7723967196289363


[((0.0, 1800.0), 0.7396306537625914),
 ((1800.0, 3600.0), 0.742049078955933),
 ((3600.0, 5400.0), 0.7889097744360902),
 ((5400.0, 7200.0), 0.7370420937809273),
 ((7200.0, 9000.0), 0.85435199720914)]

# Prediction

In [410]:
X_test = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/labled_test_data.csv', index_col=0, sep='\t', comment='#')
tracks_test = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/labled_test_tracks.csv', index_col=0, sep='\t', comment='#')

In [411]:
tracks_test.groupby('order_id').size()

order_id
000d9cf4365ad8be9b559951d0d945c7     12
00287e34dd884a2a69c80346541d2aef     64
00307c7812842b1159781c2c6375944a     41
0061e7abbe5544c40781ba2816b3e026     61
0074b0c828084e05c28035487ad2a130     82
                                   ... 
ff209045501b1f25e8729a96a215a3d2     97
ff4c5997ed87ff37a3c215bab2c0916e     49
ff6873cfaccafec937bbed29e317d3e2     91
ff9745e14cda84a4550b528a8d9aa4de    103
ffd2c55165c42430793423c93211bd46     53
Length: 1272, dtype: int64

In [414]:
result_series = pd.Series(model_supervised.predict(X_test, add_feat=True, tracks=tracks_test))

Preprocessing tracks data...
    Begin shape: (71432, 5)


100%|██████████| 1272/1272 [00:09<00:00, 133.88it/s]


    Shape after preprocessing: (1966, 1)
Training MiniRocket...
MiniRocket transforming...
    Shape before transform: (1966, 1)


In [415]:
result_series.sum()

40

In [None]:
result_series = pd.Series(model_supervised.predict(X_test))
# result_series = pd.Series(model_semisupervised.predict(X_test))
# result_series = pd.Series(np.zeros(X_test.shape[0]))

In [None]:
result_series.shape[0], tracks_result.shape[0]

(1272, 1966)

In [None]:
abs(tracks_result - result_series)

ValueError: ignored

In [416]:
result_series.name = 'is_aggressive'

In [417]:
result_series.sum()

40

In [418]:
result_series.to_csv("result.csv")