# Install environment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install catboost
!pip install sktime
!pip install tqdm

Collecting catboost
  Downloading catboost-0.26.1-cp37-none-manylinux1_x86_64.whl (67.4 MB)
[K     |████████████████████████████████| 67.4 MB 65 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26.1
Collecting sktime
  Downloading sktime-0.7.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 15.3 MB/s 
Collecting scikit-learn>=0.24.0
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 62.6 MB/s 
[?25hCollecting statsmodels>=0.12.1
  Downloading statsmodels-0.12.2-cp37-cp37m-manylinux1_x86_64.whl (9.5 MB)
[K     |████████████████████████████████| 9.5 MB 45.0 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, statsmodels, scikit-learn, sktime
  Attempting uninstall: statsmodels
    Found existing installation: statsmode

In [3]:
import numpy as np
import pandas as pd

from collections import Counter
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sktime.transformations.panel.rocket import MiniRocket
from sklearn.linear_model import RidgeClassifierCV

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")



In [4]:
RANDOM_STATE = 42

# Speed tracks

## Dowload and add <code>aggressive</code> column

In [None]:
speed_tracks = pd.read_csv("/content/drive/MyDrive/aiijc_transport_simpleteam/data/labled_train_tracks_speed.csv")

In [None]:
speed_tracks

In [None]:
speed_tracks.speed.mean()

In [None]:
speed_tracks.groupby('order_id').speed.mean()

In [None]:
(speed_tracks.speed == 0).sum() / speed_tracks.shape[0]

In [None]:
from tqdm.notebook import tqdm

speed_tracks['is_aggressive'] = np.zeros(speed_tracks.shape[0])

for obj in tqdm(train_labeled.values):
    speed_tracks["is_aggressive"].loc[(speed_tracks.order_id == obj[0])] = obj[-1]

In [None]:
speed_tracks.to_csv("labled_train_tracks_speed.csv")
!cp labled_train_tracks_speed.csv /content/drive/MyDrive/aiijc_transport_simpleteam/data/

## Tracks model

In [None]:
tracks_labled = pd.read_csv("/content/drive/MyDrive/aiijc_transport_simpleteam/data/labled_train_tracks_speed.csv", index_col=0, sep=',', comment='#')

In [None]:
from functools import lru_cache

# @lru_cache(maxsize=None)
def split(arr, chunk_size = 15):
    result = []
    # get right length of arr so that it equally splits into chunks
    length = len(arr)
    split_length = length - (length%chunk_size)

    for i in range(split_length)[chunk_size::chunk_size]:
        result.append(arr[i-chunk_size:i])

    return np.array(result)

train_labels = []
X_train = []

for order in tracks_labled['order_id'].unique():
    order_df = tracks_labled[tracks_labled['order_id']==order]
    order_df.loc[0,'speed']=0
    
    chunk_size = 15
    if order_df.shape[0]<chunk_size:
        continue

    splitted_arrs = split(order_df.values, chunk_size)
    for arr in splitted_arrs:
        is_aggressive = arr[0][7]
        train_labels.append(is_aggressive)
        speed_series = []
        for row in arr:  
            # append only speed and dt values 
            speed_series.append(row[6])
        X_train.append(pd.Series(speed_series))

X_train = pd.DataFrame({'speed':X_train})
y_train = np.array(train_labels)

TypeError: ignored

In [None]:
X_train

Unnamed: 0,speed
0,0 0.000000 1 5.906441 2 15.69600...
1,0 7.432941 1 3.720000 2 8.04600...
2,0 -0.091778 1 3.325714 2 2.92800...
3,0 36.148235 1 173.160000 2 57.96...
4,0 0.024739 1 4.012500 2 9.07826...
...,...
15207,0 92.244706 1 104.805000 2 112.82...
15208,0 103.214118 1 107.100000 2 96.22...
15209,0 104.602500 1 89.301176 2 97.51...
15210,0 -0.035823 1 8.820000 2 0.86087...


In [None]:
y_train

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
from sklearn.model_selection import KFold  
from sklearn.pipeline import make_pipeline
from sktime.transformations.panel.rocket import MiniRocket
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifierCV
from sklearn.semi_supervised import SelfTrainingClassifier

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

rocket = MiniRocket()
rocket.fit(X_train, y_train)

X_train_transform = rocket.transform(X_train,y_train)

classifier = RidgeClassifierCV(alphas = np.logspace(-3, 3, 10), normalize = True)
classifier.fit(X_train_transform, y_train)

X_test_transform = rocket.transform(X_test)
classifier.score(X_test_transform, y_test)

0.9368525896414343

# Model code

In [214]:
class Model:
    def __init__(self):        
        self.model = None
        self.model_tracks = None
        self.model_rocket = None

        self.counter_words = {}

        self.TRACKS_CHUNK_SIZE = 20
        self.TRACKS_MULTIPLIER = 1
    
    def count_words(self, x):
        return len(x.split(" "))

    def check_sentence(self, sentence, words_type):
        words_count = 0
        for word in sentence.split(" "):
            word = word.lower().replace(',', '').replace('.', '')

            if (word not in list(self.counter_words.keys()) or len(self.counter_words[word]) == 2): continue

            if (words_type == self.counter_words[word][2]): 
                words_count += 1
        return words_count
        
    def NLP_preprocess(self, X ,y):
        dataset_joined = X.join(y)
        comment_phrases = list(dataset_joined.comment.value_counts().index[: 10])
        
        dataset_joined['is_comment'] = (~np.isin(dataset_joined.comment, comment_phrases)).astype(int)
        
        aggressive_comments = dataset_joined[(dataset_joined['is_comment'] == True) & (dataset_joined.is_aggressive == True)].comment.values
        normal_comments = dataset_joined[(dataset_joined['is_comment'] == True) & (dataset_joined.is_aggressive == False)].comment.values
        
        stop_words = ['на', 'по', 'с', 'в', 'что', 'и', 'а']

        for sentence in normal_comments:
            for word in sentence.split(" "):
                word = word.lower().replace(',', '').replace('.', '')
                if (word in stop_words): continue
                if (word in self.counter_words.keys()):
                    self.counter_words[word][0] += 1
                else: self.counter_words[word] = [1, 0]

        for sentence in aggressive_comments:
            for word in sentence.split(" "):
                word = word.lower().replace(',', '').replace('.', '')
                if (word in stop_words): continue
                if (word in self.counter_words.keys()):
                    self.counter_words[word][1] += 1
                else: self.counter_words[word] = [0, 1]
        
        
        count_all_words = np.array(list(map(lambda x: np.array(x), np.array(list(self.counter_words.items())).T[1]))).T
        
        count_normal_words = count_all_words[0].sum()
        count_aggressive_words = count_all_words[1].sum()

        for word_pair in list(self.counter_words.items()):
            if (word_pair[1][1] == 0 and word_pair[1][0] > 0):
                self.counter_words[word_pair[0]].append("normal")
                continue

            if (word_pair[1][0] == 0 and word_pair[1][1] > 0):
                self.counter_words[word_pair[0]].append("aggressive")
                continue

            ratio_aggressive = word_pair[1][1] / count_aggressive_words
            ratio_normal = word_pair[1][0] / count_normal_words

            if (ratio_aggressive / ratio_normal >= 3):
                self.counter_words[word_pair[0]].append("aggressive")
                continue

            if (ratio_normal / ratio_aggressive >= 3):
                self.counter_words[word_pair[0]].append("normal")
                continue

            self.counter_words[word_pair[0]].append("neutral")

    def add_features(self, X):
        comment_phrases = list(X.comment.value_counts().index[: 5]) + ["---"]
        
        X["is_comment"] = (~np.isin(X.comment, comment_phrases)).astype(int)
        X['dttm'] = pd.to_datetime(X.dttm)
        X['hour'] = X.dttm.apply(lambda x: x.hour)
        X['traff_jam'] = ((X.hour > 6) & (X.hour < 10)) | ((X.hour > 17) & (X.hour < 23))
        X['traff_jam'] = X.traff_jam.astype(int)
        X['weekday'] = X.dttm.apply(lambda x: x.weekday())
        X['holiday'] = (X.weekday >= 5).astype(int)
        X["count_words"] = [-1] * X.shape[0]
        X.loc[X.is_comment == True, "count_words"] = X[X.is_comment == True].comment.apply(lambda x: self.count_words(x))
        X["speed"] = X.distance / (X.duration / 60)
        X['agg_words'] = X.comment.apply(lambda x: self.check_sentence(x, "aggressive"))
        X['normal_words'] = X.comment.apply(lambda x: self.check_sentence(x, "normal"))
        X['distance_thresh'] = ((X.distance > 5) & (X.distance < 20)).astype(int)
        
        return X
    
    def estimate(self, X, y):
        return roc_auc_score(y, self.predict_proba(X, add_feat=False))
    
    def train_test_split_(self, X, y, test_size, X_ss=None, y_ss=None, random_state=RANDOM_STATE):
        if (X_ss is not None):
            X_ss_full, y_ss_full = self.label_shuffle(X, y, X_ss, y_ss, random_state = random_state)
            
            len_train = len(X_ss_full) - round(len(X_ss_full) * test_size)
            
            x_train = X_ss_full[: len_train]
            x_train.drop('ss', axis = 1, inplace = True)
            
            x_test = X_ss_full.iloc[len_train + 1:]
            x_test = x_test[x_test.ss == 0]
            x_test.drop('ss', axis = 1, inplace = True)
            
            y_train = y_ss_full[: len_train]
            y_train.drop('ss', axis = 1, inplace = True)
            
            y_test = y_ss_full.iloc[len_train + 1:]
            y_test = y_test[y_test.ss == 0]
            y_test.drop('ss', axis = 1, inplace = True)
            
            return (x_train, x_test, y_train, y_test)
        
        len_train = len(X) - round(len(X) * test_size)
        
        X = X.sample(frac=1, random_state=random_state)
        y = y.sample(frac=1, random_state=random_state)
        
        return (X[: len_train], X[len_train :], y[: len_train], y[len_train :])
    
    def train(self, X_train, X_test, y_train, y_test, categorical_feature, random_state=RANDOM_STATE):
        print(f"Train size: {X_train.shape}")
        print(f"Test size: {X_test.shape}")
        self.model = CatBoostClassifier(iterations=2000,
                           depth=2,
                           silent=True,
                           loss_function='Logloss',
                           class_weights=(1, 2),
                           random_state=random_state)

        self.model.fit(X_train, y_train, cat_features=categorical_features)
        
        return self.estimate(X_test, y_test)
    
    def label_shuffle(self, X, y, X_ss, y_ss, random_state=RANDOM_STATE):
        X_ss['ss'] = 1
        y_ss = y_ss.to_frame()
        y_ss['ss'] = 1

        X['ss'] = 0
        y['ss'] = 0

        X_ss_full = pd.concat([X, X_ss]).sample(frac=1, random_state=random_state)
        y_ss_full = pd.concat([y, y_ss]).sample(frac=1, random_state=random_state)
        
        return (X_ss_full, y_ss_full)
    
    def train_cross_validation(self, X, y, k, categorical_features, X_ss=None, y_ss=None, random_state=RANDOM_STATE):
        chunk_size = len(X) / k
        chunks_size = [(i*chunk_size, i*chunk_size + chunk_size) for i in range(k)]
        
        result_score = []
        
        print(f"Part size: {chunk_size}")
        
        if (X_ss is not None):
            X_ss_full, y_ss_full = self.label_shuffle(X, y, X_ss, y_ss, random_state = random_state)
            
            for chunkIndex in range(len(chunks_size)):
                x_test = X_ss_full[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                y_test = y_ss_full[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                
                x_train = X_ss_full.drop(x_test.index, axis = 0)
                y_train = y_ss_full.drop(y_test.index, axis = 0)
                
                x_test = x_test[x_test.ss == 0]
                y_test = y_test[y_test.ss == 0]
                
                x_train.drop('ss', axis = 1, inplace = True)
                y_train.drop('ss', axis = 1, inplace = True)
                x_test.drop('ss', axis = 1, inplace = True)
                y_test.drop('ss', axis = 1, inplace = True)
                
                score = self.train(x_train, x_test, y_train, y_test, categorical_features)
                
                print(f"Chunk {chunkIndex}; Score: {score}")
                
                result_score.append((chunks_size[chunkIndex], score))
        else:            
            for chunkIndex in range(len(chunks_size)):
                x_test = X[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                y_test = y[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                
                x_train = X.drop(x_test.index, axis = 0)
                y_train = y.drop(y_test.index, axis = 0)
                
                score = self.train(x_train, x_test, y_train, y_test, categorical_features)
                
                print(f"Chunk {chunkIndex}; Score: {score}")
                
                result_score.append((chunks_size[chunkIndex], score))
            
        print(f"Mean score: {sum(list(map(lambda x: x[1], result_score))) / k}")
        
        return result_score
    
    def fit_ss(self, X, y, numeric_features, categorial_features, X_ss, y_ss, cross_validation=False, random_state=RANDOM_STATE):
        self.counter_words = {}
        
        X_ = X
        y_ = y
        
        self.NLP_preprocess(pd.concat([X_, X_ss]), pd.concat([y_, y_ss]))
        X_ = self.add_features(X_)[numeric_features + categorical_features]
        
        X_ss = self.add_features(X_ss)[numeric_features + categorical_features]
        
        if (not cross_validation):
            X_train, X_test, y_train, y_test = self.train_test_split_(X_, y_, test_size=0.2, X_ss=X_ss, y_ss=y_ss, random_state=random_state)
            return self.train(X_train, X_test, y_train, y_test, categorical_features)
        else:
            return self.train_cross_validation(X_, y_, 5, categorical_features, X_ss=X_ss, y_ss=y_ss, random_state=random_state)
        
        
    def fit(self, X, y, numeric_features, categorial_features, cross_validation=False, random_state=RANDOM_STATE):
        self.counter_words = {}
        
        X_ = X
        y_ = y
        
        self.NLP_preprocess(X_, y_)
        X_ = self.add_features(X_)[numeric_features + categorical_features]

        if (not cross_validation):
            X_train, X_test, y_train, y_test = self.train_test_split_(X_, y_, test_size=0.2, random_state=random_state)
            return self.train(X_train, X_test, y_train, y_test, categorical_features)
        else:
            return self.train_cross_validation(X_, y_, 5, categorical_features, random_state=random_state)

    def fit_tracks(self, tracks, random_state=RANDOM_STATE):
        print("Preprocessing tracks data...")
        X, y = self.tracks_preprocess(tracks, self.TRACKS_CHUNK_SIZE, self.TRACKS_MULTIPLIER)

        print("Training MiniRocket...")
        self.model_rocket = MiniRocket()
        self.model_rocket.fit(X, y)

        print("MiniRocket Transforming...")
        X_train_transform = self.model_rocket.transform(X, y)

        print("Training logistic regression...")
        self.model_tracks = RidgeClassifierCV(normalize = True)
        self.model_tracks.fit(X_train_transform, y)
        
        print(f"Score: {self.model_tracks.score(X_train_transform, y)}")
    
    def predict_tracks(self, tracks):
        print("Preprocessing tracks data...")
        X = self.tracks_preprocess(tracks, self.TRACKS_CHUNK_SIZE, self.TRACKS_MULTIPLIER, labled=False)
        
        print("MiniRocket Transforming...")
        X_transform = self.model_rocket.transform(X)

        return self.model_tracks.predict(X_transform)

    def predict_proba(self, X, add_feat=True):
        if (add_feat): X = self.add_features(X)
        
        X = X[numeric_features + categorical_features]
        
        return self.model.predict_proba(X).T[1]
    
    def predict(self, X, add_feat=True):
        if (add_feat): X = self.add_features(X)
        
        X = X[numeric_features + categorical_features]
        
        return self.model.predict(X)
    
    def predict_thresh(self, X, thresh_above, thresh_below):
        y_unlab_full = self.predict_proba(X)
        
        y_unlab = pd.Series([-1 for i in range(len(X))])
        
        print("Thresh above: {}".format(sum(y_unlab_full >= thresh_above) / len(y_unlab_full)))
        print("Thresh below: {}".format(sum(y_unlab_full <= thresh_below) / len(y_unlab_full)))
        
        y_unlab.iloc[np.where(y_unlab_full >= thresh_above)] = 1
        y_unlab.iloc[np.where(y_unlab_full <= thresh_below)] = 0
        
        return y_unlab

    # undersampling method deletes some extra non aggressive values
    def undersampling(self, X, multiplier):
        aggressive_count = sum(X.is_aggressive==1)
        non_aggressive_ind = X[X.is_aggressive==0].index

        # number of aggressive and non-aggressive labels is the same
        random_indices = np.random.choice(non_aggressive_ind, int(aggressive_count*multiplier), replace=False)
        return pd.concat([X.loc[random_indices], X[X.is_aggressive==1]])

    def split(self, arr, chunk_size = 15):
        result = []
        #get right length of arr so that it equally splits into chunks
        length = len(arr)
        split_length = length - (length%chunk_size)
                
        for i in range(split_length)[chunk_size::chunk_size]:
            result.append(arr[i-chunk_size:i])

        return np.array(result)

    # make df, so that each row has whole order speeds time series
    def make_nested(self, tracks, chunk_size, multiplier, labled):
        unique_orders = tracks.drop_duplicates('order_id', keep='last')
        if labled:
            unique_orders = self.undersampling(unique_orders, multiplier)
        y_labels = []
        X_train = []
        for order in tqdm(unique_orders['order_id']):
            order_df = tracks[tracks.order_id == order]
            order_df.loc[0, 'speed'] = 0

            if order_df.shape[0] < chunk_size:
                continue

            splitted_arrs = self.split(order_df.values, chunk_size)
            for arr in splitted_arrs:
                if labled:
                    is_aggressive = arr[0][6]
                    y_labels.append(is_aggressive)
                speed_series = []
                for row in arr:  
                    # append only speed and dt values 
                    speed_series.append(row[5])
                X_train.append(pd.Series(speed_series))
        return X_train, y_labels
    
    def tracks_preprocess(self, tracks, chunk_size, multiplier, labled=True):
        X_train, train_labels = self.make_nested(tracks, chunk_size, multiplier, labled)

        X_train = pd.DataFrame({'speed': X_train})
        if not labled: return X_train
        y_train = np.array(train_labels)

        return X_train, y_train

# Train on labled data

In [215]:
train_labled = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/labled_train_data.csv', index_col=0, sep='\t', comment='#')
tracks_labled = pd.read_csv("/content/drive/MyDrive/aiijc_transport_simpleteam/data/labled_train_tracks_speed.csv", index_col=0, sep=',', comment='#')
tracks_unlabled = pd.read_csv("/content/drive/MyDrive/aiijc_transport_simpleteam/data/unlabled_train_tracks_speed.csv", index_col=0, sep=',', comment='#')

X_ = train_labled.iloc[:, :-1]
y_ = train_labled.iloc[:, -1:]

X_['client_rate_ride'] = X_['client_rate_ride'].fillna(X_['client_rate_ride'].mean())
X_['client_rides_cnt'] = X_['client_rides_cnt'].fillna(X_['client_rides_cnt'].mean())
X_['driver_rides_cnt'] = X_['driver_rides_cnt'].fillna(X_['driver_rides_cnt'].mean())

In [216]:
tracks_labled.drop(['Unnamed: 0.1'], axis=1, inplace=True)

In [217]:
numeric_features = ['distance', 'arrived_distance', 'arrived_duration', 'duration', 'driver_rides_cnt', 'client_rides_cnt', 'client_rate_ride', 'count_words']

categorical_features = ['mark', 'is_comment', 'hour', 'weekday', 'agg_words', 'normal_words']

In [218]:
model_supervised = Model()

model_supervised.fit_tracks(tracks_labled)

Preprocessing tracks data...


100%|██████████| 864/864 [00:21<00:00, 40.05it/s]


Training MiniRocket...
MiniRocket Transforming...
Training logistic regression...
Score: 0.8022388059701493


In [219]:
 tracks = model_supervised.predict_tracks(tracks_labled)

Preprocessing tracks data...


100%|██████████| 9000/9000 [03:43<00:00, 40.19it/s]


MiniRocket Transforming...


In [220]:
tracks.sum()

6290.0

In [221]:
tracks.shape[0]

9257

In [None]:
model_supervised = Model()

model_supervised.fit(X_, y_, numeric_features, categorical_features)

Train size: (7200, 14)
Test size: (1800, 14)


0.8362260536398467

In [None]:
model_cv = Model()

model_cv.fit(X_, y_, numeric_features, categorical_features, cross_validation = True)

Part size: 1800.0
Train size: (7200, 14)
Test size: (1800, 14)
Chunk 0; Score: 0.7501325765301662
Train size: (7200, 14)
Test size: (1800, 14)
Chunk 1; Score: 0.7869791274206916
Train size: (7200, 14)
Test size: (1800, 14)
Chunk 2; Score: 0.7333526906697638
Train size: (7200, 14)
Test size: (1800, 14)
Chunk 3; Score: 0.7684012885885325
Train size: (7200, 14)
Test size: (1800, 14)
Chunk 4; Score: 0.7787730470989159
Mean score: 0.7635277460616141


[((0.0, 1800.0), 0.7501325765301662),
 ((1800.0, 3600.0), 0.7869791274206916),
 ((3600.0, 5400.0), 0.7333526906697638),
 ((5400.0, 7200.0), 0.7684012885885325),
 ((7200.0, 9000.0), 0.7787730470989159)]

# Semi-supervised train

## Preprocessing

In [None]:
X_unlab = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/unlabled_train_data.csv', index_col=0, sep='\t', comment='#')
comments_unlabeled = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/unlabled_train_comments.csv', index_col=0, sep='\t', comment='#')
tracks_unlabeled = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/unlabled_train_tracks.csv', index_col=0, sep='\t', comment='#')

X_unlab['client_rate_ride'] = X_unlab['client_rate_ride'].fillna(X_unlab['client_rate_ride'].mean())
X_unlab['client_rides_cnt'] = X_unlab['client_rides_cnt'].fillna(X_unlab['client_rides_cnt'].mean())
X_unlab['driver_rides_cnt'] = X_unlab['driver_rides_cnt'].fillna(X_unlab['driver_rides_cnt'].mean())

In [None]:
sum(X_unlab.comment.isna()) / len(X_unlab.comment)

0.64866629360194

In [None]:
np.where(X_unlab.comment.isna())[0]

array([ 2762,  3239,  3574, ..., 10719, 10720, 10721])

In [None]:
for nanIndex in np.where(X_unlab.comment.isna())[0]:
    obj_comment = comments_unlabeled.loc[nanIndex]
    
    if (len(obj_comment) != 0):
        X_unlab.comment.iloc[nanIndex] = obj_comment.comment

In [None]:
sum(X_unlab.comment.isna()) / len(X_unlab.comment)

0.00018653236336504383

In [None]:
X_unlab.comment.iloc[np.where(X_unlab.comment.isna())[0]] = "---"

## Prediction&filling

In [None]:
y_unlab = model_supervised.predict_thresh(X_unlab, 0.99, 0.001)

Thresh above: 0.003357582540570789
Thresh below: 0.0


In [None]:
y_unlab.name = "is_aggressive"

In [None]:
y_unlab.value_counts()

-1    10686
 1       36
Name: is_aggressive, dtype: int64

In [None]:
X_unlab_lab = X_unlab.iloc[np.where(y_unlab != -1)]
y_unlab_lab = y_unlab.iloc[np.where(y_unlab != -1)]

In [None]:
X_unlab_lab.comment

7        1)Водитель играл в «шашки» на дороге и игрался...
202              2 раза списали деньги, верните пожалуйста
351      засыпал за рулём,  съезжал с дороги, приходило...
551               резкие повороты. водитель резко тормозил
568      Водитель смотрит кино на втором телефоне, проп...
705      Вел медленно по бордовым дорогам, сказал «пешк...
1317     Водитель резко тормозил, обгонял, кричал на др...
1378                             Водитель вел себя грубо. 
1634     Водитель постоянно громко разговаривал на своё...
1667     Водитель в возрасте, несколько раз отвечал на ...
1707     Водитель 2 раза поругался с другими таксистами...
1975     Водитель опасно вел автомобиль. Было ощущение ...
2221     Водитель явно засыпал за рулём, постоянно зева...
2261      Приехала другая машина вместо указанной в заказе
2302                       Водитель неадекватно себя вёл, 
2346     Проехал на красный свет дважды на перекрестках...
2472                      водит опасно и очень неаккурат

In [None]:
X_unlab_lab.is_comment.value_counts()

1    36
Name: is_comment, dtype: int64

## Train

In [None]:
model_semisupervised = Model()

model_semisupervised.fit_ss(X_, y_, numeric_features, categorical_features, X_unlab_lab, y_unlab_lab)

Train size: (7229, 14)
Test size: (1798, 14)


0.8488724212067151

In [None]:
model_ss_cv = Model()

model_ss_cv.fit_ss(X_, y_, numeric_features, categorical_features, X_unlab_lab, y_unlab_lab, cross_validation=True)

Part size: 1800.0
Train size: (7224, 14)
Test size: (1791, 14)
Chunk 0; Score: 0.7396306537625914
Train size: (7228, 14)
Test size: (1797, 14)
Chunk 1; Score: 0.742049078955933
Train size: (7224, 14)
Test size: (1794, 14)
Chunk 2; Score: 0.7889097744360902
Train size: (7225, 14)
Test size: (1790, 14)
Chunk 3; Score: 0.7370420937809273
Train size: (7226, 14)
Test size: (1792, 14)
Chunk 4; Score: 0.85435199720914
Mean score: 0.7723967196289363


[((0.0, 1800.0), 0.7396306537625914),
 ((1800.0, 3600.0), 0.742049078955933),
 ((3600.0, 5400.0), 0.7889097744360902),
 ((5400.0, 7200.0), 0.7370420937809273),
 ((7200.0, 9000.0), 0.85435199720914)]

# Prediction

In [None]:
X_test = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/labled_test_data.csv', index_col=0, sep='\t', comment='#')
tracks_test = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/labled_test_tracks.csv', index_col=0, sep='\t', comment='#')

In [None]:
tracks_test

Unnamed: 0,driver_id,dt,lat_,lon_,order_id
0,d19911199e1a36b0efcfff74d2e48041,2021-04-09 07:52:33,55.744270,37.495416,a81c18d7605310bcfbaf2100a3dfc996
1,710c3859a47f8214355bcae973cd1fcc,2021-04-09 23:11:32,55.805541,37.582749,c22a9913e7c3f7f95702bccc9eed16a8
2,5cd67d0bcf20c02d6700512c56212e4d,2021-04-09 23:51:36,55.738284,37.628273,86656ff0f2433f53b17b55a0f7429285
3,1a7cc954f3594034a0ac4ce884c0b3cf,2021-04-09 21:13:18,55.690053,37.559565,0eb2d643f9fcf1e6816e17b0a43e6779
4,6ae93e9f1d5aae8bb0e3ebedaf46f60c,2021-04-09 07:09:55,55.693317,37.940072,615760202d3d45f6192830ddd6924267
...,...,...,...,...,...
76658,9a34ad2bc3f54a400bbccc3e13c4aa66,2021-04-03 20:41:12,55.781854,37.727598,5166692186266da273d09ad1cd448361
76659,5afc8eb8b21abdba1128e8e5570995fe,2021-04-03 16:59:35,55.649680,37.835034,bfdd4d1fd140d5ec2f49b634a661ab23
76660,03f4a73e682696649185aaa89c1e4a0e,2021-04-03 23:46:41,55.782393,37.598270,a3a6e9104be084052f5120c3ec5f2da5
76665,eea9eef46e30e047fac89fb588b42c02,2021-04-03 20:56:52,55.828086,37.530405,ebec12c2768f1f5d8239e2911acfea09


In [None]:
model_supervised.tracks_model.predict(tracks_labled)

In [None]:
X_test

Unnamed: 0,order_id,driver_id,client_id,dttm,date,arrived_distance,arrived_duration,distance,duration,from_latitude,from_longitude,to_latitude,to_longitude,mark,client_rate_ride,client_rides_cnt,driver_rides_cnt,comment
0,49430c6531d098d3b288d95d7d1e7f4f,21348747875e0b01bc492d32b49c638d,4d4d06feddc5669339b1cd9d7941a116,2021-04-03 14:49:46,2021-04-03,180.0,1.0,6.1,19.4,55.822578,37.596844,55.792253,37.599762,Skoda Octavia,5.0,36.0,241.0,Больше нечего сказать
1,d3fff1a829a5b5ccbccd40c9895ff4b0,398fe459519b5facba93168b71df0625,2a41e0a45a19a892e960d6c4bef5c27b,2021-04-03 15:02:44,2021-04-03,790.0,2.0,6.4,19.0,55.609624,37.719165,55.596660,37.763549,Hyundai Solaris,5.0,19.0,193.0,Больше нечего сказать
2,f73533b8aab3b1d7b52488d954a46fa0,951410ef679f167e6515ea5e4d5fb92d,9a8017ac4e8b0ce55b0dbd75c2f25445,2021-04-03 15:04:47,2021-04-03,170.0,1.0,9.0,15.4,55.570245,37.576624,55.524880,37.589531,Volkswagen Polo,5.0,13.0,214.0,Все отлично!
3,43c6c249e4751b9ba0cf68de3e40a053,311ac9352166fb9b2f9153581b03ab5b,2f9a136bb3baa4912748a981033fd272,2021-04-03 15:05:58,2021-04-03,140.0,0.0,10.7,27.0,55.747910,37.691090,55.750109,37.585366,Hyundai Solaris,5.0,16.0,510.0,Водитель чихает без маски не прикрывая нос
4,b88dccdeaa7c5c1478915d7532082cda,cccd0c6d3ce5a7c553a1f28086beef5a,121a25147dafdb4565cf4eb6db7302f6,2021-04-03 15:10:29,2021-04-03,40.0,0.0,21.8,38.2,55.849331,37.494747,55.725583,37.449681,Kia Cerato,,40.0,296.0,Больше нечего сказать
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,dde84eeec63c85f1a1dc1f5321c9d424,a6e1800b310773694593ce2f810de76d,a0adfd4e6e617c21e9de8f9d006d143a,2021-04-09 23:37:09,2021-04-09,770.0,3.0,7.0,11.6,55.768613,37.580378,55.732811,37.535191,Kia Rio,5.0,27.0,25.0,"Ставит прибытие, не доезжая до А\nСтартует тач..."
1268,86a1d75a3366f9ccf0dd95459ef36c58,c79d0c270adf8cfc92570ab608d228b4,a5832f040002a98f67c884cf0e507a67,2021-04-09 23:40:11,2021-04-09,50.0,0.0,5.6,9.3,55.805612,37.522111,55.790116,37.496172,Toyota Camry,5.0,33.0,104.0,Больше нечего сказать
1269,ef394c9f24453dd4e7d9a563a761f4d8,3694cb26b900e5bc06f0b3fd62e7897b,d48161326ee0534ffc2eebfc4a77f569,2021-04-09 23:42:47,2021-04-09,140.0,1.0,15.9,24.2,55.709959,37.622206,55.780273,37.535656,Skoda Rapid,5.0,8.0,252.0,Больше нечего сказать
1270,010927d83eb31965dd0c63013ee125c4,09563133175aa456e9001c81dc331bdc,3f1c4dc975ec1464e7571c514cd707bc,2021-04-09 23:47:22,2021-04-09,20.0,0.0,2.7,7.0,55.578766,37.665268,55.597149,37.666500,Kia Ceed,5.0,25.0,304.0,Прокуреный салон.


In [None]:
# result_series = pd.Series(model_semisupervised.predict(X_test))
result_series = pd.Series(np.zeros(X_test.shape[0]))

In [None]:
result_series.name = 'is_aggressive'

In [None]:
result_series[5] = 1

result_series

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
1267    0.0
1268    0.0
1269    0.0
1270    0.0
1271    0.0
Name: is_aggressive, Length: 1272, dtype: float64

In [None]:
result_series.sum()

1.0

In [None]:
result_series.to_csv("result.csv")