In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-0.26-cp37-none-manylinux1_x86_64.whl (69.2 MB)
[K     |████████████████████████████████| 69.2 MB 12 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26


In [3]:
import numpy as np
import pandas as pd

from collections import Counter

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

In [94]:
RANDOM_STATE = 42

# Speed tracks

## Dowload and add <code>aggressive</code> column

In [None]:
speed_tracks = pd.read_csv("/content/drive/MyDrive/aiijc_transport_simpleteam/data/labled_train_tracks_speed.csv")

In [None]:
speed_tracks

Unnamed: 0.1,Unnamed: 0,driver_id,dt,lat_,lon_,order_id,speed
0,0,b76545fa3cc14acd6a69ac13c1edac33,2021-02-09 21:43:41,55.792710,37.545409,001662da857b5a39bb402aacf3145f86,
1,1,b76545fa3cc14acd6a69ac13c1edac33,2021-02-09 21:44:40,55.792013,37.544481,001662da857b5a39bb402aacf3145f86,5.906441
2,2,b76545fa3cc14acd6a69ac13c1edac33,2021-02-09 21:45:00,55.791365,37.543695,001662da857b5a39bb402aacf3145f86,15.696000
3,3,b76545fa3cc14acd6a69ac13c1edac33,2021-02-09 21:45:20,55.791267,37.543512,001662da857b5a39bb402aacf3145f86,2.862000
4,4,b76545fa3cc14acd6a69ac13c1edac33,2021-02-09 21:45:37,55.791175,37.543634,001662da857b5a39bb402aacf3145f86,2.710588
...,...,...,...,...,...,...,...
675219,675219,1c11e4febcc32f17e7fbbd20a86b9a59,2021-03-04 19:38:57,55.833093,37.491756,fffdfda358f35cf2d7f9d87d205655a7,8.517273
675220,675220,1c11e4febcc32f17e7fbbd20a86b9a59,2021-03-04 19:39:14,55.832650,37.493704,fffdfda358f35cf2d7f9d87d205655a7,27.783529
675221,675221,1c11e4febcc32f17e7fbbd20a86b9a59,2021-03-04 19:39:19,55.832650,37.493704,fffdfda358f35cf2d7f9d87d205655a7,0.000000
675222,675222,1c11e4febcc32f17e7fbbd20a86b9a59,2021-03-04 19:39:37,55.833031,37.495477,fffdfda358f35cf2d7f9d87d205655a7,23.720000


In [None]:
speed_tracks.speed.mean()

17.024485393003463

In [None]:
speed_tracks.groupby('order_id').speed.mean()

order_id
001662da857b5a39bb402aacf3145f86    15.467769
001a8da3b83fe77dd0f3a406acb414c2    11.176727
001eaa75ad97eff199d914f67b8e32ef    18.818378
001f7deb75b27d8ef5a68204a0bd86a2    10.589347
002946d95ca1bc6954858b63c202bbb1    22.557721
                                      ...    
ffdb0446f266cdc96fe79b9ff4e5ca35    13.007567
ffe343aba3979857c5d268011147d720    15.446079
ffeb60427212d142a2aa6dc9fb23713f    25.301565
fffd991bb7f71a047aa7e982fd7ceea8    66.153396
fffdfda358f35cf2d7f9d87d205655a7    19.298484
Name: speed, Length: 9000, dtype: float64

In [None]:
(speed_tracks.speed == 0).sum() / speed_tracks.shape[0]

0.41399150504129

In [None]:
from tqdm.notebook import tqdm

speed_tracks['is_aggressive'] = np.zeros(speed_tracks.shape[0])

for obj in tqdm(train_labeled.values):
    speed_tracks["is_aggressive"].loc[(speed_tracks.order_id == obj[0])] = obj[-1]

HBox(children=(FloatProgress(value=0.0, max=9000.0), HTML(value='')))




In [None]:
speed_tracks.to_csv("labled_train_tracks_speed.csv")
!cp labled_train_tracks_speed.csv /content/drive/MyDrive/aiijc_transport_simpleteam/data/

## Analysis

In [176]:
speed_tracks = pd.read_csv("/content/drive/MyDrive/aiijc_transport_simpleteam/data/labled_train_tracks_speed.csv")

In [177]:
speed_tracks.speed.dropna(inplace=True, axis=0)
speed_tracks.drop(speed_tracks[speed_tracks.speed == 0].index, axis=0, inplace=True)

In [178]:
speed_tracks.speed.mean()

29.051632103536587

In [179]:
speed_tracks.groupby('order_id').speed.mean()

order_id
001662da857b5a39bb402aacf3145f86    22.746719
001a8da3b83fe77dd0f3a406acb414c2    21.654909
001eaa75ad97eff199d914f67b8e32ef    35.716513
001f7deb75b27d8ef5a68204a0bd86a2    19.797474
002946d95ca1bc6954858b63c202bbb1    30.784655
                                      ...    
ffdb0446f266cdc96fe79b9ff4e5ca35    27.015715
ffe343aba3979857c5d268011147d720    23.623415
ffeb60427212d142a2aa6dc9fb23713f    39.261049
fffd991bb7f71a047aa7e982fd7ceea8    79.793271
fffdfda358f35cf2d7f9d87d205655a7    25.392742
Name: speed, Length: 9000, dtype: float64

In [180]:
speed_tracks.is_aggressive.value_counts()

0.0    372920
1.0     22767
Name: is_aggressive, dtype: int64

In [181]:
speed_tracks.groupby('is_aggressive').speed.mean()

is_aggressive
0.0    29.024081
1.0    29.502921
Name: speed, dtype: float64

In [182]:
speed_tracks_X = [list(i[1].speed) for i in speed_tracks.groupby('order_id')]
speed_tracks_y = [list(i[1].is_aggressive) for i in speed_tracks.groupby('order_id')]

In [183]:
speed_tracks_X

[[nan,
  5.906440677966101,
  15.696,
  2.862,
  2.710588235294118,
  22.527,
  12.020869565217387,
  34.86,
  12.87,
  12.32,
  31.3,
  24.34,
  22.031999999999996,
  51.16235294117647,
  27.70285714285714,
  7.432941176470589,
  3.72,
  8.046,
  8.568,
  20.736,
  37.48,
  41.78,
  26.226,
  18.342,
  37.53,
  34.47,
  27.497142857142855,
  2.032941176470588,
  21.753,
  11.2,
  12.705882352941178,
  27.92571428571428,
  0.6810810810810811,
  6.61090909090909,
  32.379999999999995,
  20.914285714285715,
  36.504,
  48.917647058823526,
  45.162,
  33.90260869565217,
  10.651764705882352,
  27.8,
  42.31800000000001,
  32.75076923076923,
  55.19368421052632,
  23.88,
  28.662857142857145,
  26.0,
  30.90688524590164,
  4.914,
  12.175384615384614,
  14.00108108108108],
 [0.06407593384735254,
  4.167,
  12.245901639344261,
  8.353734939759036,
  49.373999999999995,
  15.498,
  26.480655737704915,
  50.53263157894736,
  42.55672131147541,
  2.7,
  64.296,
  16.401176470588233,
  12.834,


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(speed_tracks_X, speed_tracks_y)

In [None]:
CatBoostClassifier().fit(X_train, y_train)

CatBoostError: ignored

# Model code

In [95]:
class Model:
    def __init__(self):        
        self.model = None
        self.counter_words = {}
    
    def count_words(self, x):
        return len(x.split(" "))

    def check_sentence(self, sentence, words_type):
        words_count = 0
        for word in sentence.split(" "):
            word = word.lower().replace(',', '').replace('.', '')

            if (word not in list(self.counter_words.keys()) or len(self.counter_words[word]) == 2): continue

            if (words_type == self.counter_words[word][2]): 
                words_count += 1
        return words_count
        
    def NLP_preprocess(self, X ,y):
        dataset_joined = X.join(y)
        comment_phrases = list(dataset_joined.comment.value_counts().index[: 10])
        
        dataset_joined['is_comment'] = (~np.isin(dataset_joined.comment, comment_phrases)).astype(int)
        
        aggressive_comments = dataset_joined[(dataset_joined['is_comment'] == True) & (dataset_joined.is_aggressive == True)].comment.values
        normal_comments = dataset_joined[(dataset_joined['is_comment'] == True) & (dataset_joined.is_aggressive == False)].comment.values
        
        stop_words = ['на', 'по', 'с', 'в', 'что', 'и', 'а']

        for sentence in normal_comments:
            for word in sentence.split(" "):
                word = word.lower().replace(',', '').replace('.', '')
                if (word in stop_words): continue
                if (word in self.counter_words.keys()):
                    self.counter_words[word][0] += 1
                else: self.counter_words[word] = [1, 0]

        for sentence in aggressive_comments:
            for word in sentence.split(" "):
                word = word.lower().replace(',', '').replace('.', '')
                if (word in stop_words): continue
                if (word in self.counter_words.keys()):
                    self.counter_words[word][1] += 1
                else: self.counter_words[word] = [0, 1]
        
        
        count_all_words = np.array(list(map(lambda x: np.array(x), np.array(list(self.counter_words.items())).T[1]))).T
        
        count_normal_words = count_all_words[0].sum()
        count_aggressive_words = count_all_words[1].sum()

        for word_pair in list(self.counter_words.items()):
            if (word_pair[1][1] == 0 and word_pair[1][0] > 0):
                self.counter_words[word_pair[0]].append("normal")
                continue

            if (word_pair[1][0] == 0 and word_pair[1][1] > 0):
                self.counter_words[word_pair[0]].append("aggressive")
                continue

            ratio_aggressive = word_pair[1][1] / count_aggressive_words
            ratio_normal = word_pair[1][0] / count_normal_words

            if (ratio_aggressive / ratio_normal >= 3):
                self.counter_words[word_pair[0]].append("aggressive")
                continue

            if (ratio_normal / ratio_aggressive >= 3):
                self.counter_words[word_pair[0]].append("normal")
                continue

            self.counter_words[word_pair[0]].append("neutral")

    def add_features(self, X):
        comment_phrases = list(X.comment.value_counts().index[: 5]) + ["---"]
        
        X["is_comment"] = (~np.isin(X.comment, comment_phrases)).astype(int)
        X['dttm'] = pd.to_datetime(X.dttm)
        X['hour'] = X.dttm.apply(lambda x: x.hour)
        X['traff_jam'] = ((X.hour > 6) & (X.hour < 10)) | ((X.hour > 17) & (X.hour < 23))
        X['traff_jam'] = X.traff_jam.astype(int)
        X['weekday'] = X.dttm.apply(lambda x: x.weekday())
        X['holiday'] = (X.weekday >= 5).astype(int)
        X["count_words"] = [-1] * X.shape[0]
        X.loc[X.is_comment == True, "count_words"] = X[X.is_comment == True].comment.apply(lambda x: self.count_words(x))
        X["speed"] = X.distance / (X.duration / 60)
        X['agg_words'] = X.comment.apply(lambda x: self.check_sentence(x, "aggressive"))
        X['normal_words'] = X.comment.apply(lambda x: self.check_sentence(x, "normal"))
        X['distance_thresh'] = ((X.distance > 5) & (X.distance < 20)).astype(int)
        
        return X
    
    def estimate(self, X, y):
        return roc_auc_score(y, self.predict_proba(X, add_feat=False))
    
    def train_test_split_(self, X, y, test_size, X_ss=None, y_ss=None, random_state=RANDOM_STATE):
        if (X_ss is not None):
            X_ss_full, y_ss_full = self.label_shuffle(X, y, X_ss, y_ss, random_state = random_state)
            
            len_train = len(X_ss_full) - round(len(X_ss_full) * test_size)
            
            x_train = X_ss_full[: len_train]
            x_train.drop('ss', axis = 1, inplace = True)
            
            x_test = X_ss_full.iloc[len_train + 1:]
            x_test = x_test[x_test.ss == 0]
            x_test.drop('ss', axis = 1, inplace = True)
            
            y_train = y_ss_full[: len_train]
            y_train.drop('ss', axis = 1, inplace = True)
            
            y_test = y_ss_full.iloc[len_train + 1:]
            y_test = y_test[y_test.ss == 0]
            y_test.drop('ss', axis = 1, inplace = True)
            
            return (x_train, x_test, y_train, y_test)
        
        len_train = len(X) - round(len(X) * test_size)
        
        X = X.sample(frac=1, random_state=random_state)
        y = y.sample(frac=1, random_state=random_state)
        
        return (X[: len_train], X[len_train :], y[: len_train], y[len_train :])
    
    def train(self, X_train, X_test, y_train, y_test, categorical_feature, random_state=RANDOM_STATE):
        print(f"Train size: {X_train.shape}")
        print(f"Test size: {X_test.shape}")
        self.model = CatBoostClassifier(iterations=2000,
                           depth=2,
                           silent=True,
                           loss_function='Logloss',
                           class_weights=(1, 2),
                           random_state=random_state)

        self.model.fit(X_train, y_train, cat_features=categorical_features)
        
        return self.estimate(X_test, y_test)
    
    def label_shuffle(self, X, y, X_ss, y_ss, random_state=RANDOM_STATE):
        X_ss['ss'] = 1
        y_ss = y_ss.to_frame()
        y_ss['ss'] = 1

        X['ss'] = 0
        y['ss'] = 0

        X_ss_full = pd.concat([X, X_ss]).sample(frac=1, random_state=random_state)
        y_ss_full = pd.concat([y, y_ss]).sample(frac=1, random_state=random_state)
        
        return (X_ss_full, y_ss_full)
    
    def train_cross_validation(self, X, y, k, categorical_features, X_ss=None, y_ss=None, random_state=RANDOM_STATE):
        chunk_size = len(X) / k
        chunks_size = [(i*chunk_size, i*chunk_size + chunk_size) for i in range(k)]
        
        result_score = []
        
        print(f"Part size: {chunk_size}")
        
        if (X_ss is not None):
            X_ss_full, y_ss_full = self.label_shuffle(X, y, X_ss, y_ss, random_state = random_state)
            
            for chunkIndex in range(len(chunks_size)):
                x_test = X_ss_full[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                y_test = y_ss_full[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                
                x_train = X_ss_full.drop(x_test.index, axis = 0)
                y_train = y_ss_full.drop(y_test.index, axis = 0)
                
                x_test = x_test[x_test.ss == 0]
                y_test = y_test[y_test.ss == 0]
                
                x_train.drop('ss', axis = 1, inplace = True)
                y_train.drop('ss', axis = 1, inplace = True)
                x_test.drop('ss', axis = 1, inplace = True)
                y_test.drop('ss', axis = 1, inplace = True)
                
                score = self.train(x_train, x_test, y_train, y_test, categorical_features)
                
                print(f"Chunk {chunkIndex}; Score: {score}")
                
                result_score.append((chunks_size[chunkIndex], score))
        else:            
            for chunkIndex in range(len(chunks_size)):
                x_test = X[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                y_test = y[int(chunks_size[chunkIndex][0]) : int(chunks_size[chunkIndex][1])]
                
                x_train = X.drop(x_test.index, axis = 0)
                y_train = y.drop(y_test.index, axis = 0)
                
                score = self.train(x_train, x_test, y_train, y_test, categorical_features)
                
                print(f"Chunk {chunkIndex}; Score: {score}")
                
                result_score.append((chunks_size[chunkIndex], score))
            
        print(f"Mean score: {sum(list(map(lambda x: x[1], result_score))) / k}")
        
        return result_score
    
    def fit_ss(self, X, y, numeric_features, categorial_features, X_ss, y_ss, cross_validation=False, random_state=RANDOM_STATE):
        self.counter_words = {}
        
        X_ = X
        y_ = y
        
        self.NLP_preprocess(pd.concat([X_, X_ss]), pd.concat([y_, y_ss]))
        X_ = self.add_features(X_)[numeric_features + categorical_features]
        
        X_ss = self.add_features(X_ss)[numeric_features + categorical_features]
        
        if (not cross_validation):
            X_train, X_test, y_train, y_test = self.train_test_split_(X_, y_, test_size=0.2, X_ss=X_ss, y_ss=y_ss, random_state=random_state)
            return self.train(X_train, X_test, y_train, y_test, categorical_features)
        else:
            return self.train_cross_validation(X_, y_, 5, categorical_features, X_ss=X_ss, y_ss=y_ss, random_state=random_state)
        
        
    def fit(self, X, y, numeric_features, categorial_features, cross_validation=False, random_state=RANDOM_STATE):
        self.counter_words = {}
        
        X_ = X
        y_ = y
        
        self.NLP_preprocess(X_, y_)
        X_ = self.add_features(X_)[numeric_features + categorical_features]

        if (not cross_validation):
            X_train, X_test, y_train, y_test = self.train_test_split_(X_, y_, test_size=0.2, random_state=random_state)
            return self.train(X_train, X_test, y_train, y_test, categorical_features)
        else:
            return self.train_cross_validation(X_, y_, 5, categorical_features, random_state=random_state)
        
    
    def predict_proba(self, X, add_feat=True):
        if (add_feat): X = self.add_features(X)
        
        X = X[numeric_features + categorical_features]
        
        return self.model.predict_proba(X).T[1]
    
    def predict(self, X, add_feat=True):
        if (add_feat): X = self.add_features(X)
        
        X = X[numeric_features + categorical_features]
        
        return self.model.predict(X)
    
    def predict_thresh(self, X, thresh_above, thresh_below):
        y_unlab_full = self.predict_proba(X)
        
        y_unlab = pd.Series([-1 for i in range(len(X))])
        
        print("Thresh above: {}".format(sum(y_unlab_full >= thresh_above) / len(y_unlab_full)))
        print("Thresh below: {}".format(sum(y_unlab_full <= thresh_below) / len(y_unlab_full)))
        
        y_unlab.iloc[np.where(y_unlab_full >= thresh_above)] = 1
        y_unlab.iloc[np.where(y_unlab_full <= thresh_below)] = 0
        
        return y_unlab

# Train on labled data

In [96]:
train_labeled = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/labled_train_data.csv', index_col=0, sep='\t', comment='#')

X_ = train_labeled.iloc[:, :-1]
y_ = train_labeled.iloc[:, -1:]

X_['client_rate_ride'] = X_['client_rate_ride'].fillna(X_['client_rate_ride'].mean())
X_['client_rides_cnt'] = X_['client_rides_cnt'].fillna(X_['client_rides_cnt'].mean())
X_['driver_rides_cnt'] = X_['driver_rides_cnt'].fillna(X_['driver_rides_cnt'].mean())

In [97]:
numeric_features = ['distance', 'arrived_distance', 'arrived_duration', 'duration', 'driver_rides_cnt', 'client_rides_cnt', 'client_rate_ride', 'count_words']

categorical_features = ['mark', 'is_comment', 'hour', 'weekday', 'agg_words', 'normal_words']

In [98]:
model_supervised = Model()

model_supervised.fit(X_, y_, numeric_features, categorical_features)

Train size: (7200, 14)
Test size: (1800, 14)


0.8362260536398467

In [99]:
model_cv = Model()

model_cv.fit(X_, y_, numeric_features, categorical_features, cross_validation = True)

Part size: 1800.0
Train size: (7200, 14)
Test size: (1800, 14)
Chunk 0; Score: 0.7501325765301662
Train size: (7200, 14)
Test size: (1800, 14)
Chunk 1; Score: 0.7869791274206916
Train size: (7200, 14)
Test size: (1800, 14)
Chunk 2; Score: 0.7333526906697638
Train size: (7200, 14)
Test size: (1800, 14)
Chunk 3; Score: 0.7684012885885325
Train size: (7200, 14)
Test size: (1800, 14)
Chunk 4; Score: 0.7787730470989159
Mean score: 0.7635277460616141


[((0.0, 1800.0), 0.7501325765301662),
 ((1800.0, 3600.0), 0.7869791274206916),
 ((3600.0, 5400.0), 0.7333526906697638),
 ((5400.0, 7200.0), 0.7684012885885325),
 ((7200.0, 9000.0), 0.7787730470989159)]

# Semi-supervised train

## Preprocessing

In [100]:
X_unlab = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/unlabled_train_data.csv', index_col=0, sep='\t', comment='#')
comments_unlabeled = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/unlabled_train_comments.csv', index_col=0, sep='\t', comment='#')
tracks_unlabeled = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/unlabled_train_tracks.csv', index_col=0, sep='\t', comment='#')

X_unlab['client_rate_ride'] = X_unlab['client_rate_ride'].fillna(X_unlab['client_rate_ride'].mean())
X_unlab['client_rides_cnt'] = X_unlab['client_rides_cnt'].fillna(X_unlab['client_rides_cnt'].mean())
X_unlab['driver_rides_cnt'] = X_unlab['driver_rides_cnt'].fillna(X_unlab['driver_rides_cnt'].mean())

In [101]:
sum(X_unlab.comment.isna()) / len(X_unlab.comment)

0.64866629360194

In [102]:
np.where(X_unlab.comment.isna())[0]

array([ 2762,  3239,  3574, ..., 10719, 10720, 10721])

In [103]:
for nanIndex in np.where(X_unlab.comment.isna())[0]:
    obj_comment = comments_unlabeled.loc[nanIndex]
    
    if (len(obj_comment) != 0):
        X_unlab.comment.iloc[nanIndex] = obj_comment.comment

In [104]:
sum(X_unlab.comment.isna()) / len(X_unlab.comment)

0.00018653236336504383

In [105]:
X_unlab.comment.iloc[np.where(X_unlab.comment.isna())[0]] = "---"

## Prediction&filling

In [115]:
y_unlab = model_supervised.predict_thresh(X_unlab, 0.99, 0.001)

Thresh above: 0.003357582540570789
Thresh below: 0.0


In [116]:
y_unlab.name = "is_aggressive"

In [117]:
y_unlab.value_counts()

-1    10686
 1       36
Name: is_aggressive, dtype: int64

In [118]:
X_unlab_lab = X_unlab.iloc[np.where(y_unlab != -1)]
y_unlab_lab = y_unlab.iloc[np.where(y_unlab != -1)]

In [119]:
X_unlab_lab.comment

7        1)Водитель играл в «шашки» на дороге и игрался...
202              2 раза списали деньги, верните пожалуйста
351      засыпал за рулём,  съезжал с дороги, приходило...
551               резкие повороты. водитель резко тормозил
568      Водитель смотрит кино на втором телефоне, проп...
705      Вел медленно по бордовым дорогам, сказал «пешк...
1317     Водитель резко тормозил, обгонял, кричал на др...
1378                             Водитель вел себя грубо. 
1634     Водитель постоянно громко разговаривал на своё...
1667     Водитель в возрасте, несколько раз отвечал на ...
1707     Водитель 2 раза поругался с другими таксистами...
1975     Водитель опасно вел автомобиль. Было ощущение ...
2221     Водитель явно засыпал за рулём, постоянно зева...
2261      Приехала другая машина вместо указанной в заказе
2302                       Водитель неадекватно себя вёл, 
2346     Проехал на красный свет дважды на перекрестках...
2472                      водит опасно и очень неаккурат

In [120]:
X_unlab_lab.is_comment.value_counts()

1    36
Name: is_comment, dtype: int64

## Train

In [121]:
model_semisupervised = Model()

model_semisupervised.fit_ss(X_, y_, numeric_features, categorical_features, X_unlab_lab, y_unlab_lab)

Train size: (7229, 14)
Test size: (1798, 14)


0.8500983486574265

In [122]:
model_ss_cv = Model()

model_ss_cv.fit_ss(X_, y_, numeric_features, categorical_features, X_unlab_lab, y_unlab_lab, cross_validation=True)

Part size: 1800.0
Train size: (7224, 14)
Test size: (1791, 14)


KeyboardInterrupt: ignored

# Prediction

In [123]:
X_test = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/labled_test_data.csv', index_col=0, sep='\t', comment='#')
tracks_test = pd.read_csv('/content/drive/MyDrive/aiijc_transport_simpleteam/data/base_files/labled_test_tracks.csv', index_col=0, sep='\t', comment='#')

In [124]:
X_test

Unnamed: 0,order_id,driver_id,client_id,dttm,date,arrived_distance,arrived_duration,distance,duration,from_latitude,from_longitude,to_latitude,to_longitude,mark,client_rate_ride,client_rides_cnt,driver_rides_cnt,comment
0,49430c6531d098d3b288d95d7d1e7f4f,21348747875e0b01bc492d32b49c638d,4d4d06feddc5669339b1cd9d7941a116,2021-04-03 14:49:46,2021-04-03,180.0,1.0,6.1,19.4,55.822578,37.596844,55.792253,37.599762,Skoda Octavia,5.0,36.0,241.0,Больше нечего сказать
1,d3fff1a829a5b5ccbccd40c9895ff4b0,398fe459519b5facba93168b71df0625,2a41e0a45a19a892e960d6c4bef5c27b,2021-04-03 15:02:44,2021-04-03,790.0,2.0,6.4,19.0,55.609624,37.719165,55.596660,37.763549,Hyundai Solaris,5.0,19.0,193.0,Больше нечего сказать
2,f73533b8aab3b1d7b52488d954a46fa0,951410ef679f167e6515ea5e4d5fb92d,9a8017ac4e8b0ce55b0dbd75c2f25445,2021-04-03 15:04:47,2021-04-03,170.0,1.0,9.0,15.4,55.570245,37.576624,55.524880,37.589531,Volkswagen Polo,5.0,13.0,214.0,Все отлично!
3,43c6c249e4751b9ba0cf68de3e40a053,311ac9352166fb9b2f9153581b03ab5b,2f9a136bb3baa4912748a981033fd272,2021-04-03 15:05:58,2021-04-03,140.0,0.0,10.7,27.0,55.747910,37.691090,55.750109,37.585366,Hyundai Solaris,5.0,16.0,510.0,Водитель чихает без маски не прикрывая нос
4,b88dccdeaa7c5c1478915d7532082cda,cccd0c6d3ce5a7c553a1f28086beef5a,121a25147dafdb4565cf4eb6db7302f6,2021-04-03 15:10:29,2021-04-03,40.0,0.0,21.8,38.2,55.849331,37.494747,55.725583,37.449681,Kia Cerato,,40.0,296.0,Больше нечего сказать
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,dde84eeec63c85f1a1dc1f5321c9d424,a6e1800b310773694593ce2f810de76d,a0adfd4e6e617c21e9de8f9d006d143a,2021-04-09 23:37:09,2021-04-09,770.0,3.0,7.0,11.6,55.768613,37.580378,55.732811,37.535191,Kia Rio,5.0,27.0,25.0,"Ставит прибытие, не доезжая до А\nСтартует тач..."
1268,86a1d75a3366f9ccf0dd95459ef36c58,c79d0c270adf8cfc92570ab608d228b4,a5832f040002a98f67c884cf0e507a67,2021-04-09 23:40:11,2021-04-09,50.0,0.0,5.6,9.3,55.805612,37.522111,55.790116,37.496172,Toyota Camry,5.0,33.0,104.0,Больше нечего сказать
1269,ef394c9f24453dd4e7d9a563a761f4d8,3694cb26b900e5bc06f0b3fd62e7897b,d48161326ee0534ffc2eebfc4a77f569,2021-04-09 23:42:47,2021-04-09,140.0,1.0,15.9,24.2,55.709959,37.622206,55.780273,37.535656,Skoda Rapid,5.0,8.0,252.0,Больше нечего сказать
1270,010927d83eb31965dd0c63013ee125c4,09563133175aa456e9001c81dc331bdc,3f1c4dc975ec1464e7571c514cd707bc,2021-04-09 23:47:22,2021-04-09,20.0,0.0,2.7,7.0,55.578766,37.665268,55.597149,37.666500,Kia Ceed,5.0,25.0,304.0,Прокуреный салон.


In [126]:
result_series = pd.Series(model_semisupervised.predict(X_test))

In [127]:
result_series.name = 'is_aggressive'

In [128]:
result_series

0       0
1       0
2       0
3       1
4       0
       ..
1267    0
1268    0
1269    0
1270    0
1271    0
Name: is_aggressive, Length: 1272, dtype: int64

In [129]:
result_series.sum()

25

In [130]:
result_series.to_csv("result.csv")