<a href="https://colab.research.google.com/github/Egoluback/aijic2021_transport/blob/yarik/model4track.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
import time
import seaborn as sns
import matplotlib.pyplot as plt
from typing import Any, Tuple, Union
import pickle
from tqdm import tqdm
from os.path import exists

In [4]:
path = "./"

train_labeled = pd.read_csv(path + 'data/base_files/labled_train_data.csv', index_col=0, sep='\t', comment='#')
comments_labeled = pd.read_csv(path + 'data/base_files/labled_train_comments.csv', index_col=0, sep='\t', comment='#')
tracks_labeled = pd.read_csv(path + 'data/labled_train_tracks_speed.csv', index_col=0, sep=',', comment='#')

train_unlabeled = pd.read_csv(path +'data/base_files/unlabled_train_data.csv', index_col=0, sep='\t', comment='#')
comments_unlabeled = pd.read_csv(path+ 'data/base_files/unlabled_train_comments.csv', index_col=0, sep='\t', comment='#')
tracks_unlabeled = pd.read_csv(path+ 'data/unlabled_train_tracks_speed.csv', index_col=0, sep=',', comment='#')

In [6]:
tracks_unlabeled

Unnamed: 0,driver_id,dt,lat_,lon_,order_id,speed
0,6bcc649b6ec22251179da12125d04011,2021-03-29 11:55:56,55.757886,37.406491,0000a57c86cabd27d707a5fde1d0fbe4,
1,6bcc649b6ec22251179da12125d04011,2021-03-29 11:56:02,55.757886,37.406491,0000a57c86cabd27d707a5fde1d0fbe4,0.000000
2,6bcc649b6ec22251179da12125d04011,2021-03-29 11:56:23,55.758017,37.406500,0000a57c86cabd27d707a5fde1d0fbe4,2.228571
3,6bcc649b6ec22251179da12125d04011,2021-03-29 11:56:43,55.757987,37.406500,0000a57c86cabd27d707a5fde1d0fbe4,0.540000
4,6bcc649b6ec22251179da12125d04011,2021-03-29 11:57:04,55.758015,37.406495,0000a57c86cabd27d707a5fde1d0fbe4,0.514286
...,...,...,...,...,...,...
674713,4cbab2104a47e4ea966c7f2ecd8f4775,2021-03-26 20:44:42,55.656228,37.494256,fffface895e65d8da177137701b1ee98,0.000000
674714,4cbab2104a47e4ea966c7f2ecd8f4775,2021-03-26 20:45:04,55.656143,37.494459,fffface895e65d8da177137701b1ee98,3.600000
674715,4cbab2104a47e4ea966c7f2ecd8f4775,2021-03-26 20:45:24,55.655037,37.496817,fffface895e65d8da177137701b1ee98,46.980000
674716,4cbab2104a47e4ea966c7f2ecd8f4775,2021-03-26 20:45:46,55.654412,37.498307,fffface895e65d8da177137701b1ee98,26.509091


In [7]:
tracks_unlabeled.head()

Unnamed: 0,driver_id,dt,lat_,lon_,order_id,speed
0,6bcc649b6ec22251179da12125d04011,2021-03-29 11:55:56,55.757886,37.406491,0000a57c86cabd27d707a5fde1d0fbe4,
1,6bcc649b6ec22251179da12125d04011,2021-03-29 11:56:02,55.757886,37.406491,0000a57c86cabd27d707a5fde1d0fbe4,0.0
2,6bcc649b6ec22251179da12125d04011,2021-03-29 11:56:23,55.758017,37.4065,0000a57c86cabd27d707a5fde1d0fbe4,2.228571
3,6bcc649b6ec22251179da12125d04011,2021-03-29 11:56:43,55.757987,37.4065,0000a57c86cabd27d707a5fde1d0fbe4,0.54
4,6bcc649b6ec22251179da12125d04011,2021-03-29 11:57:04,55.758015,37.406495,0000a57c86cabd27d707a5fde1d0fbe4,0.514286


each order is different batch

In [2]:
class Preprocessing_labeled():
    def __init__(self, tracks_labeled,  chunk_size=15, multiplier=1):
        self.tracks = tracks_labeled
        self.MULTIPLIER = multiplier
        self.CHUNK_SIZE = chunk_size

    # undersampling method deletes some extra non aggressive values
    def undersampling(self, X):
        aggressive_count = sum(X.is_aggressive == 1)
        non_aggressive_ind = X[X.is_aggressive == 0].index

        # keep aggressive_count*multiplier number of non_aggressive samples
        random_indices = np.random.choice(
            non_aggressive_ind, aggressive_count*self.MULTIPLIER, replace=False)
        return pd.concat([X.loc[random_indices], X[X.is_aggressive == 1]])

    def split(self, arr, chunk_size=15):
        result = []
        # get right length of arr so that it equally splits into chunks
        length = len(arr)
        split_length = length - (length % self.CHUNK_SIZE)

        for i in range(split_length)[self.CHUNK_SIZE::self.CHUNK_SIZE]:
            result.append(arr[i-self.CHUNK_SIZE:i])

        return np.array(result)

    # make df, so that each row has Series object with order speeds
    def make_nested(self, tracks,):
        copied_tracks = tracks.copy()
        orders = copied_tracks.drop_duplicates('order_id', keep='last')
        orders = self.undersampling(orders)

        y_labels = []
        X_train = []
        for order in tqdm(orders['order_id']):
            order_df = copied_tracks[copied_tracks.order_id == order]
            order_df.loc[0, 'speed'] = 0

            if order_df.shape[0] < self.CHUNK_SIZE:
                continue

            splitted_arrs = self.split(order_df.values, self.CHUNK_SIZE)
            for arr in splitted_arrs:
                is_aggressive = arr[0][7]
                y_labels.append(is_aggressive)
                speed_series = []
                for row in arr:  
                    # append only speed 
                    speed_series.append(row[6])
                X_train.append(pd.Series(speed_series))

        return X_train, y_labels

In [1]:

class Preprocessing_unlabeled():
    def __init__(self, tracks_unlabeled, chunk_size=15, multiplier=1):
        self.CHUNK_SIZE = chunk_size
        self.MULTIPLIER = multiplier
        self.tracks = tracks_unlabeled

    def split(self, arr,):
        result = []
        # get right length of arr so that it equally splits into chunks
        length = len(arr)
        split_length = length - (length % self.CHUNK_SIZE)

        for i in range(split_length)[self.CHUNK_SIZE::self.CHUNK_SIZE]:
            result.append(arr[i-self.CHUNK_SIZE:i])

        return np.array(result)

    # undersampling method deletes extra non aggressive values
    def undersampling(self, X, y):
        aggressive_count = np.sum(y)
        non_aggressive_ind = np.argwhere(y == 0.0).flatten()
        aggressive_ind = np.argwhere(y == 1.0).flatten()
        # keep aggressive_count*multiplier number of non_aggressive samples
        random_indices = np.random.choice(
            non_aggressive_ind, int(aggressive_count)*self.MULTIPLIER, replace=False)
        print(random_indices)
        undersampling_X = np.concatenate(
            [np.take(X, random_indices, 0), 
             np.take(X, aggressive_ind, 0)])
        undersampling_y = np.concatenate(
            [np.take(y, aggressive_ind), 
             np.take(y, random_indices)])
        return undersampling_X, undersampling_y

    # make df, so that each row has Series object with order speeds
    def make_nested(self, tracks):
        orders = tracks.drop_duplicates('order_id', keep='last')

        X_train = []
        for order in tqdm(orders['order_id']):
            order_df = tracks[tracks.order_id == order]
            order_df.loc[0, 'speed'] = 0

            if order_df.shape[0] < self.CHUNK_SIZE:
                continue
            
            splitted_arrs = self.split(order_df.values)
            for arr in splitted_arrs:
                
                speed_series = []
                for row in arr:
                    # append only speed
                    speed_series.append(row[5])
                X_train.append(pd.Series(speed_series))

        return pd.DataFrame({'speed': X_train})


In [10]:
testing = "dummy"
testing2 = "dummy"

In [6]:
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sktime.transformations.panel.rocket import MiniRocket
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
import sklearn.linear_model
import json

#Classifier = Union[LogisticRegressionCV,
#                   LogisticRegressionCV, RidgeClassifier, RidgeClassifierCV]

class Model4track():
    def __init__(self, train_labeled, tracks_labeled, tracks_unlabeled, labeled_trained_model_path, semisupervised_model_path, get_score=True):
        self.rocket = MiniRocket()
        self.get_score = get_score
        self.train_labeled = train_labeled
        self.tracks_unlabeled = tracks_unlabeled
        self.tracks_labeled = tracks_labeled
        self.preprocessing_unlabeled = Preprocessing_unlabeled(
            self.tracks_unlabeled,15,1)
        self.preprocessing_labeled = Preprocessing_labeled(
            self.tracks_labeled,15,1)
        self.labled_trained_model_path = labeled_trained_model_path 
        self.semisupervised_model_path = semisupervised_model_path 

    def train(self, X_train_transform, y_train, X_test_transform, y_test):

        classifier = RidgeClassifier(normalize=True)
        if self.get_score:
            classifier = RidgeClassifierCV(normalize=True)
        classifier.fit(X_train_transform, y_train)
        with open(self.semisupervised_model_path, 'wb+') as file:
            pickle.dump(classifier, file)
        if self.get_score:
            print("________________SCORE____________________")
            print(classifier.score(X_test_transform, y_test))

        return classifier

    def ss_train(self) -> sklearn.linear_model:
        '''
        Semi-supervised training algorithm. We train classifier on our labeled data, 
        then use this classifier to label unlabeled data 
        and retrain classifier on this pseudo-labeled data 
        ''' 
        def convert(x):
            if hasattr(x, "tolist"):  # numpy arrays have this
                return {"$array": x.tolist()}  # Make a tagged object
            raise TypeError(x)

        def deconvert(x):
            if len(x) == 1:  # Might be a tagged object...
                key, value = next(iter(x.items()))  # Grab the tag and value
                if key == "$array":  # If the tag is correct,
                    return np.array(value)  # cast back to array
            return x

        cached_data_path = 'data/labeled_preprocessed.json'
        cached_y_data_path = 'data/labeled_y_preprocessed.json'
        if exists(cached_data_path):
            with open(cached_data_path) as X_file, open(cached_y_data_path) as y_file:
                X_train = np.array(json.load(X_file, object_hook=deconvert))
                X_train = [pd.Series(arr) for arr in X_train]
                y_train = json.load(y_file, object_hook=deconvert)
        else:
            X_train, y_train = self.preprocessing_labeled.make_nested(self.tracks_labeled)
            with open(cached_data_path, 'w') as X_file, open(cached_y_data_path, 'w') as y_file:
                json.dump(X_train, X_file, default=convert)
                json.dump(y_train, y_file, default=convert)

        if self.get_score:
            X_train, X_test, y_train, y_test = train_test_split(
                X_train, y_train, test_size=0.33)
            X_test = pd.DataFrame({'speed':X_test})
            y_test = np.array(y_test)
        
        X_train = pd.DataFrame({'speed':X_train})
        y_train = np.array(y_train)

        # training classifier on labeled data
        self.rocket.fit(X_train)
        X_train_transform = self.rocket.transform(X_train)
        if self.get_score:
            X_test_transform = self.rocket.transform(X_test)

        # check if pretrained model exists
        if not exists(self.labled_trained_model_path):
                        
            classifier = self.train(
                X_train_transform, y_train, X_test_transform, y_test)

            with open(self.labled_trained_model_path ,mode='wb') as file:
                pickle.dump(classifier, file)
        else:
            with open(self.labled_trained_model_path ,mode='rb') as file:
                classifier = pickle.load(file)

        # unsupervised learning
        unlabled_preprocessed_path = './data/pseudo_X.json'
        if exists(unlabled_preprocessed_path):
            with open(unlabled_preprocessed_path) as file:
                pseudo_X = json.load(file, object_hook=deconvert)
        else:
            pseudo_X = self.preprocessing_unlabeled.make_nested(self.tracks_unlabeled)
            with open(unlabled_preprocessed_path, 'w') as file:
                json.dump(pseudo_X.to_numpy(), file, default = convert)


        #pseudo_X = pd.DataFrame(pseudo_X)
        print(pseudo_X)

        self.rocket.fit(pseudo_X)
        transformed_pseudo_X = self.rocket.transform(pseudo_X)
        pseudo_y = classifier.predict(transformed_pseudo_X)

        # if number of aggressive samples less than non agressive
        if sum(pseudo_y)<len(pseudo_y)/2:
            pseudo_X, pseudo_y = self.preprocessing_unlabeled.undersampling(pseudo_X, pseudo_y)

        all_X_transform = pd.concat([transformed_pseudo_X, X_train_transform])
        all_y = np.concatenate((pseudo_y, y_train))
        print('SHAPE')
        print(len(all_X_transform))
        print('AGGRESSIVE PERCENTAGE')
        print(all_y.sum()/len(all_y))

        
        semisupervised_classifer = self.train(all_X_transform, all_y, X_test_transform, y_test) 
        with open(self.semisupervised_model_path, 'wb+') as file:
            pickle.dump(semisupervised_classifer, file)
        return semisupervised_classifer




Self-training realization:
1. Make pseudo-labels for unlabled data
2. Retrain classifier on labled and pseudolabled data
https://towardsdatascience.com/a-gentle-introduction-to-self-training-and-semi-supervised-learning-ceee73178b38 

In [57]:

labeled_trained_model_path = './models/labeled_trained_classifier.pkl'
semisupervised_model_path = './models/semisupervised_classifier.pkl'
model = Model4track(train_labeled, tracks_labeled, tracks_unlabeled, labeled_trained_model_path, semisupervised_model_path)
model.ss_train()

[[[  0.           0.           2.22857143 ...   7.88571429   5.70731707
    49.4557377 ]]

 [[  0.          89.1          0.         ...  31.07368421  43.37142857
    39.6       ]]

 [[  5.48571429   3.51         7.92       ... 129.22105263 121.32
   107.82857143]]

 ...

 [[  0.           1.06363636  10.6        ...   0.          28.28571429
     0.        ]]

 [[  0.           2.74285714   0.         ... 103.71428571 100.50967742
   103.24285714]]

 [[  0.         106.81967213  88.74782609 ...  44.57142857  63.74117647
    49.24285714]]]
[ 2663  8402  6430 ...  9264 17978  6959]
SHAPE
32216
AGGRESSIVE PERCENTAGE
0.5015062567588444


ValueError: Found input variables with inconsistent numbers of samples: [32216, 25892]

In [7]:
classifier = None

preprocessing_labeled = Preprocessing_labeled(
    tracks_labeled,15,1)
#X_train, y_train = preprocessing_labeled.make_nested(tracks_labeled)

cached_data_path = 'data/labeled_preprocessed.json'
cached_y_data_path = 'data/labeled_y_preprocessed.json'
def deconvert(x):
    if len(x) == 1:  # Might be a tagged object...
        key, value = next(iter(x.items()))  # Grab the tag and value
        if key == "$array":  # If the tag is correct,
            return np.array(value)  # cast back to array
    return x

with open(cached_data_path) as X_file, open(cached_y_data_path) as y_file:
                X_train = np.array(json.load(X_file, object_hook=deconvert))
                X_train = [pd.Series(arr) for arr in X_train]
                y_train = json.load(y_file, object_hook=deconvert)
print(1)                
X_train, X_test, y_train, y_test = train_test_split(
                X_train, y_train, test_size=0.33)
X_test = pd.DataFrame({'speed':X_test})
print(X_test.describe())
y_test = np.array(y_test)
print(y_test.shape)
X_train = pd.DataFrame({'speed':X_train})
print(2)                
rocket = MiniRocket()
rocket.fit(X_train)
X_test_transform = rocket.transform(X_test)
print(3)                

with open('./models/semisupervised_classifier.pkl',mode='rb') as file:
    classifier = pickle.load(file)
classifier.score(X_test_transform, y_test)

1


In [43]:
testing3 = None