In [2]:
!pip install plotly-express

Collecting plotly-express
  Using cached plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Collecting plotly>=4.1.0
  Using cached plotly-4.14.3-py2.py3-none-any.whl (13.2 MB)
Collecting patsy>=0.5
  Using cached patsy-0.5.1-py2.py3-none-any.whl (231 kB)
Collecting statsmodels>=0.9.0
  Using cached statsmodels-0.12.2-cp37-cp37m-manylinux1_x86_64.whl (9.5 MB)
Collecting retrying>=1.3.3
  Using cached retrying-1.3.3-py3-none-any.whl
Installing collected packages: retrying, patsy, statsmodels, plotly, plotly-express
Successfully installed patsy-0.5.1 plotly-4.14.3 plotly-express-0.4.1 retrying-1.3.3 statsmodels-0.12.2


In [55]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.11.1-py3-none-any.whl (285 kB)
[K     |████████████████████████████████| 285 kB 1.8 MB/s eta 0:00:01
Installing collected packages: seaborn
Successfully installed seaborn-0.11.1


In [44]:
import pandas as pd
import json
from tqdm import tqdm
import plotly.express as px
from pathlib import Path
pd.set_option('display.max_columns', None)

In [56]:
import seaborn as sns

def get_color(number):
    color = sns.color_palette("deep")[number]
    actual_rgb = tuple(int(255*x) for x in color)
    actual_hex = '#%02x%02x%02x' % actual_rgb
    return actual_hex

In [4]:
def cast_to_string(routes_property):
    return [str(int).replace("[", "").replace("]", "") for int in routes_property]

class GIS_processing():
    
    df = pd.DataFrame()
    df_clear = pd.DataFrame()
    dfs = pd.DataFrame()
    
    def __init__(self) -> None:
        pass
    
    def add_additionals(self, filepath) -> None:
        self.df = pd.read_csv(filepath)
        route_type = []
        start_point_meters = []
        finish_point_meters = []
        start_point_part = []
        finish_point_part = []
        instruction_type = []
        
        for i in tqdm(range(len(self.df["drivingDirection_json"]))):
            route_type.append(json.loads(self.df["drivingDirection_json"][i])["type"])
            start_point_meters.append(json.loads(self.df["drivingDirection_json"][i])['start_point']['meters'])
            finish_point_meters.append(json.loads(self.df["drivingDirection_json"][i])['finish_point']['meters'])
            start_point_part.append(json.loads(self.df["drivingDirection_json"][i])['start_point']['part'])
            finish_point_part.append(json.loads(self.df["drivingDirection_json"][i])['finish_point']['part'])
            if ('instruction' in json.loads(self.df["drivingDirection_json"][i])):
                instruction_type.append(json.loads(self.df["drivingDirection_json"][i])['instruction']['type'])
            else:
                 instruction_type.append(-1)
        route_type = pd.DataFrame(cast_to_string(route_type), columns=['route_type'])
        start_point_meters = pd.DataFrame(cast_to_string( start_point_meters), columns=['start_point_meters'])
        finish_point_meters = pd.DataFrame(cast_to_string(finish_point_meters), columns=['finish_point_meters'])
        start_point_part = pd.DataFrame(cast_to_string(start_point_part), columns=[' start_point_part'])
        finish_point_part = pd.DataFrame(cast_to_string(finish_point_part), columns=['finish_point_part'])
        instruction_type = pd.DataFrame(cast_to_string(instruction_type), columns=['instruction_type'])
        
        self.df = self.df.join(route_type.join(start_point_meters.join(finish_point_meters.join(start_point_part.join(finish_point_part.join(instruction_type))))))
        self.df.to_csv('add_' + Path(filepath).name)
        
    def flatternize(self, items_column, filepath = "", dataframe = "",) -> None:
        if (filepath == ""):
            self.df = dataframe
        else:
            self.df = pd.read_csv(filepath)
            
        routes_edges = []
        routes_time = []
        routes_speed = []
        routes_length = []
#        routes_traffic_type = []

        for i in tqdm(range(len(self.df))):
            routes_edges.append([])
            routes_time.append([])
            routes_speed.append([])
            routes_length.append([])
#            routes_traffic_type.append([])
            for j in range(len(json.loads(self.df.iloc[i, items_column])['items'])):
                 for k in range(len(json.loads(self.df.iloc[i, items_column])['items'][j]['edges'])):
                        routes_edges[i].append(json.loads(self.df.iloc[i, items_column])['items'][j]['edges'][k]['edge_id'])
                        routes_time[i].append(json.loads(self.df.iloc[i, items_column])['items'][j]['edges'][k]['time'])
                        routes_speed[i].append(json.loads(self.df.iloc[i, items_column])['items'][j]['edges'][k]['speed'])
                        routes_length[i].append(json.loads(self.df.iloc[i, items_column])['items'][j]['edges'][k]['length'])
#                if ('traffic_type' in json.loads(df.iloc[i, items_column])['items'][j]['edges'][k]):
#                            routes_traffic_type[i].append(json.loads(self.df.iloc[i, items_column])['items'][j]['edges'][k]['traffic_type'])
#                        else:
#                            routes_traffic_type[i].append(-1)
                            
        edges_pd = pd.DataFrame(cast_to_string(routes_edges), columns=['edges'])
        time_pd = pd.DataFrame(cast_to_string(routes_time), columns=['time'])
        speed_pd = pd.DataFrame(cast_to_string(routes_speed), columns=['speed'])
        routes_length_pd = pd.DataFrame(cast_to_string(routes_length), columns=['length'])
#        routes_traffic_type_pd = pd.DataFrame(cast_to_string(routes_traffic_type), columns=['directionality'])
        
        
        self.df = self.df.join(edges_pd.join(time_pd.join(speed_pd.join(routes_length_pd)))).drop(['start_json', 'end_json', 'navigationId', "start_utc", "end_utc", "ETA", "build_utc", "build_timestamp"], axis=1)
        self.df.to_csv('processed_' + Path(filepath).name)
        return(self.df)
        
    def plot_time_freq(self, routes_1, routes_2):
        
        def freq_counter(routes):
            freq = []
            time = []
            index = pd.DatetimeIndex(routes['start_timestamp'])
            for i in range(0, 23):
                freq.append(len(routes.iloc[index.indexer_between_time(str(0+i) + ':00', str(1+i) + ':00')]))
                time.append(str(0+i) + ':00 - ' +  str(1+i) + ':00');
            freq.append(len(routes.iloc[index.indexer_between_time('23:00','00:00')]))
            time.append('23:00 - 00:00')
            df = pd.DataFrame(freq, index = time, columns = ['frequencies'])
            return(df)

        def draw_freq_hist(freq_df):
            fig = px.bar(freq_df, x=freq_df.index, y='frequencies')
            fig.show()

        def draw_freq_line(freq_1, freq_2):
            fig = px.line(freq_1, x=freq_1.index, y='frequencies')
            fig.add_scatter(x=freq_2.index, y=freq_2['frequencies'], mode='lines')
            fig.show()
            
        freq_city_1 = freq_counter(routes_1)
        freq_city_2 = freq_counter(routes_2)
        
        draw_freq_line(freq_city_1, freq_city_2)
        
    def plot_time_freq(self, routes_1, routes_2):
        
        def flat_list(data):
            return [int(item.replace("'", "")) for sublist in data for item in sublist]
        
        def get_use_data(routes_omsk):
            day_edges = [] 
            for i in range(0, 2):
                tmp = routes_omsk[(routes_omsk['start_timestamp'] >= '2020-12-0' + str(7+i)) & (routes_omsk['start_timestamp'] < '2020-12-0' + str(7+i+1))]['new_edges'].to_list()
                tmp_cl = [x for x in tmp if str(x) != 'nan']
                for j in range(len(tmp_cl)):
                    tmp_cl[j] = tmp_cl[j].split(',')
                day_edges.append(flat_list(tmp_cl))

            tmp = routes_omsk[(routes_omsk['start_timestamp'] >= '2020-12-09') & (routes_omsk['start_timestamp'] < '2020-12-10')]['new_edges'].to_list()
            tmp_cl = [x for x in tmp if str(x) != 'nan']
            for j in range(len(tmp_cl)):
                tmp_cl[j] = tmp_cl[j].split(',')
            day_edges.append(flat_list(tmp_cl))

            for i in range(0, 4):
                tmp = routes_omsk[(routes_omsk['start_timestamp'] >= '2020-12-1' + str(i)) & (routes_omsk['start_timestamp'] < '2020-12-1' + str(i+1))]['new_edges'].to_list()
                tmp_cl = [x for x in tmp if str(x) != 'nan']
                tmp_cl = [x for x in tmp_cl if str(x) != '']
                for j in range(len(tmp_cl)):
                    tmp_cl[j] = tmp_cl[j].split(',')
                day_edges.append(flat_list(tmp_cl))
                return day_edges
        
        def usage_to_dict(usage):
            counts = dict()
            for i in usage:
                counts[i] = counts.get(i, 0) + 1
            return counts

        def overall_to_dict(overall):
            counts = dict()
            for i in overall:
                counts[i] = 0
            return counts
        
        def draw_freq_line_inter(freq_1, freq_2, freq_3):
            fig = px.line(freq_1, x=freq_1.index, y='frequencies')
            fig.add_scatter(x=freq_2.index, y=freq_2['frequencies'], mode='lines', line = {'color': get_color(3), 'dash': 'solid'})
            fig.add_scatter(x=freq_3.index, y=freq_3['frequencies'], mode='lines', line = {'color': get_color(8), 'dash': 'solid'})
            fig.update_layout(showlegend=False)
            fig.show()
        
        overall = flat_list(dfs[1])
        usage_monday = usage_to_dict(get_use_data(routes_omsk_clear)[0])
        usage_wednesday = usage_to_dict(get_use_data(routes_omsk_clear)[1])

        usage_saturday = usage_to_dict(get_use_data(routes_omsk_clear)[5])
        usage_sunday = usage_to_dict(get_use_data(routes_omsk_clear)[6])
        overall = overall_to_dict(overall)
        
        weekdays = {k: overall.get(k, 0) + usage_monday.get(k, 0) + usage_wednesday.get(k, 0) 
            for k in set(overall) | set(usage_monday) | set(usage_wednesday)}
        weekend = {k: overall.get(k, 0) + usage_saturday.get(k, 0) + usage_sunday.get(k, 0) 
            for k in set(overall) | set(usage_saturday) | set(usage_sunday)}
        intersection = {x:min(weekdays[x], weekend[x]) for x in weekdays if x in weekend}
        
        weekdays = pd.DataFrame.from_dict(weekdays, orient = 'index', columns = ['frequencies']).reset_index()
        weekend = pd.DataFrame.from_dict(weekend, orient = 'index', columns = ['frequencies']).reset_index()
        intersection = pd.DataFrame.from_dict(intersection, orient = 'index', columns = ['frequencies']).reset_index()
        
        draw_freq_line_inter(intersection, weekdays, weekend)
        
        
    def clear_flatternized(self, filepath = None) -> None:

        def del_heads_n_tails(route):
            counter_d = 0
            edge_arr = route['edges'].split(',')
            time_arr = route['time'].split(',')
            speed_arr = route['speed'].split(',')
            len_arr = route['length'].split(',')
#            dir_arr = route['directionality'].split(',')
            
            indexes = []
            for j in range(len(edge_arr)):
                if (int(edge_arr[j]) == 0 or int(time_arr[j]) == 0):
                    indexes.append(j)

            for j in range(len(edge_arr) - 1):
                if (int(edge_arr[j]) == int(edge_arr[j+1])):
                    indexes.append(j)

            indexes = set(indexes)
            counter_d = len(indexes)
            for index in sorted(indexes, reverse=True):
                del edge_arr[index]
                del time_arr[index]
                del speed_arr[index]
                del len_arr[index]
#                del dir_arr[index]                                                             
            
            return [edge_arr, time_arr, speed_arr, len_arr, counter_d]

        def clear_routes_data(routes):
            edges_clear = []
            time_clear = []
            speed_clear = []
            len_clear = []
 #           dir_clear = []
                                                                                     
            counter_do = 0
            for i in tqdm(range(len(routes))):
                route_data = del_heads_n_tails(routes.iloc[i, :])
                edges_clear.append(route_data[0])
                time_clear.append(route_data[1])
                speed_clear.append(route_data[2])
                len_clear.append(route_data[3])
#                dir_clear.append(route_data[4])
                counter_do += route_data[4] 
            routes_properties = [[edges_clear, 'edges'], [time_clear, 'time'], [speed_clear, 'speed'], [len_clear, 'length']]
            for i in range(len(routes_properties)):
                routes_properties[i] = pd.DataFrame(cast_to_string(routes_properties[i][0]), columns = [routes_properties[i][1]])
            return [routes_properties, edges_clear, counter_do]
        
        if (filepath != None):
            self.dfs = clear_routes_data(pd.read_csv(filepath))
            self.df = pd.read_csv(filepath)
        else:          
            self.dfs = clear_routes_data(self.df)
#        self.df_clear = self.df.drop(['edges', 'time', 'speed', 'length', 'directionality'], axis=1).join(self.dfs[0][0].join(self.dfs[0][1].join(self.dfs[0][2].join(self.dfs[0][3].join(self.dfs[0][4])))))
        self.df_clear = self.df.drop(['edges', 'time', 'speed', 'length'], axis=1).join(self.dfs[0][0].join(self.dfs[0][1].join(self.dfs[0][2].join(self.dfs[0][3]))))
        self.df_clear.to_csv('clear_' + Path(filepath).name)
        
            

In [40]:
def day_period(routes):
    index = pd.DatetimeIndex(routes['start_timestamp'])
    routes['day_period'] = -1
    routes.loc[index.indexer_between_time('00:00', '06:00'), 'day_period'] = 0
    routes.loc[index.indexer_between_time('06:00', '11:00'), 'day_period'] = 1
    routes.loc[index.indexer_between_time('11:00', '19:00'), 'day_period'] = 2
    routes.loc[index.indexer_between_time('19:00', '00:00'), 'day_period'] = 3
    return routes

def week_period(clear): # that's hell, i know
    clear['week_period'] = -1
    clear.loc[('2021-01-01' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2021-01-02'), 'week_period'] = 1
    clear.loc[('2020-12-31' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2021-01-01'), 'week_period'] = 1
    clear.loc[('2020-12-30' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-31'), 'week_period'] = 0
    clear.loc[('2020-12-29' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-30'), 'week_period'] = 0
    clear.loc[('2020-12-28' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-29'), 'week_period'] = 0
    clear.loc[('2020-12-27' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-28'), 'week_period'] = 1
    clear.loc[('2020-12-26' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-27'), 'week_period'] = 1
    clear.loc[('2020-12-25' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-26'), 'week_period'] = 0
    clear.loc[('2020-12-24' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-25'), 'week_period'] = 0
    clear.loc[('2020-12-23' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-24'), 'week_period'] = 0
    clear.loc[('2020-12-22' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-23'), 'week_period'] = 0
    clear.loc[('2020-12-21' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-22'), 'week_period'] = 0
    clear.loc[('2020-12-20' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-21'), 'week_period'] = 1
    clear.loc[('2020-12-19' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-20'), 'week_period'] = 1
    clear.loc[('2020-12-18' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-19'), 'week_period'] = 0
    clear.loc[('2020-12-17' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-18'), 'week_period'] = 0
    clear.loc[('2020-12-16' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-17'), 'week_period'] = 0
    clear.loc[('2020-12-15' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-16'), 'week_period'] = 0
    clear.loc[('2020-12-14' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-15'), 'week_period'] = 0
    clear.loc[('2020-12-13' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-14'), 'week_period'] = 1
    clear.loc[('2020-12-12' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-13'), 'week_period'] = 1
    clear.loc[('2020-12-11' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-12'), 'week_period'] = 0
    clear.loc[('2020-12-10' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-11'), 'week_period'] = 0
    clear.loc[('2020-12-09' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-10'), 'week_period'] = 0
    clear.loc[('2020-12-08' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-09'), 'week_period'] = 0
    clear.loc[('2020-12-07' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-08'), 'week_period'] = 0
    clear.loc[('2020-12-06' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-07'), 'week_period'] = 1
    clear.loc[('2020-12-05' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-06'), 'week_period'] = 1
    clear.loc[('2020-12-04' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-15'), 'week_period'] = 0
    clear.loc[('2020-12-03' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-04'), 'week_period'] = 0
    clear.loc[('2020-12-02' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-03'), 'week_period'] = 0
    clear.loc[('2020-12-01' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-02'), 'week_period'] = 0
    clear.loc[('2020-11-30' <= clear['start_timestamp']) & (clear['start_timestamp'] <= '2020-12-01'), 'week_period'] = 0
    return clear

def weather_period(clear, prop, values):
    clear[prop] = -1    
    
    night = pd.DatetimeIndex(clear['start_timestamp']).indexer_between_time('00:00', '11:00')
    day = pd.DatetimeIndex(clear['start_timestamp']).indexer_between_time('11:00', '19:00')
    evening = pd.DatetimeIndex(clear['start_timestamp']).indexer_between_time('19:00', '06:00')
    parts = [night, day, evening]
    v_index = 0
    dates = ['2020-12-01', '2020-12-02', '2020-12-03', '2020-12-04', '2020-12-05', '2020-12-06','2020-12-07', '2020-12-08', 
             '2020-12-09', '2020-12-10', '2020-12-11', '2020-12-12', '2020-12-13', '2020-12-14', '2020-12-15', '2020-12-16', '2020-12-17',
             '2020-12-18', '2020-12-19', '2020-12-20', '2020-12-21', '2020-12-22', '2020-12-23', '2020-12-24', '2020-12-25', '2020-12-26', 
             '2020-12-27', '2020-12-28', '2020-12-29', '2020-12-30', '2020-12-31'] # and this is too 
    for j in range(len(dates) - 1):
        for i in range(len(parts)):
            s = (dates[j] <= clear.loc[parts[i], 'start_timestamp']) & (clear.loc[parts[i], 'start_timestamp'] <= dates[j+1])
            clear.loc[(s)[s].index, prop] = values[v_index]
            if (i+2 != len(parts)):
                v_index += 1

    return clear

In [17]:
processing = GIS_processing()

In [5]:
dataframe = pd.read_csv("574880_omsk_routes_out.csv")

In [12]:
import multiprocessing as mp
import psutil

In [134]:
part_const = 20898 # near to optimal for 24 cores

In [135]:
def spawn():
    dataframe_list = []
    procs = list()
    n_cpus = psutil.cpu_count()
    manager = mp.Manager()
    return_list = manager.list()
    
    for cpu in range(37):
        d = dict(border = part_const*(cpu+1))
        p = mp.Process(target=run_child, args=(d, return_list))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
        print('joined')
    
    return return_list

def run_child(border, return_list):
    print(border['border'])
    df = pd.DataFrame(dataframe[(int(border['border'])-part_const):border['border']]).reset_index()
    return_list.append([(int(border['border'])-part_const)/part_const,processing.flatternize(16, filepath = "", dataframe = df)])

In [136]:
a = spawn()

20898
4179662694



  0%|          | 0/20898 [00:00<?, ?it/s]

83592

  0%|          | 2/20898 [00:00<18:04, 19.27it/s]

104490


  0%|          | 3/20898 [00:00<16:38, 20.93it/s]


125388146286


  0%|          | 4/20898 [00:03<3:29:59,  1.66it/s]

167184



  0%|          | 6/20898 [00:03<2:33:11,  2.27it/s]

188082

  0%|          | 4/20898 [00:01<10:51, 32.08it/s]


208980229878

  0%|          | 5/20898 [00:06<6:44:16,  1.16s/it]





  0%|          | 0/20898 [00:00<?, ?it/s]

250776


  0%|          | 7/20898 [00:07<4:50:24,  1.20it/s]

271674292572

  0%|          | 0/20898 [00:00<?, ?it/s]





  0%|          | 8/20898 [00:06<2:31:58,  2.29it/s]

313470
334368

  0%|          | 6/20898 [00:10<9:26:04,  1.63s/it]




  0%|          | 2/20898 [00:03<32:25, 10.74it/s]

355266


  0%|          | 8/20898 [00:11<8:41:16,  1.50s/it]

376164397062

  0%|          | 0/20898 [00:00<?, ?it/s]




  0%|          | 1/20898 [00:04<56:16,  6.19it/s]

417960


  0%|          | 0/20898 [00:00<?, ?it/s]

438858



  0%|          | 10/20898 [00:12<5:26:40,  1.07it/s]

459756480654


  0%|          | 7/20898 [00:17<14:22:39,  2.48s/it]


501552

  0%|          | 0/20898 [00:00<?, ?it/s]




  0%|          | 4/20898 [00:10<8:06:45,  1.40s/it]

522450


  0%|          | 0/20898 [00:00<?, ?it/s]

543348

  0%|          | 9/20898 [00:19<13:15:45,  2.29s/it]


564246

  0%|          | 0/20898 [00:00<?, ?it/s]




  0%|          | 0/20898 [00:00<?, ?it/s]

585144606042



  0%|          | 2/20898 [00:12<18:23:21,  3.17s/it]

626940

  0%|          | 1/20898 [00:07<39:33,  8.80it/s]




  0%|          | 0/20898 [00:00<?, ?it/s]

647838668736

  0%|          | 0/20898 [00:00<?, ?it/s]




  0%|          | 0/20898 [00:00<?, ?it/s]


689634

  0%|          | 11/20898 [00:23<11:12:14,  1.93s/it]


710532
731430

  0%|          | 0/20898 [00:00<?, ?it/s]

752328


  0%|          | 8/20898 [00:29<21:27:17,  3.70s/it]




  0%|          | 2/20898 [00:11<34:08, 10.20it/s]

773226


100%|██████████| 20898/20898 [1:36:04<00:00,  3.63it/s]  
100%|██████████| 20898/20898 [1:37:03<00:00,  3.59it/s]  
100%|██████████| 20898/20898 [1:37:05<00:00,  3.59it/s]
100%|██████████| 20898/20898 [1:37:37<00:00,  3.57it/s]
100%|██████████| 20898/20898 [1:37:17<00:00,  3.58it/s]
100%|██████████| 20898/20898 [1:38:03<00:00,  3.55it/s]
100%|██████████| 20898/20898 [1:38:12<00:00,  3.55it/s]
100%|██████████| 20898/20898 [1:38:33<00:00,  3.53it/s]
100%|██████████| 20898/20898 [1:38:43<00:00,  3.53it/s]
100%|██████████| 20898/20898 [1:38:55<00:00,  3.52it/s]
100%|██████████| 20898/20898 [1:39:15<00:00,  3.51it/s]
100%|██████████| 20898/20898 [1:39:16<00:00,  3.51it/s]
100%|██████████| 20898/20898 [1:39:22<00:00,  3.51it/s]
100%|██████████| 20898/20898 [1:39:59<00:00,  3.48it/s]
100%|██████████| 20898/20898 [1:40:04<00:00,  3.48it/s]
100%|██████████| 20898/20898 [1:40:22<00:00,  3.47it/s]
100%|██████████| 20898/20898 [1:40:31<00:00,  3.47it/s]
100%|██████████| 20898/20898 [1:40:30<00:00,

joined


100%|██████████| 20898/20898 [1:41:08<00:00,  3.44it/s]
100%|██████████| 20898/20898 [1:41:06<00:00,  3.44it/s]
100%|██████████| 20898/20898 [1:41:14<00:00,  3.44it/s]
100%|██████████| 20898/20898 [1:41:50<00:00,  3.42it/s]
100%|██████████| 20898/20898 [1:41:33<00:00,  3.43it/s]
100%|██████████| 20898/20898 [1:41:28<00:00,  3.43it/s]
100%|██████████| 20898/20898 [1:41:58<00:00,  3.42it/s]
100%|██████████| 20898/20898 [1:41:48<00:00,  3.42it/s]
100%|██████████| 20898/20898 [1:41:44<00:00,  3.42it/s]
100%|██████████| 20898/20898 [1:41:48<00:00,  3.42it/s]
100%|██████████| 20898/20898 [1:42:09<00:00,  3.41it/s]
100%|██████████| 20898/20898 [1:42:19<00:00,  3.40it/s]
100%|██████████| 20898/20898 [1:42:19<00:00,  3.40it/s]
100%|██████████| 20898/20898 [1:42:27<00:00,  3.40it/s]
100%|██████████| 20898/20898 [1:42:39<00:00,  3.39it/s]
100%|██████████| 20898/20898 [1:43:09<00:00,  3.38it/s]
100%|██████████| 20898/20898 [1:42:55<00:00,  3.38it/s]
 99%|█████████▉| 20718/20898 [1:42:57<00:32,  5.

joined
joined
joined
joined
joined


 99%|█████████▉| 20730/20898 [1:43:00<00:34,  4.80it/s]

joined
joined
joined
joined


100%|██████████| 20898/20898 [1:43:26<00:00,  3.37it/s]


joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined
joined


In [138]:
normal_container_s = []
for i in tqdm(range(len(a))):
    normal_container_s.append(a[i])

100%|██████████| 37/37 [01:04<00:00,  1.74s/it]


In [139]:
a_s_s = sorted(normal_container_s,key=lambda x: (x[0],x[1]))

In [147]:
result = a_s_s[0][1]
for i in range(1, len(a_s_s)):
    result = result.append(a_s_s[i][1])

In [149]:
result.to_csv("processed_omsk_E.csv")

In [151]:
processing.add_additionals('processed_omsk_E.csv')

100%|██████████| 773226/773226 [24:36<00:00, 523.86it/s]


In [154]:
!rm processed_omsk_E.csv

The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.


In [18]:
processing.clear_flatternized("add_processed_omsk_E.csv")

100%|██████████| 773226/773226 [04:06<00:00, 3137.30it/s]


In [19]:
clear = pd.read_csv("clear_add_processed_omsk_E.csv")

In [46]:
clear_dates = week_period(day_period(clear))

In [48]:
clear_dates.to_csv("omsk_full_routes_final.csv")

KeyboardInterrupt: 

In [8]:
december_omsk = pd.read_csv("omsk_december_fixed.csv", sep = ";")
clouds_omsk_day = december_omsk["cloud"].to_list()
snow_omsk_day = december_omsk["weather"].to_list()
temp_omsk_day = december_omsk["temp"].to_list()
wind_omsk_day = december_omsk["windDir(from)"].to_list()
press_omsk_day = december_omsk["pressure"].to_list() 
clouds_omsk_night = december_omsk["cloud.1"].to_list()
snow_omsk_night = december_omsk["weather.1"].to_list()
temp_omsk_night = december_omsk["temp.1"].to_list()
wind_omsk_night = december_omsk["windDir(from).1"].to_list()
press_omsk_night = december_omsk["pressure.1"].to_list()

def weather_merger(day, night):
    result = []
    for i in range(len(day)):
        result.append(day[i])
        result.append(night[i])
    return result

clouds_omsk = weather_merger(clouds_omsk_day, clouds_omsk_night)
snow_omsk = weather_merger(snow_omsk_day, snow_omsk_night)
temp_omsk = weather_merger(temp_omsk_day, temp_omsk_night)
wind_omsk = weather_merger(wind_omsk_day, wind_omsk_night)
press_omsk = weather_merger(press_omsk_day, press_omsk_night)

props = ['clouds', 'snow', 'temperature', 'wind', 'pressure']
vals_omsk = [clouds_omsk, snow_omsk, temp_omsk, wind_omsk, press_omsk]

In [None]:
final = pd.read_csv("omsk_full_routes_final.csv")

In [None]:
final.drop(final.loc[('2020-11-31' <= final['start_timestamp']) & (final['start_timestamp'] <= '2020-12-01')].index, inplace=True)

In [None]:
final.drop(final.loc[('2021-01-01' <= final['start_timestamp']) & (final['start_timestamp'] <= '2021-01-02')].index, inplace=True)

In [None]:
final = final.reset_index()

In [None]:
for i in tqdm(range(len(props))):
    final = weather_period(final, props[i], vals_omsk[i])

In [None]:
final.drop(["instruction_type"], axis = 1, inplace = True)

In [None]:
final.to_csv("omsk_full_routes_final_weather.csv")