In [1]:
from joblib import Parallel, delayed
import functools

import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from mpl_toolkits.mplot3d import Axes3D

import numpy as np
import pandas as pd

import time
from tqdm import tqdm_notebook as  tqdm 

# NOTE(review): %pylab star-imports numpy and matplotlib names into the global
# namespace (see the "Populating the interactive namespace" message below).
# Later cells may rely on those bare names, so confirm before replacing it
# with `%matplotlib inline`.
%pylab inline
# Cap the number of rows pandas renders when a DataFrame is displayed.
pd.set_option('display.max_rows', 7)

Populating the interactive namespace from numpy and matplotlib


# Load data

In [3]:
# Read the raw trip table; the file is treated as header-less and the five
# columns are labelled explicitly.
# NOTE(review): the rendered output of this cell shows an extra "Unnamed: 0"
# column, which suggests the CSV may actually carry a header row and an index
# column -- confirm that header=None is still the intended setting.
data = pd.read_csv("trips.csv",
                   names=['source', 'sink', 'correspndc', 'avg_time', 'avg_dist'],
                   header=None)

# Bare expression so the notebook renders the (truncated) frame.
data

Unnamed: 0,source,sink,correspndc,avg_time,avg_dist
0,1,1,40,26,2.2
1,1,2,4,49,8.1
2,1,3,3,35,6.8
...,...,...,...,...,...
207,22,1,1,140,84.8
208,22,2,3,120,72.6
209,22,22,18,25,1.5


$L_i$ - полное число людей, ездящих на работу из района $i$ (*source_volumes*)

$W_j$ - полное число людей, приезжающих на работу в район $j$ (*sink_volumes*)

$(d_{real})_{ij}$ - реальная корреспонденция между районами $i$ и $j$ 

$\sum\limits_j (d_{real})_{ij} = L_i$

$\sum\limits_i(d_{real})_{ij} = W_j$

Заполним клетки, в которых отсутствует информация, inf-значениями, а соответствующую корреспонденцию положим равной 0.

In [5]:
# Make the trip table dense: every (source, sink) pair with both ids in
# 1..num_of_regions must appear exactly once. Pairs absent from the input get
# zero correspondence and infinite average time / distance.
num_of_regions = 22
trip_columns = ['source', 'sink', 'correspndc', 'avg_time', 'avg_dist']

# Collect the pairs that are already present once, instead of re-scanning the
# whole frame for every candidate pair.
known_pairs = set(zip(data['source'], data['sink']))
missing_rows = [
    [source_num, sink_num, 0, np.inf, np.inf]  # np.inf: the np.Inf alias was removed in NumPy 2.0
    for source_num in range(1, num_of_regions + 1)
    for sink_num in range(1, num_of_regions + 1)
    if (source_num, sink_num) not in known_pairs
]

# Append all filler rows with a single concat (concatenating inside the loop
# copies the frame once per missing pair). The filler frame is built from a
# float array, so the resulting column dtypes match the per-row version.
if missing_rows:
    missing = pd.DataFrame(np.array(missing_rows, dtype=float), columns=trip_columns)
    data = pd.concat([data, missing], axis=0)

# Sort by source, then sink (remaining columns only break ties) so that a
# row-major reshape to (num_of_regions, num_of_regions) lines up with region
# ids, and rebuild a clean 0..n-1 index.
data = data.sort_values(by=trip_columns).reset_index(drop=True)

In [6]:
# Marginal totals of the correspondence table, stored as column vectors of
# shape (n_groups, 1):
#   W[j] -- total correspondence arriving in sink j   (sum over sources)
#   L[i] -- total correspondence leaving source i     (sum over sinks)
# groupby sorts by the group key, so entries are ordered by region id.
W = data.groupby("sink")["correspndc"].sum().values.reshape(-1, 1)
# Each vector is reshaped from its own length; the earlier version sized L
# from W.shape[0], which only worked when both had the same number of groups.
L = data.groupby("source")["correspndc"].sum().values.reshape(-1, 1)

In [7]:
# Turn the long-format columns into (num_of_regions x num_of_regions) arrays.
# The reshape is row-major, so consecutive runs of num_of_regions rows of
# `data` become the rows of each matrix; this relies on `data` holding exactly
# num_of_regions**2 rows ordered by source, then sink.
av_time = np.reshape(data["avg_time"].values.astype(float),
                     (num_of_regions, num_of_regions))
dist = np.reshape(data["avg_dist"].values.astype(float),
                  (num_of_regions, num_of_regions))
# Observed correspondence keeps whatever dtype the column already has.
real_correspondance = np.reshape(data["correspndc"].values,
                                 (num_of_regions, num_of_regions))

In [8]:
# NOTE(review): redundant -- num_of_regions is already set to 22 in the cell
# that fills in the missing (source, sink) pairs; this cell can be dropped.
num_of_regions = 22

In [9]:
# Sanity check: total departures (L) and total arrivals (W) should coincide,
# since both sum the same correspondence column.
print(L.sum(), W.sum(), sep="\n")

1965.0
1965.0
