In [14]:
import pandas as pd
import numpy as np
import geopandas
#import difflib
import multiprocess
import diff_match_patch as dmp_module

DIR_DATA = 'Data/'

https://medium.com/@grvsinghal/speed-up-your-python-code-using-multiprocessing-on-windows-and-jupyter-or-ipython-2714b49d6fac

In [15]:
def GMLtoGDF(filename):
    '''Reads a boundary file into a GeoDataFrame in latitude/longitude with area and centroid columns

    Parameters
    ----------
    filename: str, path of the file passed to geopandas.read_file()

    Returns
    -------
    GeoDataFrame whose geometry column is named 'Geometry' and is in EPSG:4326, with added columns
        'Area': area of each geometry computed in EPSG:6931
        'Centroid Latitude', 'Centroid Longitude': centroid coordinates in EPSG:4326

    Notes
    -----
    The file is assumed to be in EPSG:3347; that CRS is assigned explicitly after reading
    '''
    gdf = geopandas.read_file(filename)
    gdf = gdf.rename_geometry('Geometry') # Default geometry column name is 'geometry'; changed for consistent capitalization of columns
    gdf = gdf.set_geometry('Geometry') # Returns a new frame, so the result must be kept; sets special variable gdf.geometry = gdf['Geometry']
    gdf = gdf.set_crs(epsg=3347) # Needed only for FSA file, the others are 3347 and parsed correctly by geopandas, and the pdf in the zip file has the same projection parameters (FSA vs. DA, ADA, CT)
    gdf['Area'] = gdf['Geometry'].to_crs(epsg=6931).area # Equal-area projection
    gdf['Centroid'] = gdf['Geometry'].centroid # Computed before reprojection, i.e. in the projected CRS
    gdf['Geometry'] = gdf['Geometry'].to_crs(epsg=4326) # Latitude/Longitude representation
    gdf['Centroid'] = gdf['Centroid'].to_crs(epsg=4326) # Only the set geometry is converted with gdf.to_crs(); all other geometry-containing columns must be converted explicitly; here we convert all columns explicitly
    gdf = gdf.set_crs(epsg=4326) # The series and geodataframe can have separate crs; this was found necessary for the geopandas.union function to operate easily
    gdf['Centroid Latitude'] = gdf['Centroid'].geometry.y
    gdf['Centroid Longitude'] = gdf['Centroid'].geometry.x
    gdf = gdf.drop(columns='Centroid') # Because WKT Point cannot be serialized to JSON, we drop the Centroid column and keep only its float components
    return gdf

In [16]:
%%time
gdf_CA_FSA = GMLtoGDF(DIR_DATA+'lfsa000b16g_e.gml')

  for feature in features_lst:


Wall time: 42.2 s


In [17]:
%%time
gdf_CA_DA = GMLtoGDF(DIR_DATA+'lda_000b16g_e.gml')

Wall time: 1min 44s


In [18]:
%%time
# Generic aliases for the two layers being compared, so later cells do not depend on which files were loaded
gdf1 = gdf_CA_FSA
# Deep copy so that buffering leaves the original geometries untouched for comparison
gdf1b = gdf1.copy(deep=True)
# Zero-distance buffer; later cells diff these geometries against the originals to see what buffering altered
gdf1b['Geometry'] = gdf1b['Geometry'].buffer(0)
gdf2 = gdf_CA_DA
# Same copy-then-buffer treatment for the second layer
gdf2b = gdf2.copy(deep=True)
gdf2b['Geometry'] = gdf2b['Geometry'].buffer(0)

Wall time: 1min 53s


In [None]:
import multiprocess
import testmp
import time
import ipypb # Lightweight progress bar, source copied from GitHub

# Diff every original geometry against its buffered copy in a process pool,
# collecting results in completion order while advancing a progress bar.
if __name__ == '__main__':
    POLL_INTERVAL = 0.05 # Seconds to wait between polls when no task has finished
    completed = 0
    N = gdf1.shape[0]
    results = [None]*N
    NUM_PROCESSES = 10
    start_time = time.time()
    with multiprocess.Pool(NUM_PROCESSES) as pool:
        print(f'Generating pool, P={NUM_PROCESSES}, N={N}')
        ret = [pool.apply_async(testmp.dmpDiff,(gdf1['Geometry'].iloc[ind], gdf1b['Geometry'].iloc[ind], ind)) for ind in range(N)]
        print('Processing pool')
        pending = list(range(N)) # Indices whose result has not been collected yet, in ascending order
        for i in ipypb.track(range(N)): # One progress-bar step per collected result
            while True:
                # Lowest-index task that has finished but has not been collected
                ind_update = next((ind for ind in pending if ret[ind].ready()), None)
                if ind_update is not None:
                    results[ind_update] = ret[ind_update].get(999999)
                    pending.remove(ind_update)
                    completed += 1
                    if completed%1000==0 or completed==1 or completed==N:
                        # Elapsed time is now minus start; the reverse order gives a negative duration
                        print(f'Finished {completed}/{N}, wall time {time.strftime("%H:%M:%S", time.gmtime(time.time()-start_time))}')
                    break
                time.sleep(POLL_INTERVAL) # Avoid spinning a core that the worker processes could use

    wall_time = time.time()-start_time
    processor_time = sum(sum(r[2]) for r in results) # r[2] holds the per-task timings returned by the worker
    print(f'Pool processing concluded, process count {sum(r is not None for r in results)}/{N}, wall time {time.strftime("%H:%M:%S", time.gmtime(wall_time))}, processor time {time.strftime("%H:%M:%S", time.gmtime(processor_time))}, speedup {processor_time/wall_time:.3}x')

In [None]:
results[3][1]

Note that dmp reduction can occur

There are no newlines in the WKT representation, so we can replace commas with newlines, process, and the output will look like the difflib results.  This is necessary because the diff_match_patch module, in line mode, looks for newlines explicitly (it does not take the delimiter as a parameter); this might be worth modifying.

In [None]:
import multiprocess
import testmp
import time
import ipypb # Lightweight progress bar, source copied from GitHub

# Line-mode diff of every original geometry against its buffered copy in a process pool,
# collecting results in completion order while advancing a progress bar.
if __name__ == '__main__':
    POLL_INTERVAL = 0.05 # Seconds to wait between polls when no task has finished
    completed = 0
    N = gdf1.shape[0]
    results = [None]*N
    NUM_PROCESSES = 10
    start_time = time.time()
    with multiprocess.Pool(NUM_PROCESSES) as pool:
        print(f'Generating pool, P={NUM_PROCESSES}, N={N}')
        ret = [pool.apply_async(testmp.dmpDiffLine,(gdf1['Geometry'].iloc[ind], gdf1b['Geometry'].iloc[ind], ind)) for ind in range(N)]
        print('Processing pool')
        pending = list(range(N)) # Indices whose result has not been collected yet, in ascending order
        for i in ipypb.track(range(N)): # One progress-bar step per collected result
            while True:
                # Lowest-index task that has finished but has not been collected
                ind_update = next((ind for ind in pending if ret[ind].ready()), None)
                if ind_update is not None:
                    results[ind_update] = ret[ind_update].get(999999)
                    pending.remove(ind_update)
                    completed += 1
                    break
                time.sleep(POLL_INTERVAL) # Avoid spinning a core that the worker processes could use

    wall_time = time.time()-start_time
    processor_time = sum(sum(r[2]) for r in results) # r[2] holds the per-task timings returned by the worker
    print(f'Pool processing concluded, process count {sum(r is not None for r in results)}/{N}, wall time {time.strftime("%H:%M:%S", time.gmtime(wall_time))}, processor time {time.strftime("%H:%M:%S", time.gmtime(processor_time))}, speedup {processor_time/wall_time:.3}x')

In [None]:
results[3][0][1]

In [23]:
tmp = geopandas.array.to_wkt(gdf_CA_FSA.geometry.values)

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [25]:
tmp2 = [len(t) for t in tmp]

In [27]:
np.mean(tmp2),max(tmp2)

(114640.62283950618, 43054237)

In [29]:
tmp3 = np.sort(tmp2)

In [31]:
tmp3[-1:-10:-1]

array([43054237,  6468734,  6098821,  5368165,  5049054,  4645296,
        4312769,  2720355,  2513920])

TRY TO MAKE OVERLAP CALCULATION PARALLEL

In [1]:
import multiprocess
import testmp
import time
import ipypb # Lightweight progress bar, source copied from GitHub
import importlib
importlib.reload(testmp)

<module 'testmp' from 'C:\\Users\\Arkadiatri\\AllData\\JL-Coursera\\9. IBM Applied Data Science Capstone\\Coursera_Capstone\\testmp.py'>

In [2]:
import geopandas
import dill
import gzip
DIR_DATA = 'Data/'
DIR_RESULTS = 'Results/'

In [3]:
def GMLtoGDF(filename):
    '''Reads a boundary file into a GeoDataFrame in latitude/longitude with area and centroid columns

    Parameters
    ----------
    filename: str, path of the file passed to geopandas.read_file()

    Returns
    -------
    GeoDataFrame whose geometry column is named 'Geometry' and is in EPSG:4326, with added columns
        'Area': area of each geometry computed in EPSG:6931
        'Centroid Latitude', 'Centroid Longitude': centroid coordinates in EPSG:4326

    Notes
    -----
    The file is assumed to be in EPSG:3347; that CRS is assigned explicitly after reading
    '''
    gdf = geopandas.read_file(filename)
    gdf = gdf.rename_geometry('Geometry') # Default geometry column name is 'geometry'; changed for consistent capitalization of columns
    gdf = gdf.set_geometry('Geometry') # Returns a new frame, so the result must be kept; sets special variable gdf.geometry = gdf['Geometry']
    gdf = gdf.set_crs(epsg=3347) # Needed only for FSA file, the others are 3347 and parsed correctly by geopandas, and the pdf in the zip file has the same projection parameters (FSA vs. DA, ADA, CT)
    gdf['Area'] = gdf['Geometry'].to_crs(epsg=6931).area # Equal-area projection
    gdf['Centroid'] = gdf['Geometry'].centroid # Computed before reprojection, i.e. in the projected CRS
    gdf['Geometry'] = gdf['Geometry'].to_crs(epsg=4326) # Latitude/Longitude representation
    gdf['Centroid'] = gdf['Centroid'].to_crs(epsg=4326) # Only the set geometry is converted with gdf.to_crs(); all other geometry-containing columns must be converted explicitly; here we convert all columns explicitly
    gdf = gdf.set_crs(epsg=4326) # The series and geodataframe can have separate crs; this was found necessary for the geopandas.union function to operate easily
    gdf['Centroid Latitude'] = gdf['Centroid'].geometry.y
    gdf['Centroid Longitude'] = gdf['Centroid'].geometry.x
    gdf = gdf.drop(columns='Centroid') # Because WKT Point cannot be serialized to JSON, we drop the Centroid column and keep only its float components
    return gdf

In [4]:
def loadResults_(name,tuples,fileformat='db',compress=False):
    '''Loads variables from files
    
    Parameters
    ----------
    name: str, file name base (including directory if desired)
    tuples: list of tuples (varname, suffix),
        varname: str, a label for the variable (not used when loading; kept so the
            same list can describe both saving and loading)
        suffix: str, the string appended to name to generate a full file name
    fileformat: str, file extension of the files to read (do not include period)
    compress: bool, True if the files are gzip-compressed (appends '.gz' to filename)
    
    Returns
    -------
    None if name or fileformat is not a string, or
    Tuple the length of tuples containing for each element of tuples:
        None if the file could not be read or decoded, or
        the variable loaded from file at the same position from tuples
    
    Notes
    -----
    Files read in binary format with optional gzip encoding
    This function is the complement to saveResults_()
    Files are deserialized with dill, which can execute arbitrary code while
    loading: only load files from a trusted source
    
    TODO
    ----
    Add option to change save format (text vs. binary)
    Make fileformat select the save format
    '''
    import pickle # Local import: only needed for the UnpicklingError exception type

    # isinstance (rather than an exact type comparison) also accepts str subclasses
    if not isinstance(name, str):
        print('Error: name must be a string')
        return None
    if not isinstance(fileformat, str):
        print('Error: fileformat must be a string')
        return None
    
    ret = []
    for _, suffix in tuples:
        fn = name+suffix+'.'+fileformat+('.gz' if compress else '')
        try:
            with open(fn,'rb') as file:
                raw = file.read()
            if compress:
                raw = gzip.decompress(raw)
            ret.append(dill.loads(raw))
        # OSError covers missing/unreadable files and bad gzip headers, EOFError a truncated
        # gzip stream, UnpicklingError corrupt payloads: each yields None as documented
        except (OSError, EOFError, pickle.UnpicklingError) as e:
            ret.append(None)
            print(f'An error was encountered while reading from file {fn}: {e}')
    return tuple(ret)

def loadResults(name):
    '''Loads the variables 'gdf_union', 'times', and 'areas' from gzip-compressed dill files
    
    Parameters
    ----------
    name: str, base name of the files, including the directory if they are not in
        the working directory
    
    Returns
    -------
    The return value of loadResults_(): None if an error was encountered, or
    a 3-tuple (gdf_union, times, areas) in which an element is None if its file
    could not be loaded
    
    Notes
    -----
    The files read are <name>.db.gz, <name>_times.db.gz and <name>_areas.db.gz
    '''
    variable_suffixes = (
        ('gdf_union', ''),
        ('times', '_times'),
        ('areas', '_areas'),
    )
    return loadResults_(name, list(variable_suffixes), compress=True, fileformat='db')

In [5]:
%%time
gdf_CA_FSA_D = GMLtoGDF(DIR_DATA+'lfsa000a16g_e.gml')
gdf_CA_DA_D = GMLtoGDF(DIR_DATA+'lda_000a16g_e.gml')

  for feature in features_lst:


Wall time: 1min 12s


In [6]:
%%time
gdf_CA_FSA = GMLtoGDF(DIR_DATA+'lfsa000b16g_e.gml')
gdf_CA_DA = GMLtoGDF(DIR_DATA+'lda_000b16g_e.gml')

Wall time: 1min 50s


In [7]:
%%time
# Generic aliases for the two layers being intersected; key1/key2 are the column names
# passed alongside each frame to testmp.intersectGDFareas in the pool cell
# NOTE(review): presumably these are the unique-identifier columns of each layer - confirm
gdf1 = gdf_CA_FSA_D
key1 = 'CFSAUID'
gdf2 = gdf_CA_DA_D
key2 = 'DAUID'
# Deep copies so that buffering leaves the original geometries untouched
gdf1b = gdf1.copy(deep=True)
# Zero-distance buffer applied once here so the buffered frames can be handed to the workers
gdf1b['Geometry'] = gdf1b['Geometry'].buffer(0)
gdf2b = gdf2.copy(deep=True)
gdf2b['Geometry'] = gdf2b['Geometry'].buffer(0)

Wall time: 5.34 s


In [12]:
r = loadResults(DIR_RESULTS+'GDF_FSA-DA')
areas = r[2]
times = r[1]

NameError: name 'loadResults' is not defined

In [11]:
times = r[1]

NameError: name 'r' is not defined

In [3]:
import numpy as np

In [10]:
# Keep only the maximum of each row, zeroing every other entry
a = [[0,.5,1],[2,4,1]]
a_max = [max(row, default=None) for row in a] # One max() per row instead of one per element
a = [[row_max if value==row_max else 0 for value in row] for row, row_max in zip(a, a_max)]
a

[[0, 0, 1], [0, 4, 0]]

In [None]:
# Intersect gdf1 with gdf2 in a process pool, DIV rows of gdf2 per task,
# collecting results in completion order while advancing a progress bar.
if __name__ == '__main__':
    def formatDuration(seconds):
        '''Formats a duration in seconds as <whole days>d<HH:MM:SS>'''
        days, remainder = divmod(int(seconds), 24*60*60)
        return f'{days}d{time.strftime("%H:%M:%S", time.gmtime(remainder))}'

    completed = 0
    N = gdf2.shape[0]
    NUM_PROCESSES = 10
    DIV = 100 # Number of gdf2 rows handed to each task
    chunks = [slice(start, min(start+DIV, N)) for start in range(0, N, DIV)] # Row range of each task; the last one may be shorter
    NDIV = len(chunks)
    results = [None]*NDIV

    start_time = time.time()
    with multiprocess.Pool(NUM_PROCESSES) as pool: # .get_context("spawn")
        print(f'Generating pool, P={NUM_PROCESSES}, N={N}')
        ret = [pool.apply_async(testmp.intersectGDFareas,(gdf1,key1,gdf2.iloc[chunk,:],key2,areas[chunk],0,6931,gdf1b,gdf2b.iloc[chunk,:])) for chunk in chunks]
        print('Processing pool')
        pending = list(range(NDIV)) # Indices whose result has not been collected yet, in ascending order
        for i in ipypb.track(range(NDIV)): # One progress-bar step per collected result
            while True:
                # Lowest-index task that has finished but has not been collected
                ind_update = next((ind for ind in pending if ret[ind].ready()), None)
                if ind_update is not None:
                    results[ind_update] = ret[ind_update].get(999999)
                    pending.remove(ind_update)
                    completed += 1
                    if completed%10==0 or completed==1 or completed==(NDIV):
                        wall_time = time.time()-start_time
                        print(f'Finished {completed}/{NDIV}, wall time {formatDuration(wall_time)}')
                    break
                time.sleep(5) # Tasks are long-running, so poll infrequently
            
    wall_time = time.time()-start_time
    processor_time = sum(sum(r[1]) for r in results) # r[1] holds the per-row timings returned by the worker
    # The day count is formatted as an integer; applying a 'd' format spec to a float raises ValueError
    print(f'Pool processing concluded, process count {sum(r is not None for r in results)}/{NDIV}, wall time {formatDuration(wall_time)}, processor time {formatDuration(processor_time)}, speedup {processor_time/wall_time:.3}x')

Generating pool, P=10, N=56590
Processing pool


Finished 1/566, wall time  0dd00:00:35
Finished 10/566, wall time  0dd00:00:55
Finished 20/566, wall time  0dd00:01:45
Finished 30/566, wall time  0dd00:02:41
Finished 40/566, wall time  0dd00:03:36
Finished 50/566, wall time  0dd00:04:31
Finished 60/566, wall time  0dd00:05:21
Finished 70/566, wall time  0dd00:06:06
Finished 80/566, wall time  0dd00:06:56
Finished 90/566, wall time  0dd00:07:52


Digital, 10 processes, areas shortcut, buffer pre-processed



In [None]:
# Compile results: stitch the per-chunk outputs of the pool back together.
# New objects are built on every run, so re-running this cell does not grow
# results[0] in place or duplicate entries in times/areas.
import geopandas
import pandas as pd
gdf_union = pd.concat([r[0] for r in results], ignore_index=True) # Single concatenation instead of repeated append
times = [t for r in results for t in r[1]]
areas = [a for r in results for a in r[2]]

In [None]:
gdf_union

Digital, intersectGDFareas, buffer(0) pre-processed, now removing .get_context("spawn") in pool generation line

Digital, intersectGDFareas, buffer(0) pre-processed

    Finished 1/566, wall time 00:00:28
    Finished 10/566, wall time 00:00:54
    Finished 20/566, wall time 00:01:29
    Finished 30/566, wall time 00:02:04
    Finished 40/566, wall time 00:02:39
    Finished 50/566, wall time 00:03:14
    Finished 60/566, wall time 00:03:49
    Finished 70/566, wall time 00:04:34
    Finished 80/566, wall time 00:05:20
    Finished 90/566, wall time 00:05:55
    Finished 100/566, wall time 00:06:40
    Finished 110/566, wall time 00:07:20
    Finished 120/566, wall time 00:08:05
    Finished 130/566, wall time 00:08:51
    Finished 140/566, wall time 00:09:36
    Finished 150/566, wall time 00:10:17
    Finished 160/566, wall time 00:11:07
    Finished 170/566, wall time 00:11:57
    Finished 180/566, wall time 00:12:47
    Finished 190/566, wall time 00:13:38
    Finished 200/566, wall time 00:14:23
    Finished 210/566, wall time 00:15:23
    Finished 220/566, wall time 00:16:13
    Finished 230/566, wall time 00:16:48
    Finished 240/566, wall time 00:17:39
    Finished 250/566, wall time 00:18:24
    Finished 260/566, wall time 00:19:19
    Finished 270/566, wall time 00:20:04
    Finished 280/566, wall time 00:20:50
    Finished 290/566, wall time 00:21:40
    Finished 300/566, wall time 00:22:35
    Finished 310/566, wall time 00:23:15
    Finished 320/566, wall time 00:24:00
    Finished 330/566, wall time 00:24:56
    Finished 340/566, wall time 00:25:46
    Finished 350/566, wall time 00:26:36
    Finished 360/566, wall time 00:27:22
    Finished 370/566, wall time 00:28:07
    Finished 380/566, wall time 00:28:57
    Finished 390/566, wall time 00:29:48
    Finished 400/566, wall time 00:30:53
    Finished 410/566, wall time 00:32:03
    Finished 420/566, wall time 00:32:58
    Finished 430/566, wall time 00:33:49
    Finished 440/566, wall time 00:34:39
    Finished 450/566, wall time 00:35:39
    Finished 460/566, wall time 00:36:34
    Finished 470/566, wall time 00:37:25
    Finished 480/566, wall time 00:38:15
    Finished 490/566, wall time 00:39:06
    Finished 500/566, wall time 00:39:56
    Finished 510/566, wall time 00:40:41
    Finished 520/566, wall time 00:41:21
    Finished 530/566, wall time 00:42:12
    Finished 540/566, wall time 00:42:57
    Finished 550/566, wall time 00:43:52
    Finished 560/566, wall time 00:44:28
    Finished 566/566, wall time 00:44:53
    Pool processing concluded, process count 566/566, wall time 00:44:53, processor time 07:06:35, speedup 9.5x

Cartographic, intersectGDF, buffer(0) pre-processed

    565/566 [08:05:30<05:17:42, 51.56s/it]
    565/566 [15:04:57<06:59:28, 95.93s/it]

    Finished 1/566, wall time 00:01:25
    Finished 10/566, wall time 00:04:01
    Finished 20/566, wall time 00:06:21
    Finished 30/566, wall time 00:08:47
    Finished 40/566, wall time 00:11:23
    Finished 50/566, wall time 00:13:48
    Finished 60/566, wall time 00:16:25
    Finished 70/566, wall time 00:18:50
    Finished 80/566, wall time 00:21:21
    Finished 90/566, wall time 00:23:41
    Finished 100/566, wall time 00:26:07
    Finished 110/566, wall time 00:28:43
    Finished 120/566, wall time 00:31:04
    Finished 130/566, wall time 00:33:25
    Finished 140/566, wall time 00:35:35
    Finished 150/566, wall time 00:38:01
    Finished 160/566, wall time 00:40:22
    Finished 170/566, wall time 00:42:57
    Finished 180/566, wall time 00:45:33
    Finished 190/566, wall time 00:48:19
    Finished 200/566, wall time 00:50:49
    Finished 210/566, wall time 00:53:25
    Finished 220/566, wall time 00:55:51
    Finished 230/566, wall time 00:58:11
    Finished 240/566, wall time 01:00:37
    Finished 250/566, wall time 01:03:08
    Finished 260/566, wall time 01:05:38
    Finished 270/566, wall time 01:08:39
    Finished 280/566, wall time 01:11:20
    Finished 290/566, wall time 01:14:01
    Finished 300/566, wall time 01:16:21
    Finished 310/566, wall time 01:18:47
    Finished 320/566, wall time 01:21:23
    Finished 330/566, wall time 01:23:48
    Finished 340/566, wall time 01:26:14
    Finished 350/566, wall time 01:29:05
    Finished 360/566, wall time 01:32:10
    Finished 370/566, wall time 01:35:06
    Finished 380/566, wall time 01:37:47
    Finished 390/566, wall time 01:40:48
    Finished 400/566, wall time 01:43:44
    Finished 410/566, wall time 01:46:54
    Finished 420/566, wall time 01:50:15
    Finished 430/566, wall time 01:53:06
    Finished 440/566, wall time 01:56:02
    Finished 450/566, wall time 01:58:33
    Finished 460/566, wall time 02:01:13
    Finished 470/566, wall time 02:03:39
    Finished 480/566, wall time 02:06:05
    Finished 490/566, wall time 02:08:35
    Finished 500/566, wall time 02:11:11
    Finished 510/566, wall time 02:13:37
    Finished 520/566, wall time 02:16:07
    Finished 530/566, wall time 02:18:43
    Finished 540/566, wall time 02:21:09
    Finished 550/566, wall time 02:23:55
    Finished 560/566, wall time 02:28:06
    Finished 566/566, wall time 15:04:59
    Pool processing concluded, process count 566/566, wall time 15:04:59, processor time 18:42:22, speedup 2.83x

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
plt.plot(range(len(times)),np.log10(times),'.',alpha=0.2,label='All DAs')
plt.legend()
plt.xlabel('DA Index')
plt.ylabel('Log$_{10}$(Time [s])');

In [None]:
sum([not r is None for r in results])

In [None]:
results = results[0:566]

In [None]:
    # Recompute the pool summary from the collected results; the denominator is the
    # number of tasks (one per chunk), not the number of rows N
    processor_time = sum(sum(r[1]) for r in results)
    print(f'Pool processing concluded, process count {sum(r is not None for r in results)}/{len(results)}, wall time {time.strftime("%H:%M:%S", time.gmtime(wall_time))}, processor time {time.strftime("%H:%M:%S", time.gmtime(processor_time))}, speedup {processor_time/wall_time:.3}x')

Digital, 10 process, all overlaps

Generating pool, P=10, N=56590
    Processing pool

    100% 565/566 [01:10:04<00:30, 7.43s/it]

    Finished 1/566, wall time 00:00:45
    Finished 10/566, wall time 00:01:00
    Finished 20/566, wall time 00:02:16
    Finished 30/566, wall time 00:03:31
    Finished 40/566, wall time 00:04:47
    Finished 50/566, wall time 00:05:57
    Finished 60/566, wall time 00:07:08
    Finished 70/566, wall time 00:08:13
    Finished 80/566, wall time 00:09:24
    Finished 90/566, wall time 00:10:29
    Finished 100/566, wall time 00:11:35
    Finished 110/566, wall time 00:12:41
    Finished 120/566, wall time 00:13:46
    Finished 130/566, wall time 00:14:47
    Finished 140/566, wall time 00:15:57
    Finished 150/566, wall time 00:17:03
    Finished 160/566, wall time 00:18:13
    Finished 170/566, wall time 00:19:24
    Finished 180/566, wall time 00:20:29
    Finished 190/566, wall time 00:21:40
    Finished 200/566, wall time 00:22:50
    Finished 210/566, wall time 00:24:01
    Finished 220/566, wall time 00:25:07
    Finished 230/566, wall time 00:26:17
    Finished 240/566, wall time 00:27:23
    Finished 250/566, wall time 00:28:29
    Finished 260/566, wall time 00:29:34
    Finished 270/566, wall time 00:30:40
    Finished 280/566, wall time 00:31:45
    Finished 290/566, wall time 00:33:01
    Finished 300/566, wall time 00:34:11
    Finished 310/566, wall time 00:35:22
    Finished 320/566, wall time 00:36:27
    Finished 330/566, wall time 00:37:33
    Finished 340/566, wall time 00:38:38
    Finished 350/566, wall time 00:39:59
    Finished 360/566, wall time 00:41:05
    Finished 370/566, wall time 00:42:21
    Finished 380/566, wall time 00:43:46
    Finished 390/566, wall time 00:44:57
    Finished 400/566, wall time 00:46:32
    Finished 410/566, wall time 00:48:28
    Finished 420/566, wall time 00:50:19
    Finished 430/566, wall time 00:52:00
    Finished 440/566, wall time 00:53:31
    Finished 450/566, wall time 00:54:51
    Finished 460/566, wall time 00:56:33
    Finished 470/566, wall time 00:57:58
    Finished 480/566, wall time 00:59:29
    Finished 490/566, wall time 01:00:45
    Finished 500/566, wall time 01:02:00
    Finished 510/566, wall time 01:03:11
    Finished 520/566, wall time 01:04:16
    Finished 530/566, wall time 01:05:27
    Finished 540/566, wall time 01:06:32
    Finished 550/566, wall time 01:07:58
    Finished 560/566, wall time 01:09:19
    Pool processing concluded, process count 566/56590, wall time 01:10:04, processor time 11:09:04, speedup 9.55x

    diff1_i1 : All lines, was previously diff1_instance
    diff1_i2 : Only diff lines

In [None]:
%%time
diff1_i1 = [[[('- ' if i<0 else '+ ' if i>0 else '  ')+ss for ss in s.split(',') if ss!=''] for i, s in r[0]] for r in results]

In [None]:
diff1_i1[3]

In [None]:
%%time
diff1_i2 = [[[('- ' if i<0 else '+ ' if i>0 else '  ')+ss for ss in s.split(',') if ss!=''] for i, s in r[0] if i!=0] for r in results]

In [None]:
diff1_i2[3]

In [None]:
%%time
diff1_1 = [[] for d in diff1_i1]
_ = [[l.extend(dd) for dd in d if dd!=''] for d, l in zip(diff1_i1, diff1_1)]

In [None]:
diff1_1[3]

In [None]:
%%time
diff1_2 = [[] for d in diff1_i2]
_ = [[l.extend(dd) for dd in d if dd!=''] for d, l in zip(diff1_i2, diff1_2)]

In [None]:
diff1_2[3]

Now check the lengths

In [None]:
%%time
len_diff1_1 = [sum(len(dd) for dd in d) for d in diff1_1]
len_diff1_2 = [sum(len(dd) for dd in d) for d in diff1_2]
all([a>=b for a, b in zip(len_diff1_1,len_diff1_2)])

In [None]:
[*zip(len_diff1_1,len_diff1_2)][809]

In [None]:
len(diff1_i1[809][0]),len(diff1_i1[809][1])

In [None]:
np.argmax(len_diff1_1), np.argmax(len_diff1_2)

In [None]:
a = np.sort(len_diff1_2)

In [None]:
a[-4]

In [None]:
%%time
diff1_instance_divided = [[[(i, ss) for ss in s.split(',')] for i, s in r[0]] for r in results]

In [None]:
diff1_instance_divided[3]

In [None]:
[[for i, s in r[0]] for r in results]

In [None]:
diff1_instance[3]

In [None]:
diff1_i1_donly[3]

In [None]:
len_diff1_instance = [sum(len(dd) for dd in d) for d in diff1_instance]

In [None]:
len_diff1_instance[809]

In [None]:
len_diff1_i1= [sum(len(dd) for dd in d) for d in diff1_i1]

In [None]:
len_diff1_i1[809]

In [None]:
len_diff1_i1_donly = [sum(len(dd) for dd in d) for d in diff1_i1_donly]

In [None]:
len_diff1_i1_donly[809]

In [None]:
plt.scatter(y[:len(t)],np.sqrt(t))

Ahah, the time is quadratic in the output length...

In [None]:
diff1_i1[3]

In [None]:
y = diff1_instance[3]
y

In [None]:
print(results[3])

In [None]:
diff1_len = [sum(1 for _ in d) for d in diff1_i2]

In [None]:
diff1_instance = diff1_i2

In [None]:
# Find the number of line alterations for each geometry pair
diff1_changecount = [sum(1 for e in d if e[0]!=' ') for d in diff1_instance]

In [None]:
diff1_changenumber = sum(np.array(diff1_changecount)>0)
buffind_diff1 = np.nonzero(diff1_changecount)[0]
print(f'There are {diff1_changenumber} FSA geometries with alterations upon buffering')
print('Indices of altered FSA geometries:')
print(buffind_diff1)

In [None]:
all(np.where(match1==False)[0] == np.where(np.array(diff1_changecount)>0)[0])

In [None]:
minind = 0 # Index to display, sorted by ascending number of diff lines
ind_changed = np.where(np.array(diff1_changecount)>0)[0]
ind_changed_minlen = ind_changed[np.argsort(np.array(diff1_len)[ind_changed])[minind]]
print(f"The {1+minind}{'st' if 1+minind==1 else 'nd' if 1+minind==2 else 'rd' if 1+minind==3 else 'th'} shortest geometry that changes with buffering is at index {ind_changed_minlen} with length of {diff1_len[ind_changed_minlen]} coordinates (including modification listings)\n")
display(diff1_instance[ind_changed_minlen])

In [None]:
def condenseDiff(diff):
    '''Extracts changed lines from diff and cancels equivalent additions and subtractions.
    
    Parameters
    ----------
    diff: a list of strings, the result of a line difference (difflib.ndiff())
    
    Returns
    -------
    list of strings, the lines of diff comprising changes that do not appear as both additions and subtractions;
        additions come first, then subtractions, each in order of first appearance and without repeats
    
    Notes
    -----
    Empty lines and lines without a '+' or '-' prefix are ignored
    '''
    # dict.fromkeys de-duplicates like a set but keeps first-appearance order, making the output deterministic
    # d[:1] (rather than d[0]) tolerates empty lines
    added = dict.fromkeys(d[2:] for d in diff if d[:1]=='+')
    removed = dict.fromkeys(d[2:] for d in diff if d[:1]=='-')
    additions = ['+ '+a for a in added if a not in removed]
    subtractions = ['- '+s for s in removed if s not in added]
    return [*additions, *subtractions]

In [None]:
condenseDiff(diff1_instance[3])

In [None]:
def condenseDiffPoints(diff, keep_comments=False, strip_test=False, strip_output=False, use_sets=False):
    '''Extracts changed lines from diff and cancels equivalent additions and subtractions.
    
    Parameters
    ----------
    diff: list of str, the result of a line difference (difflib.ndiff()) on a WKT geometry that was split by comma delimiters
    keep_comments: bool, if True, comments prefixed by '?' and their associated line are kept even if the associated line's point could be paired and cancelled
    strip_test: bool, if True, lines are compared on the text between the last '(' and the first ')' only, so that a point matches regardless of surrounding WKT brackets and geometry type names
    strip_output: bool, if True, returns the bare points prefixed with addition and subtraction instead of the original diff line
    use_sets: bool, if True, perform cancelling using sets instead of lists (may not preserve point repetition); forces strip_output
    
    Returns
    -------
    list of strings, the lines of diff comprising changes that do not appear as both additions and subtractions, in their original order
    
    Notes
    -----
    1) Lines are classified by their first character ('+', '-', '?'); empty lines are ignored
    2) Stripped copies of the lines are created for cancel testing
    3) Without use_sets, each addition cancels the earliest remaining subtraction of the same point, so repeated points cancel one-for-one
    4) Matching uses dictionaries and sets so the run time grows linearly with the number of diff lines
    '''
    diff = np.array(list(diff))
    markers = [d[:1] for d in diff] # d[:1] (rather than d[0]) tolerates empty lines
    addind = [i for i, m in enumerate(markers) if m=='+']
    subind = [i for i, m in enumerate(markers) if m=='-']
    altind = [i for i, m in enumerate(markers) if m=='?'] # if difflib.ndiff() functions properly, '?' only follows '+' or '-'
    
    if keep_comments==False:
        altind = []
        addind_alt = []
        subind_alt = []
        addind_lon = addind
        subind_lon = subind
    else:
        # Lines immediately followed by a '?' comment are kept unconditionally ("alt");
        # the remaining ("lone") lines take part in cancelling
        altset = set(altind)
        addind_alt = [i for i in addind if (i+1) in altset]
        subind_alt = [i for i in subind if (i+1) in altset]
        addset_alt = set(addind_alt)
        subset_alt = set(subind_alt)
        addind_lon = [i for i in addind if i not in addset_alt]
        subind_lon = [i for i in subind if i not in subset_alt]

    # Get only the points for comparison
    difflist = np.array([d[2:] for d in diff])
    if strip_test:
        difflist = np.array([d[(d.rfind("(")+1):(len(d) if d.find(")")==-1 else d.find(")"))] for d in difflist])

    if use_sets:
        addpoints = set(difflist[addind])
        subpoints = set(difflist[subind])
        # Index of the first line carrying each point, whatever its prefix
        first_index = {}
        for i, point in enumerate(difflist):
            first_index.setdefault(point, i)
        addind_short = [first_index[point] for point in addpoints.difference(subpoints)]
        subind_short = [first_index[point] for point in subpoints.difference(addpoints)]
        strip_output = True

    else:
        # For each point, the lone subtraction indices still available for cancelling,
        # stored in reverse so that pop() yields the earliest one
        waiting = {}
        for i in reversed(subind_lon):
            waiting.setdefault(difflist[i], []).append(i)

        cancelled = set()
        addind_short = []
        for i in addind_lon:
            candidates = waiting.get(difflist[i])
            if candidates:
                cancelled.add(candidates.pop())
            else:
                addind_short.append(i)
        subind_short = [i for i in subind_lon if i not in cancelled]
    
    ret_ind = sorted(addind_alt + subind_alt + altind + addind_short + subind_short)
    
    if strip_output:
        addmark = set(addind_alt) | set(addind_short)
        submark = set(subind_alt) | set(subind_short)
        retlist = [('+ ' if i in addmark else '- ' if i in submark else '? ') + difflist[i].strip() for i in ret_ind]
    else:
        retlist = list(diff[ret_ind])
    return retlist

In [None]:
condenseDiffPoints(diff1_instance[3], keep_comments=True, strip_test=False)

In [None]:
diff1_instance[3]

In [None]:
%%time
import time
x = [] # Condensed diff1_instance
t = [] # computation time
for i in range(len(diff1_instance)):
    start_time = time.time()
    x.append(condenseDiffPoints(diff1_instance[i], keep_comments=False, strip_test=True))
    end_time = time.time()
    t.append(end_time-start_time)
    print(f'{i}, {t[-1]:.6}')

Had to stop on iteration 809, kept going forever!

In [None]:
y = [sum(len(dd) for dd in d) for d in diff1_instance]

In [None]:
plt.scatter(y[:len(t)],np.sqrt(t))

Ahah, the time is quadratic in the input length... makes sense as I have many loops?

In [None]:
l = [sum(len(dd) for dd in d) for d in results]

In [None]:
x[3]

In [None]:
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
plt.scatter([sum([len(yy) for yy in y]) for y in x],t)

In [None]:
%%time
diff1_changelist = [condenseDiffPoints(d, keep_comments=False, strip_test=True, strip_output=False) for d in diff1_instance]
diff1_changelist_ind = [i for i, d in enumerate(diff1_changelist) if d!=[]]
diff1_changelist_ind_added = [sum(1 for e in diff1_changelist[i] if e[0]=='+') - sum(1 for e in diff1_changelist[i] if e[0]=='-') for i in diff1_changelist_ind]
buffind_changed1 = np.array(diff1_changelist_ind)
buffind_added1 = buffind_changed1[np.nonzero(diff1_changelist_ind_added)[0]]
print(f'There are {len(diff1_changelist_ind)} FSA geometries with non-condensable alterations.')
print(f"There are {len(buffind_added1)} FSA geometries with altered total number of points.\n")
for i, j in zip(diff1_changelist_ind, diff1_changelist_ind_added):
    print(f"Altered FSA points at index {i}, with total {len(diff1_instance[i])} diff lines, net change {j} coordinates:")
    display(diff1_changelist[i])

In [None]:
s = 'asdf,saf'
print(s)
s = s.replace(',','\n')
print(s)
s = s.replace('\n',',')
print(s)

In [None]:
import diff_match_patch
help(diff_match_patch.diff_match_patch.diff_cleanupSemantic)

In [None]:
import diff_match_patch
help(diff_match_patch.diff_match_patch)

In [None]:
wkt = ''
wkt.replace(',','\n')

In [None]:
print(all([r!=None for r in results]))

In [None]:
 |  BLANKLINEEND = re.compile('\\n\\r?\\n$')
 |  
 |  BLANKLINESTART = re.compile('^\\r?\\n\\r?\\n')

In [None]:
help(diff_match_patch.diff_match_patch.diff_linesToChars)

In [None]:
import diff_match_patch
#help(diff_match_patch.diff_match_patch.diff_linesToChars)
help(diff_match_patch.diff_match_patch)

In [None]:
import inspect
print(inspect.getsource(diff_match_patch.diff_match_patch.diff_linesToChars))

In [None]:
import inspect
print(inspect.getsource(diff_match_patch.diff_match_patch.diff_bisect))

In [None]:
help(ipypb.progressbar.ConfigurableProgressBar.__next__)

In [None]:
indb_empty = [r==None for r in results]

In [None]:
[len(r[0]) for r in results].index(8)

In [None]:
results[809]

In [None]:
import multiprocess
import testmp # Within notebooks, the function executed in a new process must be in a separate file, or else it will execute only within the notebook process
import time
import ipypb # Lightweight progress bar, source copied from GitHub

# Diff every original geometry against its buffered copy in a process pool,
# collecting every finished task on each poll.
completed = 0
start_time = time.time()
if __name__ == '__main__':
    POLL_INTERVAL = 0.05 # Seconds to wait between polls while tasks are still running
    completed = 0
    N = gdf1.shape[0]
    results = [None]*N
    NUM_PROCESSES = 10
    with multiprocess.Pool(NUM_PROCESSES) as pool:
        print(f'Generating pool, P={NUM_PROCESSES}, N={N}')
        ret = [pool.apply_async(testmp.dmpDiff,(gdf1['Geometry'].iloc[ind], gdf1b['Geometry'].iloc[ind], ind)) for ind in range(N)]
        print('Processing pool')
        pending = set(range(N)) # Indices whose result has not been collected yet
        while pending:
            finished = [i for i in sorted(pending) if ret[i].ready()]
            for i in finished:
                results[i] = ret[i].get(999999)
                completed += 1
            pending.difference_update(finished)
            if pending:
                time.sleep(POLL_INTERVAL) # Avoid spinning a core that the worker processes could use
    wall_time = time.time()-start_time
    processor_time = sum(sum(r[2]) for r in results) # r[2] holds the per-task timings returned by the worker
    print(f'Pool processing concluded, wall time {time.strftime("%H:%M:%S", time.gmtime(wall_time))}, processor time {time.strftime("%H:%M:%S", time.gmtime(processor_time))}, speedup {processor_time/wall_time:.3}x')

In [None]:
indb_nochange = [len(r[0])==1 for r in results]
ind_change = [i for i, b in enumerate(indb_nochange) if (not b)]
print(ind_change)

In [None]:
len(results[3][0])

In [None]:
multiprocess.cpu_count()

In [None]:
help(dmp_module)