In [1]:
import pathlib
import numpy as np

In [60]:
# Load the commuter table into a 2-D integer array (one row per CSV record,
# header line skipped). Every field in the file is wrapped in double quotes.
data_path = pathlib.Path("../data/commuters.csv")


def _quoted_int(field):
    """Convert a double-quoted integer field such as '"1234"' to an int."""
    return int(field.strip('"'))


data = np.genfromtxt(
    data_path,
    delimiter=",",
    skip_header=1,
    dtype=int,
    # Pass the encoding explicitly so the converters always receive `str`.
    # With the legacy default (encoding="bytes") they are handed `bytes`,
    # for which the str comparison / strip below cannot work.
    encoding="utf-8",
    converters={
        # Column 0: the quoted flag "R" becomes 1, every other value 0.
        0: lambda field: int(field == '"R"'),
        # Columns 1-6: quoted integers.
        **{column: _quoted_int for column in range(1, 7)},
    },
)
print(data.shape)

(449720, 7)


In [61]:
# Drop every row that refers to a placeholder identifier:
# 7777 stands for "other commune", 77 for "other canton".
canton_columns = data[:, [2, 4]]   # the two canton-identifier columns
commune_columns = data[:, [3, 5]]  # the two commune-identifier columns

# Boolean row masks: True where at least one of the two columns holds the placeholder.
commune_invalid = (commune_columns == 7777).any(axis=1)
canton_invalid = (canton_columns == 77).any(axis=1)
# A row is clean when neither mask flags it.
clean_rows = ~canton_invalid & ~commune_invalid

print((~clean_rows).sum())    # number of rows that will be removed
print(commune_invalid.sum())  # rows with a 7777 commune entry
print(canton_invalid.sum())   # rows with a 77 canton entry

# Rows flagged for the canton but not for the commune; a count of 0 means
# every row with a 77 canton also carries a 7777 commune.
print((canton_invalid & ~commune_invalid).sum())

# Keep only the rows free of "other canton" / "other commune" entries.
data = data[clean_rows]

# Split by the flag in column 0 and drop that column from the result.
r_rows = data[data[:, 0] == 1, 1:]  # "R" as perspective
w_rows = data[data[:, 0] == 0, 1:]  # "W" as perspective

assert r_rows.shape[0] == w_rows.shape[0]
print("Same number of (cleaned) r and w entries:", len(r_rows))

# Element-wise comparison is enough because the r and w rows share the same order.
assert np.array_equal(r_rows, w_rows)
print("Arrays equal! That means that the cleaned r and w rows match perfectly!")


67418
67418
12321
0
Same number of (cleaned) r and w entries: 191151
Arrays equal! That means that the cleaned r and w rows match perfectly!
