# Analysis Optimisation

## Data preparation

### Spot anomalies

Before we initiate cleaning the data, we need to know how the data looks and determine which things are out of the ordinary. 
Lets read one of the files in order to determine the approach. 

In [3]:
import pandas as pd
import sklearn
import os
import numpy as np
import re

In [4]:
with open("datasets/sources.d") as input:
    for line in input:
        print(line)

Sourcenummer Sourcenaam	auteur	datum	omschrijving	verwijderen

6411 "protools\_dblist.r" "" ? "" no

6457 "prohelp\_msgs.r" "" ? "" no

6459 "protools\_propath.r" "" ? "" no

6461 "protools\_protool.r" "" ? "" no

7335 "prodict\dump_df.r" "" ? "" no

9071 "src\adm\method\containr.i" "" ? "" no

9072 "src\adm\method\smart.i" "" ? "" no

9073 "src\adm\method\attribut.i" "" ? "" no

9075 "src\adm\template\dialogmn.i" "" ? "" no

9076 "src\adm\template\row-head.i" "" ? "" no

9077 "src\adm\template\row-end.i" "" ? "" no

9078 "src\adm\template\snd-head.i" "" ? "" no

9079 "src\adm\template\snd-list.i " "" ? "" no

9080 "src\adm\template\snd-end.i" "" ? "" no

11451 "adeuib\_prvw4gl.r" "" ? "" no

16059 "auditing\_auditmainw.w" "" ? "" no

16060 "o2o.lib\order.p" "John Kattestaart" ? "Routines tbv order" no

16061 "xp\mri.p" "" ? "Machine Route Interface" no

16062 "lib\bus-mdrkrt.p" "Rudi Middelkoop" ? "Moederkaart procedures" no

16063 "lib\capcontr.p" "Marcel Overwater" ? "Controleren & 

We know there are more files in that directory so this can be written more optimally to do this for every file

In [5]:
def readFile(fileName):
    with open(fileName) as input:
        for line in input:
            print(line)

In [6]:
location = "datasets/"
dir_list = os.listdir(location)
for directory in dir_list:
    print(directory)

sources.d
tabellen.d
tabellen-per-source.d
velden.d
velden-per-source.d


In [7]:
for file in dir_list:
    with open(location + file) as input:
        print(file)
        headers = input.readline().split()
        print(headers)
        print(len(headers))

sources.d
['Sourcenummer', 'Sourcenaam', 'auteur', 'datum', 'omschrijving', 'verwijderen']
6
tabellen.d
['Tabnummerr', 'tabnaam', 'omschrijving', 'databasenummer']
4
tabellen-per-source.d
['sourcenummer', 'tabelnummer', 'Create', 'Delete', 'Read', 'Update', 'Sourcenummer(redacted)']
7
velden.d
['tabelnummer', 'veldnummer', 'veldnaam']
3
velden-per-source.d
['sourcenummer', 'veldnummer', 'datamutatie', 'sourcenummer(redundant)']
4


In [8]:
class DSStandard:
    def __init__(self, _headers, _location):
        self.headers = _headers
        self.location = _location
    def getHeadersLength(self):
        return len(self.headers)

In [9]:
headersFiles = {}
for file in dir_list:
    with open(location + file) as input:
        headers = input.readline().split()
        dsStandard = DSStandard(headers, location+file)
        headersFiles[file] = dsStandard

for entry in headersFiles:
    print(entry+ ":"+str(headersFiles[entry].headers))

sources.d:['Sourcenummer', 'Sourcenaam', 'auteur', 'datum', 'omschrijving', 'verwijderen']
tabellen.d:['Tabnummerr', 'tabnaam', 'omschrijving', 'databasenummer']
tabellen-per-source.d:['sourcenummer', 'tabelnummer', 'Create', 'Delete', 'Read', 'Update', 'Sourcenummer(redacted)']
velden.d:['tabelnummer', 'veldnummer', 'veldnaam']
velden-per-source.d:['sourcenummer', 'veldnummer', 'datamutatie', 'sourcenummer(redundant)']


### sources.d

In [10]:
readFile(location+dir_list[0])

Sourcenummer Sourcenaam	auteur	datum	omschrijving	verwijderen

6411 "protools\_dblist.r" "" ? "" no

6457 "prohelp\_msgs.r" "" ? "" no

6459 "protools\_propath.r" "" ? "" no

6461 "protools\_protool.r" "" ? "" no

7335 "prodict\dump_df.r" "" ? "" no

9071 "src\adm\method\containr.i" "" ? "" no

9072 "src\adm\method\smart.i" "" ? "" no

9073 "src\adm\method\attribut.i" "" ? "" no

9075 "src\adm\template\dialogmn.i" "" ? "" no

9076 "src\adm\template\row-head.i" "" ? "" no

9077 "src\adm\template\row-end.i" "" ? "" no

9078 "src\adm\template\snd-head.i" "" ? "" no

9079 "src\adm\template\snd-list.i " "" ? "" no

9080 "src\adm\template\snd-end.i" "" ? "" no

11451 "adeuib\_prvw4gl.r" "" ? "" no

16059 "auditing\_auditmainw.w" "" ? "" no

16060 "o2o.lib\order.p" "John Kattestaart" ? "Routines tbv order" no

16061 "xp\mri.p" "" ? "Machine Route Interface" no

16062 "lib\bus-mdrkrt.p" "Rudi Middelkoop" ? "Moederkaart procedures" no

16063 "lib\capcontr.p" "Marcel Overwater" ? "Controleren & 

#### Getting rid of empty array item

After inspecting the file, the first row is clearly tab seperated and the subsequent rows with data are space seperated. 
A problem however is that the rows contain data within quotes that also contain spaces so we have to write a requirement that checks that first before creating a new data value. 

In [11]:
def checkDataWithinQuotes(data):
    result = data.split('"')
    print(result)

We have to adjust the previous method of "readFile" since it can include the checkDataWithQuotes now

In [12]:
def readFile(fileName):
    with open(fileName) as input:
        for line in input:
            checkDataWithinQuotes(line)

In [13]:
readFile(location+dir_list[0])

['Sourcenummer Sourcenaam\tauteur\tdatum\tomschrijving\tverwijderen\n']
['6411 ', 'protools\\_dblist.r', ' ', '', ' ? ', '', ' no\n']
['6457 ', 'prohelp\\_msgs.r', ' ', '', ' ? ', '', ' no\n']
['6459 ', 'protools\\_propath.r', ' ', '', ' ? ', '', ' no\n']
['6461 ', 'protools\\_protool.r', ' ', '', ' ? ', '', ' no\n']
['7335 ', 'prodict\\dump_df.r', ' ', '', ' ? ', '', ' no\n']
['9071 ', 'src\\adm\\method\\containr.i', ' ', '', ' ? ', '', ' no\n']
['9072 ', 'src\\adm\\method\\smart.i', ' ', '', ' ? ', '', ' no\n']
['9073 ', 'src\\adm\\method\\attribut.i', ' ', '', ' ? ', '', ' no\n']
['9075 ', 'src\\adm\\template\\dialogmn.i', ' ', '', ' ? ', '', ' no\n']
['9076 ', 'src\\adm\\template\\row-head.i', ' ', '', ' ? ', '', ' no\n']
['9077 ', 'src\\adm\\template\\row-end.i', ' ', '', ' ? ', '', ' no\n']
['9078 ', 'src\\adm\\template\\snd-head.i', ' ', '', ' ? ', '', ' no\n']
['9079 ', 'src\\adm\\template\\snd-list.i ', ' ', '', ' ? ', '', ' no\n']
['9080 ', 'src\\adm\\template\\snd-end.i', ' 

The first line contains the headers of the columns. In total there are 6 columns. Lets make sure the lines are properly read as well. 

A by product of this implementation seems to be that a redundant space between a pair of double quotes, is being mistaken for data as well.
Lets fix that. 

In [14]:
def checkDataWithinQuotes(index, data):
    result = data.split('"')
    if len(result) != 1:
        result.pop(2)
    return result

In [15]:
def readFile(fileName):
    datasetTemp = []
    with open(fileName) as input:
        for index, line in enumerate(input):
            datasetTemp.append(checkDataWithinQuotes(index, line))
    return datasetTemp

In [16]:
dataset = readFile("datasets/"+dir_list[0])

#### Standardize the dataset

In [17]:
print(headersFiles["sources.d"].getHeadersLength())

6


An observation based on the previous cell shows that there are entries that deviate from the usual 6. Determine if there are more deviations. 

In [18]:
anomalyCheckSources = []
for entry in dataset:
    anomalyCheckSources.append(len(entry))

In [19]:
uniqueEntries = set(anomalyCheckSources)
print(uniqueEntries)

{1, 6, 8, 10, 14, 18, 22}


Based on the sets, we can create key value pairs to assign the entries of the dataset to. This will make the data cleaning a lot easier.

In [20]:
dictionaryEntries = {}
for entry in uniqueEntries:
    dictionaryEntries[entry] = []
print(dictionaryEntries)

{1: [], 6: [], 8: [], 10: [], 14: [], 18: [], 22: []}


In [21]:
anomalies = []
for entry in dataset:
    if len(entry) != 1 and len(entry) != 6: 
        anomalies.append(entry)

The entries that need to be ignored are either:
- the header
- the correct amount

so those fields do not have to be cleaned. 
The correct amount is based on the fields mentioned in the header. 

Might have to adjust the files to seperate the headers with commas, making the seperation easier. 

In [22]:
switcher = {
    1:0,
    6:0,
    8:0,
    10:0,
    14:0,
    18:0,
    22:0
}

for anomaly in anomalies:
    switcher[len(anomaly)] = switcher[len(anomaly)] + 1 
print(switcher)

{1: 0, 6: 0, 8: 2, 10: 22, 14: 17, 18: 24, 22: 3}


This can be done more efficiently: combine the previous two blocks by assigning the entries with the corresponding lengths.

In [23]:
for entry in dataset:
    dictionaryEntries[len(entry)].append(entry) 

In [24]:
for entry in dictionaryEntries:
    print(str(entry)+ ":" + str(dictionaryEntries[entry][0]))

1:['Sourcenummer Sourcenaam\tauteur\tdatum\tomschrijving\tverwijderen\n']
6:['6411 ', 'protools\\_dblist.r', '', ' ? ', '', ' no\n']
8:['22774 ', 'o2o.klo\\p4521c0.p', "U) THEN 'NEW,OPEN,COPY,DEL':U ELSE '':U)", '', '', ' ? ', 'Overzicht gusi', ' no\n']
10:['16597 ', 'zs\\zsopsl.w', 'Matthijs Kaan', ' ? ', 'Opslag, afroep en artikel ', '', 'verbruik', '', '-gegevens', ' no\n']
14:['16842 ', 'cv\\gbin01008.p', 'engelpe', ' ? ', 'Toevoegen/verwijderen gbin entries  met gbin.InstellingCd = ', '', 'Properties', '', ':U  AND gbin.ObjectCd  = ', '', 'P5547C0-BRW-ORDERMUT', '', '', ' no\n']
18:['20825 ', 'cv\\cpcd01187.p', 'engelpe', ' ? ', 'Toevoegen/verwijderen cpcd entries  met cpcd.TabelNm = ', '', 'Rep', '', ':U AND cpcd.VeldNm = ', '', 'SoortRapportCd', '', ':U AND cpcd.SysteemCd BEGINS ', '', 'AUD', '', ':U', ' no\n']
22:['27642 ', 'cv\\vrtl00022.p', 'engelpe', ' ? ', 'Toevoegen/verwijderen vrtl entries  met vrtl.VrtKd = ', '', '031', '', ' AND vrtl.ContextCd = ', '', 'DT', '', ' AND v

In [25]:
# Starting point should be a variable that can be passed to a method
for listCount in dictionaryEntries:
    # startingPoint = 4
    print(dictionaryEntries[listCount])
    # for string in dictionaryEntries[listCount]:
    #     while startingPoint < listCount:
    #         print(dictionaryEntries[listCount][startingPoint])
    #         startingPoint += 1
            

[['Sourcenummer Sourcenaam\tauteur\tdatum\tomschrijving\tverwijderen\n'], ['.\n'], ['PSC\n'], ['filename=src\n'], ['records=0000000013330\n'], ['ldbname=ref\n'], ['timestamp=2025/10/28-13:33:35\n'], ['numformat=46,44\n'], ['dateformat=dmy-1970\n'], ['map=NO-MAP\n'], ['cpstream=ISO8859-1\n'], ['.\n'], ['0000972860\n']]
[['22774 ', 'o2o.klo\\p4521c0.p', "U) THEN 'NEW,OPEN,COPY,DEL':U ELSE '':U)", '', '', ' ? ', 'Overzicht gusi', ' no\n'], ['22865 ', 'o2o.klo\\p4536c0.p', "U) THEN 'NEW,OPEN,COPY,DEL':U ELSE '':U)", '', '', ' ? ', 'Overzicht factuuradres liable entity', ' no\n']]
[['16597 ', 'zs\\zsopsl.w', 'Matthijs Kaan', ' ? ', 'Opslag, afroep en artikel ', '', 'verbruik', '', '-gegevens', ' no\n'], ['16619 ', 'lib\\bufcmp.i ', 'Dirk vreken', ' ? ', 'Algemene ', '', 'buffer-compare', '', '-procedures', ' no\n'], ['16804 ', 'cobis\\lib\\relation.cls', 'Peter Engel', ' ? ', 'Class Library voor Relatie  (', '', 'copy', '', ' from relatie.cls)', ' no\n'], ['16846 ', 'cv\\gbin01012.p', 'lind

The pattern we are seeing, is that when a list is longer than 6, the description contains multiple quotes. We just have concatenate the entries starting with 4 until last -1. That should be the entry in the list so lets try that.

The implementation as seen above, does not function as intended. The kernel is endlessly loading.
Optimising the code is essential.
When critically looking at the "description" fields, there is a pattern:
It starts at the 5th variable (index 4) and ends at the end of the list -1 due to the last string being a boolean. In other words, not part of the description variable. 

In [26]:
def aggregateString(startingPoint, line):
    arr = np.array(line)
    result = arr[startingPoint:len(line)-1]
    return ''.join(result)

Testing to check if the method works as intended.

In [27]:
test = aggregateString(4, dictionaryEntries[18][3])
print(test)
print(dictionaryEntries[18][3])

Toevoegen/verwijderen cpcd entries  met cpcd.TabelNm = REP AND cpcd.VeldNm = SoortRapportCd AND cpcd.SysteemCd = AUD15
['26370 ', 'cv\\cpcd01191.p', 'engelpe', ' ? ', 'Toevoegen/verwijderen cpcd entries  met cpcd.TabelNm = ', '', 'REP', '', ' AND cpcd.VeldNm = ', '', 'SoortRapportCd', '', ' AND cpcd.SysteemCd = ', '', 'AUD15', '', '', ' no\n']


In [28]:
for entry in dictionaryEntries:
    if entry != 1 and entry != 6:
        for listItem in dictionaryEntries[entry]:
            cleanedString = aggregateString(4,listItem)
            print(cleanedString)
            print(entry)
    if entry == 1 or entry == 6:
        print(entry)

1
6
 ? Overzicht gusi
8
 ? Overzicht factuuradres liable entity
8
Opslag, afroep en artikel verbruik-gegevens
10
Algemene buffer-compare-procedures
10
Class Library voor Relatie  (copy from relatie.cls)
10
Toevoegen/verwijderen gbin entries  met gbin.InstellingCd = Properties:U AND
10
Toevoegen/verwijderen cpcd en rep entries met Cpcd.VeldNm = soortrapportcd
10
- Toevoeging waarde NL18 aan lijst met mogelijke waardes voor
10
Toevoegen/verwijderen cpcd entries  met Cpcd.VeldNm = soortrapportcd
10
Toevoegen/verwijderen xmlt entries  met contextid begins HML
10
Met dit programma worden orders vrijgegeven.
10
Tonen van communicatie relatie record met dezelfde
10
Browser gebaseerd op Onderhouden prijsregel
10
Browser voor Order naar (orn)
10
Toevoegen/verwijderen gbin entries  met gbin.ObjectCd     = P5552C0-BRW-KLANT-AUDIT
10
Toevoegen/verwijderen gbin entries  met ObjectCd     = P6871C0-BRW-POD-DVSOPT
10
Toevoegen/verwijderen vrtl entries  met vrtl.vertalingTx = Controle nesting:U
10
Toev

As you can see, there is an issue:
The entries of 1 and 6 are being completely skipped now. That is due to the nature of the for loop. 
In order to ensure it works, the dataset will be split in two:
- the default
- the anomalies

This should allow the correct methods to be called. 

In [29]:
regularsToExtract = [6,8,10,14,18,22]
anomaliesToExtract = [1]

#### Insert the data into an object per entry

The data is now mostly clean, except for a few empty fields but we can now insert it into an object. 

In [30]:
class Source:
    def __init__(self, _id, _fileName, _createdBy, _createdDate, _description, _redundant):
        self.id = _id
        self.fileName = _fileName
        self.createdBy = _createdBy
        self.createdDate = _createdDate
        self.description = _description
        self.redundant = _redundant

In [31]:
anomaliesDS = {key:dictionaryEntries[key] for key in anomaliesToExtract}
regularDS = {key:dictionaryEntries[key] for key in regularsToExtract}
cleanedDataset = []
for key in regularDS: 
    for entry in regularDS[key]:
        source = Source(entry[0],entry[1],entry[2],entry[3], aggregateString(4, entry), entry[len(entry)-1])
        cleanedDataset.append(source)

In [147]:
print(cleanedDataset[50].description)
print(cleanedDataset[50].fileName)
print(cleanedDataset[50].redundant)
print(cleanedDataset[50].createdDate)
print(cleanedDataset[50].createdBy)

DSO for Company (bed)
cobis\dso\company.cls
 no

 ? 
PEN


We have been succesful in concatenating the strings in the previous arrays. It is probably optimisable but for now, this works. Implement this so it is a part of the list processing. 

### tabellen.d

In [33]:
print(headersFiles["tabellen.d"].getHeadersLength())

4


In [34]:
print(headersFiles["tabellen.d"].headers)

['Tabnummerr', 'tabnaam', 'omschrijving', 'databasenummer']


In [35]:
tabellenDS = []
with open(headersFiles["tabellen.d"].location) as temp:
    for entry in temp:
        tempArray = entry.split('"')
        tabellenDS.append(tempArray)

In [36]:
anomalyCheckTabellen = []
for entry in tabellenDS:
    anomalyCheckTabellen.append(len(entry))
    if len(entry) != 4:
        print(entry)

['Tabnummerr\ttabnaam\tomschrijving\tdatabasenummer\n']
['1 ', 'attb', ' ', 'Attribuut', ' 3\n']
['2 ', 'attd', ' ', 'Attribuut definitie', ' 3\n']
['3 ', 'attg', ' ', 'Attribuutgroep', ' 3\n']
['4 ', 'attw', ' ', 'Attribuutwaarde', ' 3\n']
['5 ', 'fgp', ' ', 'Functional group', ' 2\n']
['6 ', 'oco', ' ', 'opco', ' 2\n']
['7 ', 'opkf', ' ', 'Openstaandeposten klantfaktuur', ' 3\n']
['9 ', 'reprrl', ' ', 'Rapporten per relatie', ' 3\n']
['12 ', 'd_fnk', ' ', 'Functie', ' 1\n']
['13 ', 'sfnc', ' ', 'Functie', ' 1\n']
['14 ', '_file', ' ? 3\n']
['15 ', 'wet', ' ', 'Type werkzaamheden', ' 2\n']
['16 ', 'emt', ' ', 'Email templates', ' 2\n']
['17 ', 'bed', ' ', 'Bedrijfsgegeven', ' 3\n']
['18 ', 'rel', ' ', 'Relatie', ' 3\n']
['19 ', 'ofr', ' ', 'Offerteregel', ' 3\n']
['20 ', 'ofk', ' ', 'Offertekop', ' 3\n']
['21 ', 'pra', ' ', 'Prijsafspraak', ' 3\n']
['22 ', 'sta', ' ', 'Offerteregel status', ' 3\n']
['23 ', 'lvc', ' ', 'Leveringsconditie', ' 3\n']
['24 ', 'kwa', ' ', 'Kwaliteit', ' 3\n

In [37]:
uniqueEntriesTabellen = set(anomalyCheckTabellen)
print(uniqueEntriesTabellen)

{1, 3, 5}


The amount 5 is explainable. An empty string is being mistaken for an entry. Can be changed. Only the entry with a length of 3 is an issue. 


In [38]:
for entry in tabellenDS:
    if len(entry) == 3:
        entry.insert(2,"empty")
print(tabellenDS)



In [39]:
for entry in tabellenDS:
    if len(entry)==4:
        print(entry)

['14 ', '_file', 'empty', ' ? 3\n']
['343 ', 'emp', 'empty', ' ? 2\n']
['427 ', '_connect', 'empty', ' ? 1\n']
['444 ', '_field', 'empty', ' ? 1\n']
['516 ', 'imag', 'empty', ' ? 3\n']
['588 ', 'parent', 'empty', ' ? 3\n']
['603 ', 'prob', 'empty', ' ? 3\n']
['627 ', 'sic', 'empty', ' ? 3\n']
['669 ', 'wobt', 'empty', ' ? 3\n']
['917 ', '_index', 'empty', ' ? 3\n']
['918 ', '_index-field', 'empty', ' ? 3\n']
['949 ', '_sequence', 'empty', ' ? 8\n']
['951 ', '_aud-audit-policy', 'empty', ' ? 3\n']
['952 ', '_aud-file-policy', 'empty', ' ? 3\n']
['956 ', '_file-trig', 'empty', ' ? 8\n']
['996 ', '_aud-audit-data', 'empty', ' ? 3\n']
['997 ', '_aud-event', 'empty', ' ? 3\n']
['1150 ', '_user', 'empty', ' ? 1\n']
['1340 ', '_db', 'empty', ' ? 8\n']
['1610 ', '_aud-audit-data-value', 'empty', ' ? 3\n']
['2864 ', 'tomlink', 'empty', ' ? 2\n']
['2889 ', 'task', 'empty', ' ? 3\n']
['2990 ', 'p_olo', 'empty', ' ? 3\n']
['3009 ', '_indexstat', 'empty', ' ? 3\n']
['3010 ', '_userindexstat', 'empt

In [40]:
for entry in tabellenDS:
    if len(entry) ==5:
        print(entry)

['1 ', 'attb', ' ', 'Attribuut', ' 3\n']
['2 ', 'attd', ' ', 'Attribuut definitie', ' 3\n']
['3 ', 'attg', ' ', 'Attribuutgroep', ' 3\n']
['4 ', 'attw', ' ', 'Attribuutwaarde', ' 3\n']
['5 ', 'fgp', ' ', 'Functional group', ' 2\n']
['6 ', 'oco', ' ', 'opco', ' 2\n']
['7 ', 'opkf', ' ', 'Openstaandeposten klantfaktuur', ' 3\n']
['9 ', 'reprrl', ' ', 'Rapporten per relatie', ' 3\n']
['12 ', 'd_fnk', ' ', 'Functie', ' 1\n']
['13 ', 'sfnc', ' ', 'Functie', ' 1\n']
['15 ', 'wet', ' ', 'Type werkzaamheden', ' 2\n']
['16 ', 'emt', ' ', 'Email templates', ' 2\n']
['17 ', 'bed', ' ', 'Bedrijfsgegeven', ' 3\n']
['18 ', 'rel', ' ', 'Relatie', ' 3\n']
['19 ', 'ofr', ' ', 'Offerteregel', ' 3\n']
['20 ', 'ofk', ' ', 'Offertekop', ' 3\n']
['21 ', 'pra', ' ', 'Prijsafspraak', ' 3\n']
['22 ', 'sta', ' ', 'Offerteregel status', ' 3\n']
['23 ', 'lvc', ' ', 'Leveringsconditie', ' 3\n']
['24 ', 'kwa', ' ', 'Kwaliteit', ' 3\n']
['25 ', 'cof', ' ', 'Combinatie offertekop', ' 3\n']
['26 ', 'orm', ' ', 'Offert

Index 2 in the array is empty. We can just pop it for every entry that has 5 items.  

In [41]:
for entry in tabellenDS:
    if len(entry) ==5:
        entry.pop(2)     

In [42]:
for entry in tabellenDS:
    if len(entry) ==5:
        print(entry)

In [43]:
anomalyCheckTabellen = []
for entry in tabellenDS:
    anomalyCheckTabellen.append(len(entry))
    if len(entry) != 4:
        print(entry)

['Tabnummerr\ttabnaam\tomschrijving\tdatabasenummer\n']
['.\n']
['PSC\n']
['filename=tab\n']
['records=0000000001909\n']
['ldbname=ref\n']
['timestamp=2025/10/28-13:29:49\n']
['numformat=46,44\n']
['dateformat=dmy-1970\n']
['map=NO-MAP\n']
['cpstream=ISO8859-1\n']
['.\n']
['0000051287\n']


In [44]:
checkAnomalyTable2 = set(anomalyCheckTabellen)
print(checkAnomalyTable2)

{1, 4}


### Tabellen per Source

In [76]:
print(len(headersFiles["tabellen-per-source.d"].headers))

7


In [77]:
print(headersFiles["tabellen-per-source.d"].headers)

['sourcenummer', 'tabelnummer', 'Create', 'Delete', 'Read', 'Update', 'Sourcenummer(redacted)']


In [78]:
tpsDS = []
with open(headersFiles["tabellen-per-source.d"].location) as temp:
    for entry in temp:
        tempArray = entry.split('"')
        tpsDS.append(tempArray)

In [79]:
print(tpsDS)

[['sourcenummer\ttabelnummer\tCreate\tDelete\tRead\tUpdate\tSourcenummer(redacted)\n'], ['651 334 no no yes no 0\n'], ['651 323 no no yes no 0\n'], ['651 6 no no yes no 0\n'], ['651 317 no no yes no 0\n'], ['651 333 no no yes no 0\n'], ['651 241 no no yes no 0\n'], ['651 332 no no yes no 0\n'], ['651 338 yes no yes no 0\n'], ['652 339 yes no yes yes 0\n'], ['652 241 no no yes no 0\n'], ['652 333 no no yes no 0\n'], ['652 332 no no yes no 0\n'], ['652 338 yes no yes no 0\n'], ['652 334 no no yes no 0\n'], ['652 323 no no yes no 0\n'], ['652 6 no no yes no 0\n'], ['652 317 no no yes no 0\n'], ['653 334 no no yes no 0\n'], ['653 323 no no yes no 0\n'], ['653 6 no no yes no 0\n'], ['653 317 no no yes no 0\n'], ['653 333 no no yes no 0\n'], ['653 241 no no yes no 0\n'], ['653 338 yes no yes no 0\n'], ['654 334 no no yes no 0\n'], ['654 323 no no yes no 0\n'], ['654 6 no no yes no 0\n'], ['654 317 no no yes no 0\n'], ['654 108 no no yes no 0\n'], ['654 333 no no yes no 0\n'], ['654 241 no no

The previous approach does not yield the desired result. Inspection of the dataset is required for an appropriate solution. 

In [80]:
print(tpsDS[1])

['651 334 no no yes no 0\n']


In [81]:
tpsDSTest = []
with open(headersFiles["tabellen-per-source.d"].location) as temp:
    for entry in temp:
        tpsDSTest.append(entry)
print(tpsDSTest[33])

655 108 no no yes no 0



The entries are now split with spaces. 

In [82]:
tpsDS = []
with open(headersFiles["tabellen-per-source.d"].location) as temp:
    for entry in temp:
        tempArray = entry.split()
        tpsDS.append(tempArray)

In [83]:
print(tpsDS[2])

['651', '323', 'no', 'no', 'yes', 'no', '0']


In [84]:
anomalyCheckTpsDS = []
for entry in tpsDS:
    anomalyCheckTpsDS.append(len(entry))

In [85]:
uniqueEntriesTPS = set(anomalyCheckTpsDS)
print(uniqueEntriesTPS)

{1, 7}


### Velden

In [99]:
print(len(headersFiles["velden.d"].headers))

3


In [116]:
tempDS = []
with open(headersFiles["velden.d"].location) as temp:
    for entry in temp:
        tempArray = entry.split('"')
        tempDS.append(entry)


In [117]:
print(tempDS)

['tabelnummer\tveldnummer\tveldnaam\n', '12 1 "FNKnr"\n', '12 2 "FKTkd"\n', '13 3 "sfnckd"\n', '12 4 "FNKkd"\n', '13 5 "sobjkd"\n', '12 6 "FNKnms"\n', '14 7 "_File-Name"\n', '14 8 "_Template"\n', '14 9 "_File-Label"\n', '15 14 "wetnr"\n', '15 15 "wetom"\n', '15 16 "wetlb"\n', '15 17 "wetcdke"\n', '15 18 "wetlgac"\n', '15 19 "wetlgpl"\n', '15 20 "wetnrvgpl"\n', '16 21 "emtnr"\n', '16 23 "emtemfrm"\n', '16 24 "emttxon"\n', '18 25 "relnr"\n', '17 26 "bednr"\n', '18 27 "relnm"\n', '19 28 "ofknr"\n', '19 29 "stanr"\n', '19 30 "ofrnr"\n', '19 31 "kadnr"\n', '19 32 "prmkddp"\n', '19 33 "ofrhv"\n', '19 34 "ofrhvog"\n', '19 35 "ofrhvbg"\n', '19 36 "ofrbvof"\n', '19 37 "valkd"\n', '20 38 "kadnr"\n', '20 39 "ofklgsr"\n', '19 40 "ofrbbkemc"\n', '19 41 "ofrbbkems"\n', '21 42 "prapc"\n', '22 43 "staom"\n', '23 44 "lvcom"\n', '24 45 "kwalb"\n', '24 46 "kwaom"\n', '25 47 "cofhvmt"\n', '25 48 "cofhvdl"\n', '25 49 "cofhvkm"\n', '25 50 "cofhvkl"\n', '26 51 "ormhvml"\n', '26 52 "ipkkd"\n', '26 53 "prmkdvz

In [118]:
print(tempDS[2])

12 2 "FKTkd"



In [119]:
veldenDS = []
for entry in tempDS:
    tempVeld = entry.split()
    veldenDS.append(tempVeld)

In [120]:
print(veldenTestDS)

[['12', '1', '"FNKnr"'], ['12', '2', '"FKTkd"'], ['13', '3', '"sfnckd"'], ['12', '4', '"FNKkd"'], ['13', '5', '"sobjkd"'], ['12', '6', '"FNKnms"'], ['14', '7', '"_File-Name"'], ['14', '8', '"_Template"'], ['14', '9', '"_File-Label"'], ['15', '14', '"wetnr"'], ['15', '15', '"wetom"'], ['15', '16', '"wetlb"'], ['15', '17', '"wetcdke"'], ['15', '18', '"wetlgac"'], ['15', '19', '"wetlgpl"'], ['15', '20', '"wetnrvgpl"'], ['16', '21', '"emtnr"'], ['16', '23', '"emtemfrm"'], ['16', '24', '"emttxon"'], ['18', '25', '"relnr"'], ['17', '26', '"bednr"'], ['18', '27', '"relnm"'], ['19', '28', '"ofknr"'], ['19', '29', '"stanr"'], ['19', '30', '"ofrnr"'], ['19', '31', '"kadnr"'], ['19', '32', '"prmkddp"'], ['19', '33', '"ofrhv"'], ['19', '34', '"ofrhvog"'], ['19', '35', '"ofrhvbg"'], ['19', '36', '"ofrbvof"'], ['19', '37', '"valkd"'], ['20', '38', '"kadnr"'], ['20', '39', '"ofklgsr"'], ['19', '40', '"ofrbbkemc"'], ['19', '41', '"ofrbbkems"'], ['21', '42', '"prapc"'], ['22', '43', '"staom"'], ['23', 

In [122]:
anomalyVeldenDS = []
for entry in veldenDS:
    anomalyVeldenDS.append(len(entry))    

In [123]:
uniqueEntriesVelden = set(anomalyVeldenDS)
print(uniqueEntriesVelden)

{1, 3}


### Velden per Source

In [124]:
print(headersFiles["velden-per-source.d"].headers)

['sourcenummer', 'veldnummer', 'datamutatie', 'sourcenummer(redundant)']


In [125]:
print(len(headersFiles["velden-per-source.d"].headers))

4


In [140]:
vpsDS = []

with open(headersFiles["velden-per-source.d"].location) as temp:
    for entry in temp:
        tempArray = entry.split()
        vpsDS.append(tempArray)

In [141]:
print(vpsDS[1])

['651', '2678', 'no', '0']


In [142]:
anomalyCheckVPS = []
for entry in vpsDS:
    anomalyCheckVPS.append(len(entry))

In [143]:
uniqueEntries = set(anomalyCheckVPS)
print(uniqueEntries)

{1, 4}
