# 20-CLEAN

In [None]:
from lxml import etree
import yaml
from datetime import datetime

In [None]:
# Load XML

# Path of the extract to clean. The two lines below point at different
# extracts (per the folder names: West Berkshire and Southampton); exactly one
# of them should be active at a time.
#file = 'C:/Users/celine.gross/Social Finance Ltd/CS Front Door Data Collaboration - Documents/06. Sensitive data (restricted)/West Berkshire/input/cin/CIN Final Extract 31032018.xml'
file = 'C:/Users/celine.gross/Social Finance Ltd/CS Front Door Data Collaboration - Documents/06. Sensitive data (restricted)/Southampton/input/cin/CINReturn1718.xml'

# Parse XML
# `tree` is kept because the cleaned document is written back out through it;
# `root` is the element the Child search starts from.
tree = etree.parse(file) 
root = tree.getroot()

In [None]:
# Define namespace

# Prefix -> namespace mapping handed to root.findall() when the Child elements
# are looked up; the None key stands for the default (un-prefixed) namespace.
# NOTE(review): presumably the commented-out line is for extracts that declare
# the cbds default namespace and {None: None} is for extracts that declare
# none -- confirm against each input file before switching.
#NS = {None: 'http://www.dcsf.gov.uk/schemas/cbds'}
NS = {None: None}

In [None]:
# Load the CIN config file

# `cinmap` is the mapping passed to clean.child(); the cleaners index it by
# XML tag name and read the 'category' / 'date' entries found under each tag.
filename = 'cin_datamap.yaml'
with open(filename) as FILE:
    # NOTE(review): yaml.load with FullLoader is only appropriate for a trusted,
    # locally maintained file -- confirm cin_datamap.yaml is never user-supplied.
    cinmap = yaml.load(FILE, Loader=yaml.FullLoader)

## Function definitions

In [None]:
# Generic cleaner functions

def to_category(string, categories):
    """Map a raw value onto one of the configured category codes.

    Matching is case-insensitive and done in two passes:

    1. an exact match between ``string`` and any entry's ``'code'``;
    2. failing that, the first entry (in list order) whose ``'code'`` -- or,
       when present, whose ``'name'`` -- occurs somewhere inside ``string``.

    The exact pass runs over *all* entries first so that a short code can
    never shadow a longer one that contains it (e.g. ``'1A'`` vs ``'11A'``).

    Args:
        string: The raw value to classify (converted with ``str``).
        categories: Iterable of dicts with a ``'code'`` key and an optional
            ``'name'`` key.

    Returns:
        The matching entry's ``'code'`` value exactly as stored in
        ``categories``, or ``'Not in proper format: <string>'`` if nothing
        matches.
    """
    categories = list(categories)  # iterated twice below
    needle = str(string).lower()

    # Pass 1: an exact code match always wins, whatever its position.
    for code in categories:
        if needle == str(code['code']).lower():
            return code['code']

    # Pass 2: fall back to "code or name appears somewhere in the value".
    for code in categories:
        if str(code['code']).lower() in needle:
            return code['code']
        if 'name' in code and str(code['name']).lower() in needle:
            return code['code']

    # TODO: record the failed match in the error report.
    return 'Not in proper format: {}'.format(string)

def to_date(string, dateformat):
    """Check that ``string`` parses as a date in ``dateformat``.

    The value is only validated, not reformatted.

    Args:
        string: The raw date text.
        dateformat: A ``datetime.strptime`` format string.

    Returns:
        ``string`` unchanged when it parses, otherwise
        ``'Not in proper format: <string>'``.
    """
    try:
        datetime.strptime(string, dateformat)  # Check this is possible
    except (ValueError, TypeError):
        # ValueError: text does not match the format.
        # TypeError: a non-string value (e.g. None) was passed in.
        # TODO: record the failure in the error report.
        return 'Not in proper format: {}'.format(string)
    return string
    
def to_integer(string):
    """Check that ``string`` can be read as an integer.

    The value is only validated, not converted.

    Args:
        string: The raw text to check.

    Returns:
        ``string`` unchanged when ``int(string)`` succeeds, otherwise
        ``'Not in proper format: <string>'``.
    """
    try:
        int(string)  # Check this is possible
    except (ValueError, TypeError):
        # ValueError: not an integer literal; TypeError: e.g. None.
        # TODO: record the failure in the error report.
        return 'Not in proper format: {}'.format(string)
    # Always hand a value back: callers assign the result to element text.
    return string
        
        
# Cleaner functions depending on XML tag

class clean:
    """Static cleaners for the CIN XML, one per tag.

    Container cleaners (``child``, ``childidentifiers``, ``cindetails``, ...)
    walk the direct children of an element and pass every recognised tag to
    the cleaner of the same lower-cased name, together with that tag's entry
    in the config mapping. Leaf cleaners normalise ``element.text`` in place;
    an element without any text is removed from its parent (except
    ``InitialCPCtarget``, which may stay blank).

    Tags are compared on their exact local name (namespace stripped), so
    ``FormerUPN`` is never mistaken for ``UPN`` and
    ``ExpectedPersonBirthDate`` is never mistaken for ``PersonBirthDate``.

    Every cleaner mutates the tree in place and returns the element it was
    given.
    """

    # ------------------------------------------------------------------
    # Shared private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _local_name(element):
        """Return the tag of ``element`` without its ``{namespace}`` prefix.

        Returns ``''`` when the tag is not a string (lxml reports comments
        and processing instructions that way) so such nodes are skipped.
        """
        tag = element.tag
        if not isinstance(tag, str):
            return ''
        return tag.rsplit('}', 1)[-1]

    @staticmethod
    def _dispatch(value, config, handlers):
        """Run the matching cleaner on each direct child of ``value``.

        ``handlers`` maps a local tag name to ``(cleaner, uses_config)``.
        ``config[tag]`` is only looked up when a child with that tag is
        actually present, and only for cleaners that take a config.
        """
        # Iterate over a snapshot: leaf cleaners may detach the child.
        for group in list(value):
            name = clean._local_name(group)
            if name not in handlers:
                continue  # TODO: flag unexpected tags in the error report
            cleaner, uses_config = handlers[name]
            if uses_config:
                cleaner(group, config[name])
            else:
                cleaner(group)
        return value

    @staticmethod
    def _remove_if_blank(value):
        """Detach ``value`` from its parent if it has no text.

        Returns True when the element was blank, False otherwise.
        """
        if value.text is not None:
            return False
        parent = value.getparent()
        if parent is not None:  # nothing to detach from otherwise
            parent.remove(value)
        return True

    @staticmethod
    def _clean_text(value, case=None):
        """Strip the text of ``value``; optionally apply a str case method."""
        if not clean._remove_if_blank(value):
            text = value.text.strip()
            if case is not None:
                text = getattr(text, case)()
            value.text = text
        return value

    @staticmethod
    def _clean_category(value, config, case='upper'):
        """Strip/re-case the text, then map it via ``config['category']``."""
        if not clean._remove_if_blank(value):
            text = value.text.strip()
            if case is not None:
                text = getattr(text, case)()
            # str(): element text must be a string, while a configured code
            # may have been loaded as another type (to_category compares
            # codes through str() for the same reason).
            value.text = str(to_category(text, config['category']))
        return value

    @staticmethod
    def _clean_date(value, config):
        """Strip the text, then validate it against ``config['date']``."""
        if not clean._remove_if_blank(value):
            value.text = to_date(value.text.strip(), config['date'])
        return value

    # ------------------------------------------------------------------
    # Child
    # ------------------------------------------------------------------
    @staticmethod
    def child(value, config):
        """Clean the three top-level groups of a Child element."""
        return clean._dispatch(value, config, {
            'ChildIdentifiers': (clean.childidentifiers, True),
            'ChildCharacteristics': (clean.childcharacteristics, True),
            'CINdetails': (clean.cindetails, True),
        })

    # ------------------------------------------------------------------
    # Child Identifiers functions
    # ------------------------------------------------------------------
    @staticmethod
    def childidentifiers(value, config):
        """Clean every known field inside ChildIdentifiers."""
        return clean._dispatch(value, config, {
            'LAchildID': (clean.lachildid, False),
            'UPN': (clean.upn, False),
            'FormerUPN': (clean.formerupn, False),
            'UPNunknown': (clean.upnunknown, True),
            'PersonBirthDate': (clean.personbirthdate, True),
            'ExpectedPersonBirthDate': (clean.expectedpersonbirthdate, True),
            'GenderCurrent': (clean.gendercurrent, True),
            'PersonDeathDate': (clean.persondeathdate, True),
        })

    @staticmethod
    def lachildid(value, config=None):
        """Strip surrounding whitespace; ``config`` is unused."""
        # If time, add config and test that len<=10 and type = alphanumeric
        return clean._clean_text(value)

    @staticmethod
    def upn(value, config=None):
        """Strip and upper-case; ``config`` is unused."""
        # If time, add config and test that len==13 and regex follows pattern
        return clean._clean_text(value, case='upper')

    @staticmethod
    def formerupn(value, config=None):
        """Strip and upper-case; ``config`` is unused."""
        # If time, add config and test that len==13 and regex follows pattern
        return clean._clean_text(value, case='upper')

    @staticmethod
    def upnunknown(value, config):
        """Upper-case, then map onto ``config['category']``."""
        return clean._clean_category(value, config)

    @staticmethod
    def personbirthdate(value, config):
        """Validate against ``config['date']``."""
        return clean._clean_date(value, config)

    @staticmethod
    def expectedpersonbirthdate(value, config):
        """Validate against ``config['date']``."""
        # If time, add logical test to check there is just one birth date
        return clean._clean_date(value, config)

    @staticmethod
    def gendercurrent(value, config):
        """Map onto ``config['category']`` without changing case."""
        return clean._clean_category(value, config, case=None)

    @staticmethod
    def persondeathdate(value, config):
        """Validate against ``config['date']``."""
        return clean._clean_date(value, config)

    # ------------------------------------------------------------------
    # Child Characteristics functions
    # ------------------------------------------------------------------
    @staticmethod
    def childcharacteristics(value, config):
        """Clean Ethnicity and Disabilities inside ChildCharacteristics."""
        return clean._dispatch(value, config, {
            'Ethnicity': (clean.ethnicity, True),
            'Disabilities': (clean.disabilities, True),
        })

    @staticmethod
    def ethnicity(value, config):
        """Upper-case, then map onto ``config['category']``."""
        return clean._clean_category(value, config)

    @staticmethod
    def disabilities(value, config):
        """Clean each Disability entry; other children are left untouched."""
        return clean._dispatch(value, config, {
            'Disability': (clean.disability, True),
        })

    @staticmethod
    def disability(value, config):
        """Upper-case, then map onto ``config['category']``."""
        return clean._clean_category(value, config)

    # ------------------------------------------------------------------
    # CIN Details functions
    # ------------------------------------------------------------------
    @staticmethod
    def cindetails(value, config):
        """Clean every known field and sub-group inside CINdetails."""
        return clean._dispatch(value, config, {
            'CINreferralDate': (clean.cinreferraldate, True),
            'ReferralSource': (clean.referralsource, True),
            'PrimaryNeedCode': (clean.primaryneedcode, True),
            'CINclosureDate': (clean.cinclosuredate, True),
            'ReasonForClosure': (clean.reasonforclosure, True),
            'ReferralNFA': (clean.referralnfa, True),
            'DateOfInitialCPC': (clean.dateofinitialcpc, True),
            'Assessments': (clean.assessments, True),
            'Section47': (clean.section47, True),
            'ChildProtectionPlans': (clean.childprotectionplans, True),
        })

    @staticmethod
    def cinreferraldate(value, config):
        """Validate against ``config['date']``."""
        return clean._clean_date(value, config)

    @staticmethod
    def referralsource(value, config):
        """Upper-case, then map onto ``config['category']``."""
        return clean._clean_category(value, config)

    @staticmethod
    def primaryneedcode(value, config):
        """Upper-case, then map onto ``config['category']``."""
        return clean._clean_category(value, config)

    @staticmethod
    def cinclosuredate(value, config):
        """Validate against ``config['date']``."""
        return clean._clean_date(value, config)

    @staticmethod
    def reasonforclosure(value, config):
        """Upper-case, then map onto ``config['category']``."""
        return clean._clean_category(value, config)

    @staticmethod
    def referralnfa(value, config):
        """Capitalise, then map onto ``config['category']``."""
        return clean._clean_category(value, config, case='capitalize')

    @staticmethod
    def dateofinitialcpc(value, config):
        """Validate against ``config['date']``.

        Used for DateOfInitialCPC both directly under CINdetails and inside
        Section47.
        """
        return clean._clean_date(value, config)

    # Assessments
    @staticmethod
    def assessments(value, config):
        """Clean the dates and factor list inside Assessments."""
        return clean._dispatch(value, config, {
            'AssessmentActualStartDate':
                (clean.assessmentactualstartdate, True),
            'AssessmentInternalReviewDate':
                (clean.assessmentinternalreviewdate, True),
            'AssessmentAuthorisationDate':
                (clean.assessmentauthorisationdate, True),
            'FactorsIdentifiedAtAssessment':
                (clean.factorsidentifiedatassessment, True),
        })

    @staticmethod
    def assessmentactualstartdate(value, config):
        """Validate against ``config['date']``."""
        return clean._clean_date(value, config)

    @staticmethod
    def assessmentinternalreviewdate(value, config):
        """Validate against ``config['date']``."""
        return clean._clean_date(value, config)

    @staticmethod
    def assessmentauthorisationdate(value, config):
        """Validate against ``config['date']``."""
        return clean._clean_date(value, config)

    @staticmethod
    def factorsidentifiedatassessment(value, config):
        """Clean each AssessmentFactors entry; others are left untouched."""
        return clean._dispatch(value, config, {
            'AssessmentFactors': (clean.assessmentfactors, True),
        })

    @staticmethod
    def assessmentfactors(value, config):
        """Upper-case, then map onto ``config['category']``."""
        return clean._clean_category(value, config)

    # Section 47
    @staticmethod
    def section47(value, config):
        """Clean every known field inside Section47."""
        return clean._dispatch(value, config, {
            'S47ActualStartDate': (clean.s47actualstartdate, True),
            'InitialCPCtarget': (clean.initialcpctarget, True),
            'DateOfInitialCPC': (clean.dateofinitialcpc, True),
            'ICPCnotRequired': (clean.icpcnotrequired, True),
        })

    @staticmethod
    def s47actualstartdate(value, config):
        """Validate against ``config['date']``."""
        return clean._clean_date(value, config)

    @staticmethod
    def initialcpctarget(value, config):
        """Validate against ``config['date']``; a blank element is kept."""
        # if time, automate the reading of 'canbeblank'
        if value.text is not None:
            value.text = to_date(value.text.strip(), config['date'])
        return value

    @staticmethod
    def icpcnotrequired(value, config):
        """Capitalise, then map onto ``config['category']``."""
        return clean._clean_category(value, config, case='capitalize')

    # Child protection plans
    @staticmethod
    def childprotectionplans(value, config):
        """Clean every known field inside ChildProtectionPlans."""
        return clean._dispatch(value, config, {
            'CPPstartDate': (clean.cppstartdate, True),
            'InitialCategoryOfAbuse': (clean.initialcategoryofabuse, True),
            'LatestCategoryOfAbuse': (clean.latestcategoryofabuse, True),
            'NumberOfPreviousCPP': (clean.numberofpreviouscpp, False),
            'CPPendDate': (clean.cppenddate, True),
            'Reviews': (clean.reviews, True),
        })

    @staticmethod
    def cppstartdate(value, config):
        """Validate against ``config['date']``."""
        return clean._clean_date(value, config)

    @staticmethod
    def initialcategoryofabuse(value, config):
        """Upper-case, then map onto ``config['category']``."""
        return clean._clean_category(value, config)

    @staticmethod
    def latestcategoryofabuse(value, config):
        """Upper-case, then map onto ``config['category']``."""
        return clean._clean_category(value, config)

    @staticmethod
    def numberofpreviouscpp(value, config=None):
        """Strip, then validate as an integer; ``config`` is unused."""
        if not clean._remove_if_blank(value):
            text = value.text.strip()
            checked = to_integer(text)
            # Never blank the element out: keep the stripped text when the
            # validator gives nothing back.
            value.text = text if checked is None else checked
        return value

    @staticmethod
    def cppenddate(value, config):
        """Validate against ``config['date']``."""
        return clean._clean_date(value, config)

    @staticmethod
    def reviews(value, config):
        """Clean each CPPreviewDate entry; others are left untouched."""
        return clean._dispatch(value, config, {
            'CPPreviewDate': (clean.cppreviewdate, True),
        })

    @staticmethod
    def cppreviewdate(value, config):
        """Validate against ``config['date']``."""
        return clean._clean_date(value, config)

## Run programme

In [None]:
# Run cleaner: clean every Child element in place, then save the result.

children = root.findall('.//Child', NS)

for child in children:
    # clean.child edits the element in place and returns that same element,
    # so its return value does not need to be kept.
    clean.child(child, cinmap)

tree.write('cintest.xml')

In [None]:
# Planned contents of the error report (not implemented yet):
# - elements that are empty (NoneType) where a value is required
# - values that could not be matched to a category
# - values that could not be matched to the expected datatype (int, date)
# - tags that are not opened and closed properly