In [26]:
import pandas as pd
import difflib as dl

# Assembly and cleaning
First we assemble all the data, and in some cases we have to reformat some of the sequences. when all is done we clean the data by making everything lowercase

In [27]:
def cleaning(df,file):
    if file == 'František': # František has a different format
        df["sequence"] = df['sequence'].str.replace("x", "t")
        df['sequence'] = df['sequence'].str.replace("o", "h")
        df['sequence'] = df['sequence'].str.replace("f", "x")
    if file == 'Kaleem': # Kaleem has a different format
        df['sequence'] = df['sequence'].str.replace("y", "x")
    df['sequence'] = df['sequence'].str.lower()
    return(df)

In [52]:
files = ['Amir','DavidV',"František",'Kaleem','Pierre', 'DavidKL']
df = pd.DataFrame()
for file in files:
    f = pd.read_csv(f'Audit_data/{file}.csv', header = 0)
    f = cleaning(f,file)
    df = pd.concat([df,f], axis = 0, ignore_index = True)

# Getting original data
Now we need to get the original data form the tosser's to compare

In [53]:
bachelor_f = pd.read_csv('bachelor_data.csv', header = 0)
marathon_f = pd.read_csv('marathon_data.csv', header = 0)
bachelor = pd.DataFrame(bachelor_f)
marathon = pd.DataFrame(marathon_f)
reported = []
start = []
for _,i in df.iterrows():
    if i['person'] == 'kaleemU' and i['sequence_id'] == 25:
        reported.append(bachelor.loc[((bachelor['sequence_id'] == i['sequence_id']) & (bachelor['person'] == 'irmaT'))]['sequence'].values[0])
        start.append(bachelor.loc[((bachelor['sequence_id'] == i['sequence_id']) & (bachelor['person'] == 'irmaT'))]['start'].values[0])
    elif i['group'] == 'bachelor':    
        reported.append(bachelor.loc[((bachelor['sequence_id'] == i['sequence_id']) & (bachelor['person'] == i['person']))]['sequence'].values[0])
        start.append(bachelor.loc[((bachelor['sequence_id'] == i['sequence_id']) & (bachelor['person'] == i['person']))]['start'].values[0])
    else:
        reported.append(marathon.loc[((marathon['sequence_id'] == i['sequence_id']) & (marathon['person'] == i['person']))]['sequence'].values[0])
        start.append(marathon.loc[((marathon['sequence_id'] == i['sequence_id']) & (marathon['person'] == i['person']))]['start'].values[0])
df['reported_sequence'] = reported
df['start'] = start


In [30]:
def switcher(audit, report):
    if audit[0] == report[0]:
        return audit # if the first letter is the same, no need to switch
    elif audit[0] != report[0] and audit[0] != 'u':
        audit = audit.replace("h", "V") # replace h with a placeholder
        audit = audit.replace("t", "h") # replace t with h
        audit = audit.replace("V", "t") # replace placeholder with t
        return audit
    elif audit[-1] == report[-1]:
        return audit
    elif audit[-1] != report[-1]:
        audit = audit.replace("h", "V")
        audit = audit.replace("t", "h")
        audit = audit.replace("V", "t")
        return audit
    else:
        return "manually check"


In [54]:
sequences = []
for _,i in df.iterrows():
    sequences.append(switcher(i['sequence'],i['reported_sequence']))
df['corrected_sequence'] = sequences


In [55]:
df['cleaned_sequence'] = df['corrected_sequence'].str.replace("x", "")
df['cleaned_sequence'] = df['cleaned_sequence'].str.replace("u", "")
df['cleaned_reported_sequence'] = df['reported_sequence'].str.replace("x", "").replace("u", "")


# Comparing
Now that we have both reported and audited sequences we can start comparing them

In [56]:
correct = []
for _,i in df.iterrows():
    if i['cleaned_sequence'] == i['cleaned_reported_sequence']:
        correct.append(1)
        print(i['person'], i['sequence_id'])
    else:
        correct.append(0)
df['correct'] = correct
print(sum(df['correct']))

jillR 36
pierreG 8
davidV 52
pierreG 141
kaleemU 20
pierreG 48
pierreG 138
amirS 61
davidV 42
kaleemU 21
davidKL 53
ingeborgR 55
madlenH 3
13


Using this we can see that 13 trails are completely correct, even without considering possible 1-off errors

# Naive approach
Using a naive approach we can see that Pierre's data is extremely accurate with only 2 mistakes. everything else is still unclear, presumably because of misalignment 

In [34]:
def naive_check(a,b):
    errors = []
    succes = []
    for i in range(min(len(a),len(b))):
        if a[i] == b[i] or (a[i] == "_" or b[i] == "_"):
            errors.append(0)
        else:
            errors.append(1)
    return errors

In [57]:
sequences_errors = []
for _,i in df.iterrows():
    sequences_errors.append(sum(naive_check(i['cleaned_sequence'],i['cleaned_reported_sequence'])))	
df['sequence_errors'] = sequences_errors
df['sequence_errors'].head().sort_values(ascending = False)


1    52
4    39
2    34
3     1
0     0
Name: sequence_errors, dtype: int64

In [36]:
import numpy as np
 
def get_minimum_penalty(x:str, y:str, pxy:int, pgap:int):
    """
    Function to find out the minimum penalty
 
    :param x: pattern X
    :param y: pattern Y
    :param pxy: penalty of mis-matching the characters of X and Y
    :param pgap: penalty of a gap between pattern elements
    """
 
    # initializing variables
    i = 0
    j = 0
     
    # pattern lengths
    m = len(x)
    n = len(y)
     
    # table for storing optimal substructure answers
    dp = np.zeros([m+1,n+1], dtype=int) #int dp[m+1][n+1] = {0};
 
    # initialising the table
    dp[0:(m+1),0] = [ i * pgap for i in range(m+1)]
    dp[0,0:(n+1)] = [ i * pgap for i in range(n+1)]
 
    # calculating the minimum penalty
    i = 1
    while i <= m:
        j = 1
        while j <= n:
            if x[i - 1] == y[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = min(dp[i - 1][j - 1] + pxy,
                                dp[i - 1][j] + pgap,
                                dp[i][j - 1] + pgap)
            j += 1
        i += 1
     
    # Reconstructing the solution
    l = n + m   # maximum possible length
    i = m
    j = n
     
    xpos = l
    ypos = l
 
    # Final answers for the respective strings
    xans = np.zeros(l+1, dtype=int)
    yans = np.zeros(l+1, dtype=int)
     
 
    while not (i == 0 or j == 0):
        #print(f"i: {i}, j: {j}")
        if x[i - 1] == y[j - 1]:       
            xans[xpos] = ord(x[i - 1])
            yans[ypos] = ord(y[j - 1])
            xpos -= 1
            ypos -= 1
            i -= 1
            j -= 1
        elif (dp[i - 1][j - 1] + pxy) == dp[i][j]:
         
            xans[xpos] = ord(x[i - 1])
            yans[ypos] = ord(y[j - 1])
            xpos -= 1
            ypos -= 1
            i -= 1
            j -= 1
         
        elif (dp[i - 1][j] + pgap) == dp[i][j]:
            xans[xpos] = ord(x[i - 1])
            yans[ypos] = ord('_')
            xpos -= 1
            ypos -= 1
            i -= 1
         
        elif (dp[i][j - 1] + pgap) == dp[i][j]:       
            xans[xpos] = ord('_')
            yans[ypos] = ord(y[j - 1])
            xpos -= 1
            ypos -= 1
            j -= 1
         
 
    while xpos > 0:
        if i > 0:
            i -= 1
            xans[xpos] = ord(x[i])
            xpos -= 1
        else:
            xans[xpos] = ord('_')
            xpos -= 1
     
    while ypos > 0:
        if j > 0:
            j -= 1
            yans[ypos] = ord(y[j])
            ypos -= 1
        else:
            yans[ypos] = ord('_')
            ypos -= 1
 
    # Since we have assumed the answer to be n+m long,
    # we need to remove the extra gaps in the starting
    # id represents the index from which the arrays
    # xans, yans are useful
    id = 1
    i = l
    while i >= 1:
        if (chr(yans[i]) == '_') and chr(xans[i]) == '_':
            id = i + 1
            break
         
        i -= 1
 
    # Printing the final answer
    # print(f"Minimum Penalty in aligning the genes = {dp[m][n]}")
    # print("The aligned genes are:")   
    # X
    i = id
    x_seq = ""
    while i <= l:
        x_seq += chr(xans[i])
        i += 1
 
    # Y
    i = id
    y_seq = ""
    while i <= l:
        y_seq += chr(yans[i])
        i += 1
    return [x_seq, y_seq]

 

In [58]:

markings = []
errors = []
for _,i in df.iterrows():
    x = get_minimum_penalty(i['cleaned_sequence'],i['cleaned_reported_sequence'],1,3)
    errors.append(naive_check(x[0],x[1]))
    if sum(errors[-1]) > 9:
        markings.append("x")
    else:
        markings.append("") 
df['marked'] = markings
df['errors'] = errors

# df = df.explode('errors')


In [49]:
df['person_factor'] = pd.factorize(df['person'])[0] + 1

In [59]:
last = ""
decoded = []
for _, i in df.iterrows():
    decoded_str = ""
    for index,letter in enumerate(i['cleaned_sequence']):
        if index == 0:
            if letter == df.loc[_]['start']:
                decoded_str = decoded_str + 's'
            else:
                decoded_str = decoded_str +'f'
        else:
            if letter == last:
                decoded_str = decoded_str +'s'
            else:
                decoded_str = decoded_str +'f'
        last = letter
    decoded.append(decoded_str)
df['decoded_sequence'] = decoded

t
t
t
t
h
t
h
h
h
h
t
h
h
h
h
h
h
h
h
h
h
h
t
t
t
h
t
h
t
h
t
h
t
h
t
h
h
h
t
h
t
t
t
t
t
t
h
t
t
t
t
h
h
h
h
h
h
h
t
h


In [60]:
df.explode(['errors', 'decoded_sequence'])	

ValueError: columns must have matching element counts