In [1]:
import os
import pandas as pd
from fuzzywuzzy import fuzz
from pyxdameraulevenshtein import damerau_levenshtein_distance
import regex as re
import sys

In [14]:
def string_cleaning(string):
    clean_string = re.sub(r"\n", "", string)
    clean_string = re.sub(r"nan", "", clean_string)
    clean_string = re.sub(r" ", "", clean_string)
    clean_string = re.sub(r"\r", "", clean_string)
    clean_string = clean_string.lower()
    clean_string = re.sub(r'[^a-zA-zäöüÄÖÜß]', '', clean_string)
    return clean_string

def score_response_damerau(response):
    if response <= 2:
        return 1
    else:
        return 0
    
def score_response_leveinshtein(response):
    if response > 90:
        return 1
    else:
        return 0

In [8]:
test_string = "Daune(n)"

string_cleaning(test_string)

'daunen'

In [19]:
path = "/Users/Richard/PycharmProjects/NeuroLab_3.10/PsychoPy/scoring/Memory/immediate recall/Nature/RU02AG240724_Memory2_IR.csv"
df = pd.read_csv(path)

# Create a copy of the DataFrame slice
df_short = df[["Word1","Word2", "Response.text"]].copy()

# Ensure all values are treated as strings, replacing NaNs with empty strings
df_short["Word2"] = df_short["Word2"].astype(str).fillna("")
df_short = df_short[df_short["Word2"] != "nan"]
df_short["Response.text"] = df_short["Response.text"].astype(str).fillna("")

#remove unwanted characters from result string and make every letter lowercase
df_short["Response.text"] = df_short["Response.text"].apply(string_cleaning)

#make every letter lowercase in the solution string
df_short["Word2"] = df_short["Word2"].str.lower()

#calculate distance based on Leveinstein algorithm
df_short["distance_leveinshtein"] = df_short.apply(lambda row: fuzz.token_sort_ratio(row["Word2"], row["Response.text"]), axis=1)

#calculate distance based on Damerau Levenshtein Distance
df_short["distance_damerau"] = df_short.apply(lambda row: damerau_levenshtein_distance(row["Word2"], row["Response.text"]), axis=1)

#Calculate score of the participant
df_short["scoring_leveinshtein"] = df_short["distance_leveinshtein"].apply(score_response_leveinshtein)
result_leveinshtein = df_short["scoring_leveinshtein"].sum()

df_short["scoring_damerau"] = df_short["distance_damerau"].apply(score_response_damerau)
result_damerau = df_short["scoring_damerau"].sum()

print(f"Result leveinshtein: {result_leveinshtein}")
print(f"Result damerau: {result_damerau}")

df_short[["Word1", "Word2", "Response.text", "distance_leveinshtein","distance_damerau"]].head(50)

Result leveinshtein: 32
Result damerau: 32


Unnamed: 0,Word1,Word2,Response.text,distance_leveinshtein,distance_damerau
1,Wolf,buche,buche,100,0
2,Säugetier,fledermaus,fledermaus,100,0
3,Spore,bulle,bulle,100,0
4,Eule,larve,larve,100,0
5,Kuh,amsel,amsel,100,0
6,Zeder,ente,ente,100,0
7,Mammut,otter,otter,100,0
8,Lurch,ratte,ratte,100,0
9,Sprosse,brombeere,brombeere,100,0
10,Kiwi,reptil,reptil,100,0


# Manually checking programm 

In [76]:
need_to_check = df_short[(df_short["distance"] < 100) & (df_short["distance"] > 80)]
need_to_check

Unnamed: 0,Word1,Word2,Response.text,distance,scoring
9,Knoblauch,Spargel,Spagel,92,0


In [86]:
need_to_check.iloc[0, need_to_check.columns.get_loc('Response.text')]

'Spagel'

In [101]:
need_to_check

Unnamed: 0,Word1,Word2,Response.text,distance,scoring
9,Knoblauch,Spargel,Spagel,92,0


Spargel and Spagel: 1
Buch and BUch: 1
Daune and Daunnen: 2


In [100]:
# integrate this into a loop
if need_to_check.shape[0] > 0:
    print(f"Some words are almost the same. Please check if they are.\n "
          f"Is {need_to_check.iloc[0,need_to_check.columns.get_loc('Word2')]} close enough to {need_to_check.iloc[0, need_to_check.columns.get_loc('Response.text')]}?\n"
          f"if yes, press the y key if no press the n key.")
else:
    print(f"Result: {result}")
    sys.exit()
    
    
response = input("Press y or n to continue: ")

if response == "y":
    # set value to 1 if "y" is pressed
    word2_value = need_to_check.iloc[0, need_to_check.columns.get_loc('Word2')]
    df_short.loc[df_short["Word2"] == word2_value, "scoring"] = 1
    print("\nThanks for rating this row.")
elif response== "n":
    print("\nAll done.")
else:
    print("\nPlease press y or n to continue.")
    

Some words are almost the same. Please check if they are.
 Is Spargel close enough to Spagel?
if yes, press the y key if no press the n key.

Thanks for rating this row.


In [95]:
# Use .loc[] to avoid SettingWithCopyWarning
word2_value = need_to_check.iloc[0, need_to_check.columns.get_loc('Word2')]
df_short.loc[df_short["Word2"] == word2_value, "scoring"] = 1

df_short

Unnamed: 0,Word1,Word2,Response.text,distance,scoring
1,Hafer,Gras,Gras,100,1
2,Säugetier,Fledermaus,Fledermaus,100,1
3,Kolibri,Thunfisch,Thunfisch,100,1
4,Krokodil,Seeigel,Seeigel,100,1
5,Sprosse,Brombeere,Brombeere,100,1
6,Spore,Bulle,,0,0
7,Wolf,Buche,Buche,100,1
8,Languste,Getreide,,0,0
9,Knoblauch,Spargel,Spagel,92,1
10,Lorbeer,Walnuss,Walnuss,100,1


## Comparing fuzzy wuzzy functions

In [54]:
string1 = "Daune"
string2 = "Daunen"

ratio = fuzz.ratio(string1, string2)
partial_ratio = fuzz.partial_ratio(string1, string2)
token_sort_ratio = fuzz.token_sort_ratio(string1, string2)

print(f"Ratio: {ratio}")
print(f"Partial Ratio: {partial_ratio}")
print(f"Token Sort Ratio: {token_sort_ratio}")

Ratio: 91
Partial Ratio: 100
Token Sort Ratio: 91


## This uses the Damerau Levenshtein Distance

In [None]:
correct_words = ["Spargel", "Buch", "Daune"]
typos = ["Spagel", "BUch", "Daunen"]

for correct_word, typo in zip(correct_words, typos):
    distance = damerau_levenshtein_distance(correct_word, typo)
    print(f"{correct_word} and {typo}: {distance}")