In [2]:
from fuzzywuzzy import fuzz, process
from difflib import SequenceMatcher



## Calculate similarity with "SequenceMatcher"

In [2]:
# `isjunk` (the first argument) is a one-argument predicate; elements for which it
# returns True are treated as junk by the matcher. Here only the blank is junk.
s = SequenceMatcher(
    isjunk=lambda ch: ch == " ",
    a="Deep Transcranial Magnetic Stimulation (dTMS) to Induce Smoking Cessation",
    b="Repetitive Transcranial Magnetic Stimulation (rTMS) in the treatment of depression.",
).ratio()
s

0.6538461538461539

In [3]:
# Same pair of titles, but with no junk predicate: every character takes part in matching.
s = SequenceMatcher(
    isjunk=None,
    a="Deep Transcranial Magnetic Stimulation (dTMS) to Induce Smoking Cessation",
    b="Repetitive Transcranial Magnetic Stimulation (rTMS) in the treatment of depression.",
).ratio()
s

0.7051282051282052

In [4]:
# Two short study titles that differ only in the middle word; blanks are junk.
s = SequenceMatcher(
    isjunk=lambda ch: ch == " ",
    a="The PREVAIL Study",
    b="The STEAL Study",
).ratio()
s

0.8125

## Calculate similarity with "Levenshtein distance"

In [5]:
# Plain fuzz.ratio on the two TMS titles; the score is divided by 100 so it reads
# on the same scale as the SequenceMatcher ratios.
s = fuzz.ratio(
    "Deep Transcranial Magnetic Stimulation (dTMS) to Induce Smoking Cessation",
    "Repetitive Transcranial Magnetic Stimulation (rTMS) in the treatment of depression.",
)
s /= 100
s

0.71

In [6]:
# partial_ratio: tries to handle partial (substring-like) matches better than ratio.
# The score is divided by 100 to match the scale of the other cells.
s = fuzz.partial_ratio(
    "Deep Transcranial Magnetic Stimulation (dTMS) to Induce Smoking Cessation",
    "Repetitive Transcranial Magnetic Stimulation (rTMS) in the treatment of depression.",
)
s /= 100
s

0.74

In [7]:
# token_sort_ratio: tries to handle similar strings whose words appear in a different order.
# The score is divided by 100 to match the scale of the other cells.
s = fuzz.token_sort_ratio(
    "Deep Transcranial Magnetic Stimulation (dTMS) to Induce Smoking Cessation",
    "Repetitive Transcranial Magnetic Stimulation (rTMS) in the treatment of depression.",
)
s /= 100
s

0.62

In [8]:
# fuzz.ratio on the two short study titles, rescaled by 100.
s = fuzz.ratio(
    "The PREVAIL Study",
    "The STEAL Study",
)
s /= 100
s

0.81

## Calculate similarity with the "Jaccard index" (token-set similarity, i.e. 1 − Jaccard distance)

In [9]:
def DistJaccard(str1, str2):
    """Return the Jaccard similarity of the word sets of two strings.

    Each string is lower-cased and split on whitespace, and the score is
    ``len(A & B) / len(A | B)`` over the two resulting token sets.  Despite
    the function's name, this is the Jaccard *similarity* (index), not the
    distance: identical word sets score 1.0 and disjoint ones score 0.0.
    Tokens are compared whole, so punctuation attached to a word
    (``"depression."``) or a one-letter difference makes them distinct.

    Parameters
    ----------
    str1, str2 : str
        The strings to compare.

    Returns
    -------
    float
        A value in ``[0.0, 1.0]``.  When neither string contains any token
        (both empty or whitespace-only) the result is 1.0, the usual
        convention for two empty sets, instead of a ``ZeroDivisionError``.
    """
    tokens1 = set(str1.lower().split())
    tokens2 = set(str2.lower().split())
    union = tokens1 | tokens2
    if not union:
        # Both token sets are empty: avoid dividing by zero and treat them as identical.
        return 1.0
    return float(len(tokens1 & tokens2)) / len(union)

In [11]:
# Word-set overlap of the two TMS titles.
DistJaccard(
    str1="Deep Transcranial Magnetic Stimulation (dTMS) to Induce Smoking Cessation",
    str2="Repetitive Transcranial Magnetic Stimulation (rTMS) in the treatment of depression.",
)

0.1875

In [10]:
# Word-set overlap of the two short study titles.
DistJaccard(
    str1="The PREVAIL Study",
    str2="The STEAL Study",
)

0.5

____________

In [12]:
# Titles that share "Multiple Sclerosis" at opposite ends; blanks are junk.
s = SequenceMatcher(
    isjunk=lambda ch: ch == " ",
    a="Multiple Sclerosis Gut Microbiota and",
    b="Rehabilitation and driving with Multiple Sclerosis",
).ratio()
s

0.41379310344827586

In [13]:
# token_sort_ratio on the Multiple Sclerosis titles, rescaled by 100.
s = fuzz.token_sort_ratio(
    'Multiple Sclerosis Gut Microbiota and',
    ' Rehabilitation and driving with Multiple Sclerosis',
)
s /= 100
s

0.55

In [14]:
# fuzz.ratio with the first title reordered so "Multiple Sclerosis" comes last, rescaled by 100.
# NOTE: the second string starts with a blank.
s = fuzz.ratio(
    'Gut Microbiota and Multiple Sclerosis',
    ' Rehabilitation and driving with Multiple Sclerosis',
)
s /= 100
s

0.64

In [15]:
# Word-set overlap of the Multiple Sclerosis titles.
DistJaccard(
    str1='Gut Microbiota and Multiple Sclerosis',
    str2=' Rehabilitation and driving with Multiple Sclerosis',
)

0.375

In [16]:
# Near-identical spellings still share no whole token.
DistJaccard(
    str1='Abir jaza',
    str2='abirr',
)

0.0

____________

In [17]:
# Multiple Sclerosis titles compared with SequenceMatcher; blanks are junk.
s = SequenceMatcher(
    isjunk=lambda ch: ch == " ",
    a="Multiple Sclerosis Gut Microbiota and",
    b="Rehabilitation and driving with Multiple Sclerosis",
).ratio()
s

0.41379310344827586

In [7]:
# A long title against a shorter variant of it (no "(rTMS)", no trailing clause); blanks are junk.
s = SequenceMatcher(
    isjunk=lambda ch: ch == " ",
    a="Repetitive Transcranial Magnetic Stimulation (rTMS) in the treatment of depression of going to the extreme other.",
    b="Repetitive Transcranial Magnetic Stimulation in the treatment of depression.",
).ratio()
s

0.8042328042328042