<a href="https://colab.research.google.com/github/Dev-180Memes/text-deduplication/blob/main/Text_Deduplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import hashlib
import time
import psutil
import os
from collections import defaultdict
import re

In [2]:
def get_memory_usage():
  """Return the resident set size (RSS) of the current process, in MiB."""
  bytes_per_mib = 1024 * 1024
  current_process = psutil.Process(os.getpid())
  rss_bytes = current_process.memory_info().rss
  return rss_bytes / bytes_per_mib

In [10]:
class SimHash:
  """SimHash fingerprint of a text, built from word-frequency weights.

  The text is lowercased and split into ``\\w+`` tokens. Each distinct token
  is hashed with SHA-1 and casts a vote, weighted by its occurrence count,
  on each of the ``hash_size`` low-order bit positions; a fingerprint bit is
  set when the net vote for that position is positive.

  Attributes:
    hash_size: Number of bits in the fingerprint. SHA-1 yields 160 bits, so
      positions at or above 160 never receive a positive vote and stay 0.
    simhash: The fingerprint, a non-negative ``int``.
  """

  def __init__(self, text, hash_size=64):
    self.hash_size = hash_size
    self.simhash = self._compute_simhash(text)

  def _tokenize(self, text):
    """Return the lowercase ``\\w+`` tokens of ``text`` in order of appearance."""
    return re.findall(r'\w+', text.lower())

  def _hash(self, token):
    """Return the SHA-1 digest of ``token`` (UTF-8 encoded) as an integer."""
    return int(hashlib.sha1(token.encode('utf-8')).hexdigest(), 16)

  def _compute_simhash(self, text):
    """Compute the fingerprint of ``text``; a text with no tokens yields 0."""
    weights = defaultdict(int)
    for token in self._tokenize(text):
      weights[token] += 1

    # Per-bit vote tally: +weight when the token hash has the bit set, -weight otherwise.
    votes = [0] * self.hash_size
    for token, weight in weights.items():
      token_hash = self._hash(token)
      for i in range(self.hash_size):
        votes[i] += weight if (token_hash >> i) & 1 else -weight

    fingerprint = 0
    for i, vote in enumerate(votes):
      if vote > 0:
        fingerprint |= 1 << i
    return fingerprint

  def hamming_distance(self, other_simhash):
    """Return the number of bit positions where the two fingerprints differ.

    Args:
      other_simhash: A fingerprint as a non-negative ``int``, or another
        ``SimHash`` instance (its ``simhash`` attribute is used).

    Raises:
      ValueError: If the fingerprint is negative. A negative value has no
        finite bit pattern, so a bit-by-bit shift loop would never reach 0.
    """
    if isinstance(other_simhash, SimHash):
      other_simhash = other_simhash.simhash
    if other_simhash < 0:
      raise ValueError("fingerprint must be a non-negative integer")
    return bin(self.simhash ^ other_simhash).count('1')

In [23]:
class CDCTTTD:
    """Fingerprint of a text built from content-defined chunks (TTTD-style).

    The lowercased text is cut into variable-size chunks at positions chosen
    by a hash of the preceding window, using two size thresholds
    (``min_size``, ``max_size``) and two divisors (``primary_div``,
    ``backup_div``). Each distinct chunk is hashed with SHA-1 and votes,
    weighted by its occurrence count, on each fingerprint bit, in the same
    way a SimHash is assembled.

    Args:
        text: Text to fingerprint.
        min_size: Minimum chunk length; no cut is made closer than this to
            the start of the current chunk.
        max_size: Maximum chunk length; a cut is forced once it is reached.
        window_size: Number of characters hashed to test a cut position.
        hash_size: Number of bits in the fingerprint.
        primary_div: A position whose window hash is divisible by this value
            is a primary breakpoint.
        backup_div: A position whose window hash is divisible by this value
            is remembered as a fallback breakpoint for a forced cut.

    Attributes:
        fingerprint: The computed fingerprint, a non-negative ``int``.
    """

    def __init__(self, text, min_size=64, max_size=256, window_size=48, hash_size=64,
                 primary_div=2**13, backup_div=2**11):
        self.min_size = min_size
        self.max_size = max_size
        self.window_size = window_size
        self.hash_size = hash_size
        self.primary_div = primary_div
        self.backup_div = backup_div
        self.fingerprint = self._compute_fingerprint(text)

    def _rolling_hash(self, window):
        """Return the MD5 digest of ``window`` as an integer.

        Despite the name, the hash is recomputed from scratch for every
        window rather than updated incrementally.
        """
        return int(hashlib.md5(window.encode('utf-8')).hexdigest(), 16)

    def _chunk_text(self, text):
        """Split the lowercased ``text`` into content-defined chunks.

        A text shorter than ``min_size`` is returned as a single chunk.
        Otherwise every chunk except possibly the last has a length between
        ``min_size`` and ``max_size``, and the last is at most ``max_size``
        long. Concatenating the chunks reproduces the lowercased text.
        """
        text = text.lower()  # Normalize case
        length = len(text)
        if length < self.min_size:
            return [text]

        # Always advance by at least one character so a non-positive
        # min_size cannot produce empty chunks or stall the scan.
        stride = max(1, self.min_size)

        chunks = []
        start = 0
        backup = None  # most recent backup breakpoint inside the current chunk
        pos = start + stride

        while pos < length:
            window = text[max(0, pos - self.window_size):pos]
            hash_val = self._rolling_hash(window)

            cut = None
            if hash_val % self.primary_div == 0:
                cut = pos
            else:
                if hash_val % self.backup_div == 0:
                    backup = pos
                if pos - start >= self.max_size:
                    # Forced cut: prefer the remembered backup breakpoint so
                    # the boundary still depends on content where possible.
                    cut = backup if backup is not None else pos

            if cut is None:
                pos += 1
                continue

            chunks.append(text[start:cut])
            start = cut
            backup = None
            # Thresholds are measured from the start of the new chunk.
            pos = start + stride

        if start < length:
            chunks.append(text[start:])
        return chunks

    def _compute_fingerprint(self, text):
        """Combine the SHA-1 hashes of the text's chunks into one fingerprint."""
        # Weight chunks by their frequency
        chunk_weights = defaultdict(int)
        for chunk in self._chunk_text(text):
            chunk_weights[chunk] += 1

        # Per-bit vote tally: +weight when the chunk hash has the bit set, -weight otherwise.
        votes = [0] * self.hash_size
        for chunk, weight in chunk_weights.items():
            chunk_hash = int(hashlib.sha1(chunk.encode('utf-8')).hexdigest(), 16)
            for i in range(self.hash_size):
                votes[i] += weight if (chunk_hash >> i) & 1 else -weight

        fingerprint = 0
        for i, vote in enumerate(votes):
            if vote > 0:
                fingerprint |= 1 << i
        return fingerprint

    def hamming_distance(self, other_fingerprint):
        """Return the number of bit positions where the two fingerprints differ.

        Args:
            other_fingerprint: A fingerprint as a non-negative ``int``, or
                another ``CDCTTTD`` instance (its ``fingerprint`` is used).

        Raises:
            ValueError: If the fingerprint is negative. A negative value has
                no finite bit pattern, so a bit-by-bit shift loop would
                never reach 0.
        """
        if isinstance(other_fingerprint, CDCTTTD):
            other_fingerprint = other_fingerprint.fingerprint
        if other_fingerprint < 0:
            raise ValueError("fingerprint must be a non-negative integer")
        return bin(self.fingerprint ^ other_fingerprint).count('1')

In [29]:
def _read_nonblank_lines(path):
    """Return the whitespace-stripped, non-empty lines of the UTF-8 file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        stripped = (raw.strip() for raw in f)
        return [line for line in stripped if line]


def _filter_near_duplicates(lines, build, fingerprint_of, threshold):
    """Return the lines whose fingerprint is not within ``threshold`` bits of an earlier kept line.

    ``build(line)`` must return an object with a ``hamming_distance(int)``
    method; ``fingerprint_of(obj)`` extracts its integer fingerprint.
    """
    seen = []
    unique = []
    for line in lines:
        candidate = build(line)
        if any(candidate.hamming_distance(fp) <= threshold for fp in seen):
            continue
        seen.append(fingerprint_of(candidate))
        unique.append(line)
    return unique


def _evaluate_method(lines, build, fingerprint_of, threshold, output_path):
    """Deduplicate ``lines`` with one method, write the survivors, and return its metrics."""
    # perf_counter is a monotonic, high-resolution clock meant for measuring intervals.
    start_time = time.perf_counter()
    start_memory = get_memory_usage()
    unique = _filter_near_duplicates(lines, build, fingerprint_of, threshold)
    elapsed = time.perf_counter() - start_time
    memory_delta = get_memory_usage() - start_memory

    total = len(lines)
    ratio = (total - len(unique)) / total if total > 0 else 0

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in unique)

    return {'ratio': ratio, 'time': elapsed, 'memory': memory_delta}


def evaluate_deduplication(input_file, output_simhash, output_cdctttd,
                         simhash_threshold=3, cdctttd_threshold=3):
    """Deduplicate a text file line by line with SimHash and CDC-TTTD and compare them.

    Blank lines are ignored and surrounding whitespace is stripped. For each
    method, a line is dropped when its fingerprint lies within the method's
    Hamming-distance threshold of a previously kept line; the kept lines are
    written to the method's output file, one per line.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_simhash: Path that receives the lines kept by SimHash.
        output_cdctttd: Path that receives the lines kept by CDC-TTTD.
        simhash_threshold: Maximum Hamming distance treated as a duplicate
            for SimHash.
        cdctttd_threshold: Maximum Hamming distance treated as a duplicate
            for CDC-TTTD.

    Returns:
        A dict with keys ``'simhash'`` and ``'cdctttd'``, each mapping to a
        dict with ``'ratio'`` (fraction of lines removed, 0 for empty input),
        ``'time'`` (elapsed seconds) and ``'memory'`` (change in process RSS
        in MiB over the run, as reported by ``get_memory_usage``).
    """
    lines = _read_nonblank_lines(input_file)

    simhash_metrics = _evaluate_method(
        lines, SimHash, lambda hasher: hasher.simhash,
        simhash_threshold, output_simhash)
    cdctttd_metrics = _evaluate_method(
        lines, CDCTTTD, lambda hasher: hasher.fingerprint,
        cdctttd_threshold, output_cdctttd)

    return {'simhash': simhash_metrics, 'cdctttd': cdctttd_metrics}

In [30]:
# File names for the demo run: the corpus to read and one output per method.
input_file, output_simhash, output_cdctttd = (
    'input.txt',
    'output_simhash.txt',
    'output_cdctttd.txt',
)

In [35]:
# Demo corpus, one sentence per line. Most sentences appear as an exact repeat,
# a case/punctuation variant, and a reworded paraphrase, followed by a few
# unrelated lines. Every line after the first carries four leading spaces
# inside the literal; evaluate_deduplication strips each line when reading.
sample_content = """The quick brown fox jumps over the lazy dog
    The quick brown fox jumps over the lazy dog
    The Quick Brown Fox Jumps Over The Lazy Dog!
    The quick brown fox leaps over the idle dog.
    A swift brown fox jumps over a resting dog
    Completely different content about cats
    Rain falls gently on the green meadow
    Rain Falls Gently on the Green Meadow
    rain falls gently on the green meadow!
    Rain drops softly on the verdant field
    Sunshine warms the golden hills today
    Sunshine warms the golden hills today
    SUNSHINE WARMS THE GOLDEN HILLS TODAY
    Sunshine heats the yellow slopes now
    The old oak tree stands tall
    The old oak tree stands tall
    the OLD Oak Tree Stands TALL!
    The ancient oak tree rises high
    Birds sing sweetly in the morning
    Birds sing sweetly in the morning
    birds SING sweetly in the MORNING
    Birds chirp pleasantly at dawn
    Quiet rivers flow through the valley
    Quiet rivers flow through the valley
    QUIET RIVERS FLOW THROUGH THE VALLEY!
    Silent streams run across the vale
    Snow covers the mountain peaks
    Snow covers the mountain peaks
    snow COVERS the mountain PEAKS
    Frost blankets the high summits
    Wind blows across the open plains
    Wind blows across the open plains
    WIND BLOWS ACROSS THE OPEN PLAINS
    Breeze sweeps over the wide fields
    Stars shine brightly in the night sky
    Stars shine brightly in the night sky
    stars SHINE brightly in the NIGHT sky!
    Stars glow vividly in the dark heavens
    Moonlight dances on the calm lake
    Moonlight dances on the calm lake
    MOONLIGHT DANCES ON THE CALM LAKE
    Moonbeams play on the still water
    Children play happily in the park
    Children play happily in the park
    children PLAY happily in the PARK!
    Kids frolic joyfully in the playground
    Flowers bloom in the spring garden
    Flowers bloom in the spring garden
    FLOWERS BLOOM IN THE SPRING GARDEN
    Blossoms open in the vernal yard
    Thunder rumbles in the distance
    Thunder rumbles in the distance
    THUNDER RUMBLES IN THE DISTANCE!
    Thunder rolls far away
    Waves crash against the rocky shore
    Waves crash against the rocky shore
    waves CRASH against the ROCKY shore
    Surf pounds on the stone coast
    Total unique content about space travel
    Another completely different topic
    Yet another distinct subject matter
"""

In [36]:
# Put the sample corpus on disk so the evaluation reads it like any other input file.
with open(input_file, mode='w', encoding='utf-8') as sample_file:
    sample_file.write(sample_content)

In [37]:
# Run both deduplicators over the sample file with their default thresholds.
results = evaluate_deduplication(
    input_file=input_file,
    output_simhash=output_simhash,
    output_cdctttd=output_cdctttd,
)

In [38]:
# Report the same three metrics for each method, in a fixed order.
print("Evaluation Metrics:")
for heading, method_key in (("\nSimHash:", 'simhash'), ("\nCDC-TTTD:", 'cdctttd')):
    method_metrics = results[method_key]
    print(heading)
    print(f"Deduplication Ratio: {method_metrics['ratio']:.2%}")
    print(f"Execution Time: {method_metrics['time']:.4f} seconds")
    print(f"Memory Utilization: {method_metrics['memory']:.2f} MB")

Evaluation Metrics:

SimHash:
Deduplication Ratio: 45.90%
Execution Time: 0.0111 seconds
Memory Utilization: 0.00 MB

CDC-TTTD:
Deduplication Ratio: 34.43%
Execution Time: 0.0081 seconds
Memory Utilization: 0.00 MB
