# Word Overlap
Calculate word overlap between corpora.  
We use the Conceptual Captions corpus in this notebook.  
Make sure that the conceptual_captions directory exists in the working directory.  
The conceptual_captions directory is expected to have the following files:  
- Train_GCC-training.tsv
- Validation_GCC-1.1.0-Validation.tsv
- train_ids.txt
- valid_ids.txt

The tsv files are available at https://ai.google.com/research/ConceptualCaptions/download  
The txt files are available at https://github.com/e-bug/volta/tree/main/data/conceptual_captions  

In [1]:
# We use nltk to obtain the English stopwords
!pip install nltk



In [2]:
import os
import json
import numpy as np
from scipy.spatial import distance
import nltk
import datasets
from conceptual_captions import ConceptualCaptionsTextDataset

In [3]:
# Fetch the NLTK 'stopwords' corpus, which supplies the English stop-word list used below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ikitaichi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Text fields to read for each corpus: the GLUE tasks plus Conceptual Captions ('cc').
# Each value is a (first_field, second_field) pair; None means there is no second field.
task_to_keys = dict(
    cola=("sentence", None),
    mnli=("premise", "hypothesis"),
    mrpc=("sentence1", "sentence2"),
    qnli=("question", "sentence"),
    qqp=("question1", "question2"),
    rte=("sentence1", "sentence2"),
    sst2=("sentence", None),
    stsb=("sentence1", "sentence2"),
    wnli=("sentence1", "sentence2"),
    cc=("sentence", None),
)

In [5]:
def load_dataset(task):
    """Load the corpus identified by ``task``.

    ``'cc'`` returns a ``ConceptualCaptionsTextDataset``; any other value is
    passed to ``datasets.load_dataset`` as a configuration name of 'glue'.
    """
    is_conceptual_captions = task == 'cc'
    if not is_conceptual_captions:
        return datasets.load_dataset('glue', task)
    return ConceptualCaptionsTextDataset()

In [6]:
def word_count(dataset_split, keys, do_uncase):
    """Tally whitespace-separated words over the selected text fields of a split.

    dataset_split: indexable, sized collection of examples (each indexable by key).
    keys: names of the text fields to read from every example.
    do_uncase: when true, text is lowercased before it is split.

    Returns a dict mapping each word to its number of occurrences.
    """
    frequencies = {}
    n_examples = len(dataset_split)
    for index in range(n_examples):
        record = dataset_split[index]
        for field in keys:
            raw = record[field]
            sentence = raw.lower() if do_uncase else raw
            # str.split() with no argument splits on runs of whitespace
            for token in sentence.split():
                if token in frequencies:
                    frequencies[token] += 1
                else:
                    frequencies[token] = 1
    return frequencies

In [7]:
def calc_word_overlap(count1, count2, stop_words=(), top_k=None):
    """Calculate the Simpson (overlap) coefficient between two vocabularies.

    Each vocabulary is built from a {word: count} dict by sorting the words by
    descending count, dropping the stop words, and keeping the first ``top_k``
    of the remaining words (all of them when ``top_k`` is None).

    count1, count2: dicts mapping word -> count.
    stop_words: iterable of hashable words to exclude from both vocabularies.
    top_k: maximum vocabulary size after stop-word removal, or None for no limit.

    Returns |V1 & V2| / min(|V1|, |V2|) as a float, or 0.0 when either
    vocabulary is empty (the coefficient is undefined in that case).
    """
    # A set gives O(1) membership tests instead of scanning a list for every word.
    stop_set = set(stop_words)

    def preprocess(c):
        # sorted() is stable, so words with equal counts keep their insertion order
        sorted_c = sorted(c.items(), key=lambda t: -t[-1])  # descending order
        f = [w for w, _ in sorted_c if w not in stop_set]
        return f[:top_k] if top_k is not None else f

    s1 = set(preprocess(count1))
    s2 = set(preprocess(count2))

    smaller = min(len(s1), len(s2))
    if smaller == 0:
        # Avoid ZeroDivisionError for an empty (or fully filtered) vocabulary.
        return 0.0
    return len(s1 & s2) / smaller

In [8]:
def calc_jaccard(count1, count2, stop_words=(), top_k=None):
    """Calculate the Jaccard index between two vocabularies.

    Each vocabulary is built from a {word: count} dict by sorting the words by
    descending count, dropping the stop words, and keeping the first ``top_k``
    of the remaining words (all of them when ``top_k`` is None).

    count1, count2: dicts mapping word -> count.
    stop_words: iterable of hashable words to exclude from both vocabularies.
    top_k: maximum vocabulary size after stop-word removal, or None for no limit.

    Returns |V1 & V2| / |V1 | V2| as a float, or 0.0 when both vocabularies
    are empty (the index is undefined in that case).
    """
    # A set gives O(1) membership tests instead of scanning a list for every word.
    stop_set = set(stop_words)

    def preprocess(c):
        # sorted() is stable, so words with equal counts keep their insertion order
        sorted_c = sorted(c.items(), key=lambda t: -t[-1])  # descending order
        f = [w for w, _ in sorted_c if w not in stop_set]
        return f[:top_k] if top_k is not None else f

    s1 = set(preprocess(count1))
    s2 = set(preprocess(count2))

    union = s1 | s2
    if not union:
        # Avoid ZeroDivisionError when both vocabularies are empty.
        return 0.0
    return len(s1 & s2) / len(union)

In [9]:
# Configuration
# Sentences are lowercased and split on whitespace.
# After removing stop words (defined in nltk), up to the top 10,000 words of each corpus
# are used to calculate the Simpson coefficient and the Jaccard index.
do_uncase = True  # lowercase the text before counting words
stop_words = nltk.corpus.stopwords.words('english')  # words excluded from every vocabulary
top_k = 10000  # maximum vocabulary size per corpus after stop-word removal
ref_split = 'cc_train'  # corpus_data key of the reference corpus every corpus is compared with
# Column order of the printed result table
header = ['ref_split', '_id', 'n_examples', 'keys', 'do_uncase', 'vocab_size', 'overlap', 'jaccard']

In [10]:
# Show the stop words that are excluded from the vocabularies
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
# Calculate word counts for all corpora
# Every entry of task_to_keys (the GLUE tasks and 'cc') contributes its train split;
# Conceptual Captions additionally contributes its valid split.
# 'cc' is already a key of task_to_keys, so ('cc', 'train') must not be appended again:
# doing so would load and count the largest corpus twice for the same corpus_data entry.
targets = [(task, 'train') for task in task_to_keys]
targets.append(('cc', 'valid'))

corpus_data = {}
loaded_task = None  # task whose dataset is currently held in `dataset`
for task, split in targets:
    _id = f'{task}_{split}'
    # Drop the None placeholder of single-field tasks
    keys = tuple(key for key in task_to_keys[task] if key)
    # Consecutive targets of the same task reuse the dataset instead of reloading it
    if task != loaded_task:
        dataset = load_dataset(task)
        loaded_task = task
    count = word_count(dataset[split], keys, do_uncase)
    corpus_data[_id] = {
        '_id': _id,
        'n_examples': len(dataset[split]),
        'keys': keys,
        'do_uncase': do_uncase,
        'vocab_size': len(count),
        'count': count,
    }

Reusing dataset glue (/home/ikitaichi/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset glue (/home/ikitaichi/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/5 [00:00<?, ?it/s]

Reusing dataset glue (/home/ikitaichi/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset glue (/home/ikitaichi/.cache/huggingface/datasets/glue/qnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset glue (/home/ikitaichi/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset glue (/home/ikitaichi/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset glue (/home/ikitaichi/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset glue (/home/ikitaichi/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset glue (/home/ikitaichi/.cache/huggingface/datasets/glue/wnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# Output the table
# One tab-separated row per corpus, comparing its vocabulary with the reference corpus.
print(*header, sep='\t')
ref_count = corpus_data[ref_split]['count']  # loop-invariant reference word counts
for _id, data in sorted(corpus_data.items()):
    # Build the row in a fresh dict so the entries of corpus_data are not modified.
    # The configured top_k is used instead of a hard-coded vocabulary size.
    row = dict(
        data,
        ref_split=ref_split,
        overlap=calc_word_overlap(ref_count, data['count'], stop_words, top_k),
        jaccard=calc_jaccard(ref_count, data['count'], stop_words, top_k),
    )
    # Avoid `_` as a loop variable: IPython uses it for the last cell output.
    print(*[row[column] for column in header], sep='\t')

ref_split	_id	n_examples	keys	do_uncase	vocab_size	overlap	jaccard
cc_train	cc_train	2779407	('sentence',)	True	48360	1.0	1.0
cc_train	cc_valid	14464	('sentence',)	True	10442	0.7734	0.6305233980107614
cc_train	cola_train	8551	('sentence',)	True	7845	0.35630841121495327	0.18350157096062572
cc_train	mnli_train	392702	('premise', 'hypothesis')	True	167790	0.5021	0.33520261699712933
cc_train	mrpc_train	3668	('sentence1', 'sentence2')	True	13926	0.4006	0.2504689258471927
cc_train	qnli_train	104743	('question', 'sentence')	True	148413	0.4431	0.2846040208105851
cc_train	qqp_train	363846	('question1', 'question2')	True	193041	0.4282	0.27242651736862195
cc_train	rte_train	2490	('sentence1', 'sentence2')	True	23341	0.3751	0.23084497507538926
cc_train	sst2_train	67349	('sentence',)	True	14816	0.4025	0.2519561815336463
cc_train	stsb_train	5749	('sentence1', 'sentence2')	True	16436	0.3938	0.24517494708006474
cc_train	wnli_train	635	('sentence1', 'sentence2')	True	1622	0.4990072799470549	0.070093892