In [1]:
import re
import collections

In [2]:
def readFile(fileName):
    with open(fileName) as f:
        return str([line.strip() for line in f.readlines()]).lower()

In [3]:
class BytePairEncoding:
    
    def __init__(self, max_iteration, data):
        self.max_iteration = max_iteration
        self.data = self.cleanAndModifyData(data)
        self.sequence = []
    
    def cleanAndModifyData(self,data):
        return [' '.join(list(word+'_')) for word in re.findall("([^\W]+)", data)]
    
    def calc_frequency(self):
        pairs = collections.defaultdict(int)
        for word in self.data:
            x = word.split()
            for i in range(len(x)-1):
                pairs[(x[i], x[i+1])] += 1
        return pairs
    
    def updated_data(self, to_match, data):
        new_formation = []

        for string in data: #'i s _'
            string = string.split()
            new_string = [string[0]];
        
            for idx in range(1,len(string)):
                
                if new_string[-1] + string[idx] == to_match:
                    new_string[-1] += string[idx]
                else:
                    new_string.append(string[idx])
                    
            new_string = ' '.join(new_string)
            
            new_formation.append(new_string)
            
        return new_formation
    
    def train(self):
        for _ in range(self.max_iteration):
            pairs = self.calc_frequency()
            best = max(pairs, key=pairs.get)
            self.data = self.updated_data(''.join(best),self.data)
            self.sequence.append(best)
        return self.data         
            
    def test(self,testData):
        testData = self.cleanAndModifyData(testData)
        for pair in self.sequence:
            testData = self.updated_data(''.join(pair),testData)
        return testData    
                

In [4]:
training_data = readFile('train.txt')

In [5]:
training_data

"['wider', 'low', 'low', 'lowest', 'newer', 'low', 'lowest', 'newer', 'low', 'newer', 'newer', 'new', 'newer', 'low', 'wider', 'newer', 'wider', 'new']"

In [6]:
model = BytePairEncoding(6,training_data)

In [7]:
model.data

['w i d e r _',
 'l o w _',
 'l o w _',
 'l o w e s t _',
 'n e w e r _',
 'l o w _',
 'l o w e s t _',
 'n e w e r _',
 'l o w _',
 'n e w e r _',
 'n e w e r _',
 'n e w _',
 'n e w e r _',
 'l o w _',
 'w i d e r _',
 'n e w e r _',
 'w i d e r _',
 'n e w _']

In [8]:
model.train()

['w i d er_',
 'low _',
 'low _',
 'low e s t _',
 'new er_',
 'low _',
 'low e s t _',
 'new er_',
 'low _',
 'new er_',
 'new er_',
 'new _',
 'new er_',
 'low _',
 'w i d er_',
 'new er_',
 'w i d er_',
 'new _']

In [9]:
model.data

['w i d er_',
 'low _',
 'low _',
 'low e s t _',
 'new er_',
 'low _',
 'low e s t _',
 'new er_',
 'low _',
 'new er_',
 'new er_',
 'new _',
 'new er_',
 'low _',
 'w i d er_',
 'new er_',
 'w i d er_',
 'new _']

In [10]:
model.sequence

[('e', 'r'), ('er', '_'), ('n', 'e'), ('ne', 'w'), ('l', 'o'), ('lo', 'w')]

In [11]:
testing_data = readFile('test.txt')

In [12]:
testing_data

"['lower', 'newest']"

In [13]:
model.test(testing_data)

['low er_', 'new e s t _']