<a href="https://colab.research.google.com/github/Coder-Nikita/spell_checker/blob/master/spell_checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
import os

**Downloading the dataset: Google's One Billion Word Benchmark corpus**

In [1]:
# Fetch the corpus archive (about 1.7 GB). `-c` resumes a partial download and
# skips the transfer when the local file is already complete, so re-running
# this cell does not save a second copy next to the first one.
!wget -c http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz

--2019-04-10 10:21:11--  http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
Resolving www.statmt.org (www.statmt.org)... 129.215.197.184
Connecting to www.statmt.org (www.statmt.org)|129.215.197.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1792209805 (1.7G) [application/x-gzip]
Saving to: ‘1-billion-word-language-modeling-benchmark-r13output.tar.gz’


2019-04-10 11:33:48 (402 KB/s) - ‘1-billion-word-language-modeling-benchmark-r13output.tar.gz’ saved [1792209805/1792209805]



**Extracting the .tar.gz archive**

In [2]:
# Unpack the downloaded archive into the current working directory
# (-z: gunzip, -x: extract, -v: list every extracted entry, -f: archive file).
!tar -zxvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz

1-billion-word-language-modeling-benchmark-r13output/
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00024-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00057-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00055-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00096-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00081-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00033-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00072-of-00100
1-billion-word-language-modeling-benchma

# **loading the data**

In [0]:
def load_file(path, encoding="utf-8"):
    """Read a whole text file from the corpus and return its contents.

    Args:
        path: Location of the file to read (string or path-like object).
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the locale of the machine running
            the notebook.

    Returns:
        The complete file contents as a single ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    with open(path, encoding=encoding) as f:
        return f.read()

In [0]:
# Directory that holds the extracted training shards of the corpus.
path = '1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/'
# Keep only regular files from the directory listing (sub-directories are ignored).
train_files = list(filter(lambda name: isfile(join(path, name)), listdir(path)))
# Discard the first entry of the listing.
train_files = train_files[1:]

In [0]:
# Read every training file fully into memory, one string per file.
# NOTE: the loop variable `text` holds a file *name* (not file contents) and
# stays bound to the last name after the loop finishes.
texts = []
for text in train_files:
    # join() builds a valid path whether or not `path` ends with a separator.
    texts.append(load_file(join(path, text)))

In [6]:
# Preview the first 500 characters of the first loaded file.
texts[0][:500]

'Already German dairy manufacturers Milram have said they are considering their options as the already battered image of cycling takes another blow .\nQuoting " people familiar with the situation , " Saturday \'s Wall Street Journal said discussions involving large MGM Mirage bondholders such as activist investor Carl Icahn and private-equity fund Oaktree Capital Management are focusing on having the Las Vegas firm file for Chapter 11 bankruptcy protection and then pumping in new capital .\nParadoxi'

# **preparing data**

In [11]:
import utils

# get list of words
# Pass the corpus text explicitly instead of relying on `text`, which is only a
# leftover loop variable: after the file-loading loop it holds a file name,
# not file contents, when the notebook is run top to bottom.
# NOTE(review): the last loaded file is assumed to be the intended input — confirm.
words = utils.preprocess(texts[-1])
print(words[:30])

['as', 'well', 'as', 'funding', 'equipment', '<COMMA>', 'the', 'foundation', 'has', 'paid', 'for', 'the', 'salary', 'of', 'a', 'full-time', 'nurse', 'and', 'doctor', 'over', 'the', 'next', 'three', 'years', '<PERIOD>', 'the', 'alert', 'among', 'you', 'will']


In [12]:
# Report the corpus size: total token count, then the number of distinct tokens.
print(
    "Total words in text: {}".format(len(words)),
    "Unique words: {}".format(len(set(words))),
    sep="\n",
)

Total words in text: 7682879
Unique words: 36283


In [0]:
# a dictionary to convert the vocabulary to integers
# Ids are handed out in order of first appearance, starting at 0.
vocab_to_int = {}
count = 0
for text in texts:
    # dict.fromkeys() de-duplicates the characters of one file at C speed while
    # keeping first-occurrence order (dicts preserve insertion order), so the
    # Python-level loop body runs once per distinct character instead of once
    # per character of the corpus. The resulting ids are unchanged.
    for character in dict.fromkeys(text):
        if character not in vocab_to_int:
            vocab_to_int[character] = count
            count += 1

# Add special tokens to vocab_to_int
# They receive the next free ids, after every character seen in the corpus.
codes = ['<PAD>','<EOS>','<GO>']
for code in codes:
    vocab_to_int[code] = count
    count += 1

In [8]:
# Number of entries in the lookup table (characters plus the special tokens).
vocab_size = len(vocab_to_int)
# First line: the size summary; second line: every key in sorted order.
print(
    "The vocabulary contains {} characters.".format(vocab_size),
    sorted(vocab_to_int.keys()),
    sep="\n",
)

The vocabulary contains 946 characters.
['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '<EOS>', '<GO>', '<PAD>', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x80', '\x83', '\x88', '\x8a', '\x8c', '\x8e', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x99', '\x9a', '\x9c', '\x9d', '\x9e', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', 'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'à

In [0]:
# Inverse lookup table: integer id -> vocabulary entry.
int_to_vocab = {index: symbol for symbol, index in vocab_to_int.items()}

In [13]:
# Convert every token in `words` to its integer id via `vocab_to_int`.
# NOTE(review): `vocab_to_int` is keyed by the individual characters of `texts`
# plus three special tokens, whereas `words` comes from utils.preprocess; the
# recorded run of this cell ended in a KeyError — confirm which lookup table
# this cell is meant to use.
int_words = [vocab_to_int[word] for word in words]

# Show the first 30 ids as a sanity check.
print(int_words[:30])

KeyError: ignored

**making batches**

**generating batches**

# **building model**

In [0]:
import torch
from torch import nn
import torch.optim as optim

In [0]:
class SkipGram(nn.Module):
    """Minimal skip-gram network.

    An embedding layer is followed by a linear projection back onto the
    vocabulary and a log-softmax over the vocabulary axis.

    Args:
        n_vocab: Number of vocabulary entries (embedding rows and output
            scores per input id).
        n_embed: Size of each embedding vector.
    """

    def __init__(self, n_vocab, n_embed):
        super().__init__()
        self.embed = nn.Embedding(n_vocab, n_embed)
        self.output = nn.Linear(n_embed, n_vocab)
        # dim=1 normalises across the vocabulary axis of a (batch, n_vocab)
        # score matrix.
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-probabilities over the vocabulary for each input id.

        Args:
            x: LongTensor of vocabulary indices, expected to be 1-D with
                shape ``(batch,)`` so that the scores are 2-D.

        Returns:
            Tensor of shape ``(batch, n_vocab)`` holding log-probabilities,
            the input format ``nn.NLLLoss`` works on.
        """
        embedded = self.embed(x)
        scores = self.output(embedded)
        # Return the log-probabilities, not the raw embeddings: the training
        # loop feeds this output straight into NLLLoss together with target ids.
        return self.log_softmax(scores)

# **training**

In [0]:
# check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

embedding_dim=300 #can be changed

# One output unit per entry of vocab_to_int; the model is moved to `device`.
model = SkipGram(len(vocab_to_int), embedding_dim).to(device)

criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

print_every = 100  # print nearest-neighbour samples every 100 optimisation steps
steps = 0          # counts batches across all epochs (never reset)
epochs = 5

# NOTE(review): get_batches, train_words and cosine_similarity are not defined
# in the visible part of this notebook — confirm where they come from.
# train for some number of epochs
for e in range(epochs):
    
    # get input and target batches
    # (512 is the second argument to get_batches — presumably the batch size; confirm)
    for inputs, targets in get_batches(train_words, 512):
        steps += 1
        inputs, targets = torch.LongTensor(inputs), torch.LongTensor(targets)
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass, then clear old gradients before back-propagating the new loss.
        log_ps = model(inputs)
        loss = criterion(log_ps, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if steps % print_every == 0:                  
            # getting examples and similarities      
            valid_examples, valid_similarities = cosine_similarity(model.embed, device=device)
            _, closest_idxs = valid_similarities.topk(6) # topk highest similarities
            
            # Move the index tensors to the CPU before converting them to Python ints.
            valid_examples, closest_idxs = valid_examples.to('cpu'), closest_idxs.to('cpu')
            for ii, valid_idx in enumerate(valid_examples):
                # [1:] drops the first of the six hits — presumably the example itself; confirm.
                closest_words = [int_to_vocab[idx.item()] for idx in closest_idxs[ii]][1:]
                print(int_to_vocab[valid_idx.item()] + " | " + ', '.join(closest_words))
            print("...")

# **correcting spelling mistakes of input words**

In [0]:
def text_to_ints(text):
    """Map each element of ``text`` to its integer id in ``vocab_to_int``.

    Iterating a string yields single characters, so a string argument is
    encoded character by character. Raises ``KeyError`` for any element
    that is missing from ``vocab_to_int``.
    """
    ids = []
    for symbol in text:
        ids.append(vocab_to_int[symbol])
    return ids

In [0]:
# Sample sentence containing misspellings, encoded to integer ids in one step.
text = text_to_ints("they are condisering their optons as the arleady")

In [0]:
# Id of the padding token; positions equal to it are left out of the response below.
pad = vocab_to_int["<PAD>"]

# Echo the encoded input and decode it back to a string.
print('\nText')
print('  Word Ids:    {}'.format(list(text)))
print('  Input Words: {}'.format("".join(int_to_vocab[i] for i in text)))

# Show the ids in `answer_logits` and their decoded form, skipping padding.
print('\nSummary')
print('  Word Ids:       {}'.format(list(filter(lambda i: i != pad, answer_logits))))
print('  Response Words: {}'.format("".join(int_to_vocab[i] for i in answer_logits if i != pad)))