In [1]:
import pandas as pd
import numpy as np
import operator
import re

In [2]:
# Read the train and test CSV files (decoded as UTF-8) into DataFrames.
train = pd.read_csv("train.csv", encoding = 'utf-8')
test = pd.read_csv("test_x.csv", encoding = 'utf-8')

# Print the number of rows in each DataFrame.
print("Number of train texts: ", train.shape[0])
print("Number of test texts: ", test.shape[0])

Number of train texts:  54879
Number of test texts:  19617


In [3]:
def load_embed(file):
    """Load a text embedding file into a dict of word -> float32 vector.

    Each line is split on single spaces; the first field is the word and the
    remaining fields are converted to a ``float32`` NumPy array.

    Args:
        file: Path of the embedding file. The wiki-news path is read as UTF-8
            and only lines longer than 100 characters are kept; any other
            path is read with the ``latin`` codec and every line is kept.

    Returns:
        dict mapping each word to its ``numpy.ndarray`` of dtype float32.
    """
    def get_coefs(word, *arr):
        # First field is the token, the rest are the vector components.
        return word, np.asarray(arr, dtype='float32')

    is_wiki_news = file == 'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    encoding = 'utf-8' if is_wiki_news else 'latin'

    # The context manager closes the file once the dict has been built; the
    # original left the handle open for the garbage collector.
    with open(file, encoding=encoding) as handle:
        # NOTE(review): the length filter presumably drops the short header
        # line of the .vec format - confirm against the actual file.
        lines = (o for o in handle if not is_wiki_news or len(o) > 100)
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in lines)

    return embeddings_index

In [4]:
# Path of the GloVe vectors file that is passed to load_embed.
glove = 'embeddings/glove.840B.300d/glove.840B.300d.txt'

In [5]:
# Parse the GloVe file into a dict of word -> float32 vector via load_embed.
print("Extracting GloVe embedding")
embed_glove = load_embed(glove)

Extracting GloVe embedding


In [6]:
def build_vocab(texts):
    """Count whitespace-separated tokens over a pandas Series of strings.

    Args:
        texts: pandas Series whose values are strings.

    Returns:
        dict mapping each token to the number of times it occurs across all
        rows, keyed in order of first appearance.
    """
    token_lists = texts.apply(lambda text: text.split()).values
    counts = {}
    for tokens in token_lists:
        for token in tokens:
            # Start at zero for unseen tokens, then add this occurrence.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [7]:
def check_coverage(vocab, embeddings_index):
    """Report how much of a vocabulary is covered by an embedding index.

    Prints two percentages: the share of distinct vocabulary entries found in
    ``embeddings_index``, and the share of all occurrences (entries weighted
    by their count in ``vocab``) that those entries account for.

    Args:
        vocab: Mapping of word -> occurrence count.
        embeddings_index: Container supporting ``in`` lookups by word
            (a dict of word -> vector in this notebook).

    Returns:
        List of ``(word, count)`` tuples for the words missing from
        ``embeddings_index``, ordered by descending count. An empty ``vocab``
        prints 0.00% for both figures and returns an empty list.
    """
    nb_known_entries = 0   # distinct vocabulary entries that have a vector
    nb_known_words = 0     # occurrences covered by those entries
    nb_unknown_words = 0   # occurrences of entries without a vector
    unknown_words = {}
    for word, count in vocab.items():
        # Explicit membership test instead of a bare ``except`` so unrelated
        # errors are no longer swallowed and vectors are not copied around.
        if word in embeddings_index:
            nb_known_entries += 1
            nb_known_words += count
        else:
            unknown_words[word] = count
            nb_unknown_words += count

    # Guard both ratios so an empty vocabulary does not divide by zero.
    total_words = nb_known_words + nb_unknown_words
    vocab_ratio = nb_known_entries / len(vocab) if vocab else 0.0
    text_ratio = nb_known_words / total_words if total_words else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))

    # Ascending stable sort followed by a reversal keeps the original tie order.
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

In [8]:
# Token -> count dictionaries built from the 'text' column of each split.
vocab_train = build_vocab(train['text'])
vocab_test = build_vocab(test['text'])

In [9]:
# GloVe coverage of the train vocabulary; oov_glove receives the uncovered
# (word, count) pairs, most frequent first.
print("Glove : ")
oov_glove = check_coverage(vocab_train, embed_glove)

Glove : 
Found embeddings for 28.26% of vocab
Found embeddings for  83.07% of all text


In [10]:
# GloVe coverage of the test vocabulary. This rebinds oov_glove, replacing the
# value assigned by the previous coverage check.
print("Glove : ")
oov_glove = check_coverage(vocab_test, embed_glove)

Glove : 
Found embeddings for 32.97% of vocab
Found embeddings for  85.72% of all text


In [11]:
# Store a lower-cased copy of 'text' in a new column; 'text' itself is unchanged here.
train['lowered_question'] = train['text'].apply(lambda x: x.lower())
test['lowered_question'] = test['text'].apply(lambda x: x.lower())

In [12]:
# Token -> count dictionaries built from the lower-cased column of each split.
vocab_train_low = build_vocab(train['lowered_question'])
vocab_test_low = build_vocab(test['lowered_question'])

In [13]:
# GloVe coverage of the lower-cased train vocabulary (rebinds oov_glove).
print("Glove : ")
oov_glove = check_coverage(vocab_train_low, embed_glove)

Glove : 
Found embeddings for 26.54% of vocab
Found embeddings for  82.97% of all text


In [14]:
# GloVe coverage of the lower-cased test vocabulary (rebinds oov_glove).
print("Glove : ")
oov_glove = check_coverage(vocab_test_low, embed_glove)

Glove : 
Found embeddings for 31.37% of vocab
Found embeddings for  85.63% of all text


In [15]:
def add_lower(embedding, vocab):
    """Give lower-case forms the vector of their cased counterpart.

    For every word of ``vocab`` that has an entry in ``embedding`` while its
    lower-cased form has none, the lower-cased form is added to ``embedding``
    with the same vector. ``embedding`` is modified in place and the number
    of added entries is printed.

    Args:
        embedding: Mutable mapping of word -> vector.
        vocab: Iterable of words to examine.
    """
    added = 0
    for token in vocab:
        lowered = token.lower()
        # Skip tokens without a vector and tokens whose lower-case form
        # already has one.
        if token not in embedding or lowered in embedding:
            continue
        embedding[lowered] = embedding[token]
        added += 1
    print(f"Added {added} words to embedding")

In [16]:
# Add lower-case aliases to embed_glove (in place) for cased words of the train vocabulary.
print("Glove : ")
add_lower(embed_glove, vocab_train)

Glove : 
Added 502 words to embedding


In [17]:
# Add lower-case aliases to embed_glove (in place) for cased words of the test vocabulary.
print("Glove : ")
add_lower(embed_glove, vocab_test)

Glove : 
Added 187 words to embedding


In [18]:
# Re-check coverage of the lower-cased train vocabulary after add_lower (rebinds oov_glove).
print("Glove : ")
oov_glove = check_coverage(vocab_train_low, embed_glove)

Glove : 
Found embeddings for 26.99% of vocab
Found embeddings for  83.10% of all text


In [19]:
# Re-check coverage of the lower-cased test vocabulary after add_lower (rebinds oov_glove).
print("Glove : ")
oov_glove = check_coverage(vocab_test_low, embed_glove)

Glove : 
Found embeddings for 31.86% of vocab
Found embeddings for  85.75% of all text


In [21]:
# Characters treated as punctuation by unknown_punct and clean_special_chars.
# The backslash before '×' is now escaped explicitly: '\×' is an invalid escape
# sequence (a SyntaxWarning on Python 3.12+); the resulting string is unchanged.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

In [22]:
def unknown_punct(embed, punct):
    """List the characters of ``punct`` that have no entry in ``embed``.

    Args:
        embed: Container supporting ``in`` lookups by character.
        punct: Iterable of characters to check.

    Returns:
        A string holding every missing character, each followed by one space,
        in the order in which they occur in ``punct``.
    """
    return ''.join(char + ' ' for char in punct if char not in embed)

In [23]:
# Show which characters of punct have no entry in embed_glove.
print("Glove :")
print(unknown_punct(embed_glove, punct))

Glove :
“ ” ’ ∞ θ ÷ α • à − β ∅ ³ π ‘ ₹ ´ ° £ € × ™ √ ² — – 


In [24]:
# Replacement text for special characters; clean_special_chars applies these
# substitutions in the order listed here.
punct_mapping = {
    "‘": "'",
    "₹": "e",
    "´": "'",
    "°": "",
    "€": "e",
    "™": "tm",
    "√": " sqrt ",
    "×": "x",
    "²": "2",
    "—": "-",
    "–": "-",
    "’": "'",
    "_": "-",
    "`": "'",
    '“': '"',
    '”': '"',
    "£": "e",
    '∞': 'infinity',
    'θ': 'theta',
    '÷': '/',
    'α': 'alpha',
    '•': '.',
    'à': 'a',
    '−': '-',
    'β': 'beta',
    '∅': '',
    '³': '3',
    'π': 'pi',
}

In [25]:
def clean_special_chars(text, punct, mapping):
    """Normalise special characters and isolate punctuation in ``text``.

    Three passes run in this order:
      1. every key of ``mapping`` is replaced by its mapped value;
      2. every character of ``punct`` is surrounded by one space on each side;
      3. a fixed set of leftover sequences is replaced (zero-width space,
         ellipsis, byte-order mark and two Devanagari words).

    Args:
        text: String to clean.
        punct: Iterable of punctuation characters to pad with spaces.
        mapping: dict of substring -> replacement.

    Returns:
        The cleaned string.
    """
    for source, target in mapping.items():
        text = text.replace(source, target)

    for char in punct:
        text = text.replace(char, f' {char} ')

    # Handled last, so the ' ... ' inserted for an ellipsis is not padded again.
    leftovers = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}
    for source, target in leftovers.items():
        text = text.replace(source, target)

    return text

In [26]:
# Apply clean_special_chars to every lower-cased text, overwriting the column in place.
train['lowered_question'] = train['lowered_question'].apply(lambda x: clean_special_chars(x, punct, punct_mapping))
test['lowered_question'] = test['lowered_question'].apply(lambda x: clean_special_chars(x, punct, punct_mapping))

In [27]:
# Rebuild the train vocabulary from the cleaned column (this rebinds vocab_train)
# and re-check GloVe coverage (rebinds oov_glove).
vocab_train = build_vocab(train['lowered_question'])
print("Glove : ")
oov_glove = check_coverage(vocab_train, embed_glove)

Glove : 
Found embeddings for 91.64% of vocab
Found embeddings for  99.74% of all text


In [28]:
# Rebuild the test vocabulary from the cleaned column (this rebinds vocab_test)
# and re-check GloVe coverage (rebinds oov_glove).
vocab_test = build_vocab(test['lowered_question'])
print("Glove : ")
oov_glove = check_coverage(vocab_test, embed_glove)

Glove : 
Found embeddings for 92.49% of vocab
Found embeddings for  99.74% of all text


In [29]:
# Remove special characters and punctuation.
# Series.replace() without regex=True only substitutes cells whose whole value
# equals the given string, so the pattern was never applied to the text.
# Series.str.replace(..., regex=True) performs the per-string regex substitution.
train['lowered_question'] = train['lowered_question'].str.replace(r'[^A-Za-z0-9 ]+', '', regex=True)
test['lowered_question'] = test['lowered_question'].str.replace(r'[^A-Za-z0-9 ]+', '', regex=True)

# Remove every single-character token (one character delimited by spaces or by
# the start/end of the string), then trim leading and trailing whitespace.
train['lowered_question'] = train['lowered_question'].apply(lambda x: re.sub(r"((?<=^)|(?<= )).((?=$)|(?= ))", '', x).strip())
test['lowered_question'] = test['lowered_question'].apply(lambda x: re.sub(r"((?<=^)|(?<= )).((?=$)|(?= ))", '', x).strip())

In [30]:
# Overwrite the original 'text' column with the cleaned, lower-cased text ...
train['text'] = train['lowered_question']
test['text'] = test['lowered_question']

# ... and drop the working column, which is no longer needed.
del train['lowered_question']
del test['lowered_question']

In [35]:
def remove_custom(text):
    """Drop the tokens listed in the module-level ``custom`` collection.

    ``text`` is split on whitespace; a token is kept unless its lower-cased
    form is in ``custom``. The kept tokens are re-joined with single spaces.

    Args:
        text: String to filter.

    Returns:
        The filtered string.
    """
    # split() never yields tokens with surrounding whitespace, so the tokens
    # can be compared and kept as they are.
    kept = [token for token in text.split() if token.lower() not in custom]
    return " ".join(kept)

In [36]:
# Tokens removed by remove_custom, which compares them against lower-cased tokens.
custom = ['odin', 'mr', 'said']

In [38]:
# Strip the custom tokens from every text; remove_custom also re-joins the
# remaining tokens with single spaces.
train['text'] = train['text'].apply(remove_custom)
test['text'] = test['text'].apply(remove_custom)

In [39]:
# Flatten the 'text' column of each split into one list of tokens.
# str.split() with no argument is used instead of split(" ") so that empty rows
# (and any repeated spaces) do not contribute '' tokens to the word lists.
train_word_list = [token for text in train['text'] for token in text.split()]

test_word_list = [token for text in test['text'] for token in text.split()]

In [40]:
def counter(input_list):
    """Count how often each item occurs in ``input_list``.

    Args:
        input_list: Iterable of hashable items.

    Returns:
        dict mapping each distinct item to its number of occurrences, keyed
        in order of first appearance.
    """
    tallies = {}
    for item in input_list:
        # Unseen items start from zero before this occurrence is added.
        tallies[item] = tallies.get(item, 0) + 1
    return tallies

In [47]:
# Count the train tokens, order the (word, count) pairs by descending count and
# tabulate them: column 0 holds the word, column 1 the count.
# word_count is rebound from a dict to a list of tuples here.
word_count = counter(train_word_list)
word_count = sorted(word_count.items(), key=lambda x:x[1], reverse=True)
output_train = pd.DataFrame(word_count)

In [48]:
# Same tabulation for the test tokens; this reuses (and overwrites) word_count.
word_count = counter(test_word_list)
word_count = sorted(word_count.items(), key=lambda x:x[1], reverse=True)
output_test = pd.DataFrame(word_count)

In [55]:
# Words present in both the train and the test word tables, listed in the order
# in which they appear in output_train (descending train frequency).
# The test words are collected into a set once, instead of rebuilding and
# scanning a list for every train word.
test_vocab_words = set(output_test[0])
overlapping_words = [word for word in output_train[0] if word in test_vocab_words]
overlapping_words

['the',
 'and',
 'to',
 'of',
 'in',
 'he',
 'you',
 'it',
 'that',
 'was',
 'his',
 'with',
 'had',
 'for',
 'as',
 'not',
 'at',
 'her',
 'but',
 'my',
 'is',
 'have',
 'she',
 'be',
 'me',
 'him',
 'on',
 'all',
 'so',
 'this',
 'what',
 'there',
 'by',
 'from',
 'no',
 'which',
 'were',
 'we',
 'one',
 'they',
 'if',
 'would',
 'been',
 'are',
 'your',
 'an',
 'very',
 'do',
 'when',
 'could',
 'will',
 'out',
 'or',
 'up',
 'man',
 'upon',
 'them',
 'now',
 'more',
 'who',
 'then',
 'am',
 'some',
 'know',
 'into',
 'well',
 'about',
 'did',
 'time',
 'how',
 'only',
 'little',
 'can',
 'see',
 'their',
 'come',
 'like',
 'before',
 'should',
 'must',
 'here',
 'such',
 'any',
 'good',
 'has',
 'down',
 'than',
 'say',
 'much',
 'think',
 'us',
 'again',
 'never',
 'our',
 'too',
 'cried',
 'may',
 'sir',
 'two',
 'go',
 'other',
 'don',
 'over',
 'after',
 'though',
 'nothing',
 'made',
 'himself',
 'old',
 'own',
 'came',
 'great',
 'last',
 'why',
 'way',
 'thought',
 'might',


In [56]:
# Words that occur in only one of the two splits, kept in the order of the
# corresponding word table. A set makes each membership test constant-time
# instead of scanning the overlapping_words list for every word.
overlapping_set = set(overlapping_words)

train_unique = [txt for txt in output_train[0] if txt not in overlapping_set]

test_unique = [txt for txt in output_test[0] if txt not in overlapping_set]

In [59]:
# Convert the train-only word list to a Series (rebinding train_unique) and
# write it to train_unique.csv with to_csv's default options.
train_unique = pd.Series(train_unique)
train_unique.to_csv('train_unique.csv')

In [60]:
# Convert the test-only word list to a Series (rebinding test_unique) and
# write it to test_unique.csv with to_csv's default options.
test_unique = pd.Series(test_unique)
test_unique.to_csv('test_unique.csv')

In [None]:
# https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing