In [1]:
import pickle

# Using simple tokenizer

In [2]:
# Load the raw corpus sentences (the "simple tokenizer" variant of the dataset).
# NOTE: pickle.load can execute arbitrary code; only open pickles from a trusted source.
with open('raw_sentences.pickle', 'rb') as file:
    all_sentences = pickle.load(file)


In [3]:
# Number of sentences loaded from raw_sentences.pickle.
len(all_sentences)

99766

```Total number of sentences: 99766```

In [4]:
# Flatten the nested sentences into one corpus-wide sequence of words.
single_sequence = [
    token
    for sentence_tokens in all_sentences
    for token in sentence_tokens
]
len(single_sequence)

858434

In [5]:
# Vocabulary of the raw corpus: de-duplicate the word sequence through a set.
# The order of the resulting list is arbitrary (set iteration order).
unique_words = list(set(single_sequence))
len(unique_words)

230891

#### Raw Dataset (Simple Tokenizer)
```
Total number of words: 858434
Total number of unique words: 230891
Percentage of unique words among all words: 26.9%
```

A lower percentage of unique words means more repetition in the corpus.

### Annotated Dataset

In [6]:
# Load the annotated dataset and keep only the words of each sentence.
# Forward slashes keep this relative path usable on every OS; a backslash
# path only resolves on Windows.
# NOTE: pickle.load can execute arbitrary code; only open pickles from a trusted source.
with open('../AnnotatedDatasetParsing/full_dataset_131.pickle', 'rb') as file:
    obj = pickle.load(file)

# One flat word list per sentence: sentence -> chunks -> words -> kannada_word.
ad_sentences = [
    [
        word.kannada_word
        for chunk in sentence.list_of_chunks
        for word in chunk.list_of_words
    ]
    for sentence in obj
]

len(ad_sentences)

6318

In [7]:
# Flatten the annotated sentences into a single sequence of words.
single_sequence_annotated = [
    token
    for sentence_tokens in ad_sentences
    for token in sentence_tokens
]
len(single_sequence_annotated)

103475

In [8]:
# Vocabulary of the annotated dataset: de-duplicate the word sequence through a set.
# The order of the resulting list is arbitrary (set iteration order).
unique_annotated_words = list(set(single_sequence_annotated))
len(unique_annotated_words)

14690

#### Annotated Dataset
```
Total number of words: 103475
Total number of unique words: 14690
Percentage of unique words among all words: 14.2%
```

### Number of common words between the annotated dataset and the raw dataset

In [9]:
# Annotated-vocabulary words that never occur in the raw-corpus vocabulary.
len(set(unique_annotated_words).difference(unique_words))

8210

In [10]:
# Words shared by the raw-corpus vocabulary and the annotated vocabulary.
len(set(unique_words).intersection(unique_annotated_words))

6480

# Using special character tokenizer

In [11]:
# Load the raw corpus as tokenized by the special-character tokenizer.
# NOTE: pickle.load can execute arbitrary code; only open pickles from a trusted source.
with open('delimited_sentences.pickle', 'rb') as file:
    delimited_sentences = pickle.load(file)
# Sentence count of the loaded corpus.
len(delimited_sentences)

99766

In [12]:
# Flatten the delimited sentences into a single sequence of tokens.
delimited_single_sequence = [
    token
    for sentence_tokens in delimited_sentences
    for token in sentence_tokens
]
len(delimited_single_sequence)

1082609

In [13]:
# Vocabulary of the delimited corpus: de-duplicate the token sequence through a set.
# The order of the resulting list is arbitrary (set iteration order).
delimited_unique_words = list(set(delimited_single_sequence))
len(delimited_unique_words)

186852

#### Raw Dataset (Special Symbol Tokenizer)
```
Total tokens after also splitting off special characters: 858434 --> 1082609
Unique tokens after also splitting off special characters: 230891 --> 186852
```

In [14]:
# Delimited-corpus vocabulary words that also appear in the annotated vocabulary.
len(set(delimited_unique_words).intersection(unique_annotated_words))

6914

In [15]:
from collections import Counter
from pprint import pprint

# Occurrence count of every token in the delimited corpus.
word_frequency = Counter(delimited_single_sequence)

# The 30 most frequent tokens as (token, count) pairs, highest count first.
top_words = word_frequency.most_common(30)
pprint(top_words)

[('.', 98070),
 (',', 43269),
 ('?', 10246),
 ('“', 9779),
 ('”', 9357),
 ('\u200c', 8811),
 ('"', 6246),
 ('ಈ', 6201),
 ('’', 5600),
 ('ಎಂದು', 5564),
 ('‘', 5491),
 ('ಆ', 5442),
 ('!', 5030),
 ('-', 4854),
 ('ಒಂದು', 4641),
 ('ತನ್ನ', 3765),
 ('ಮೇಲೆ', 3583),
 ('ಮತ್ತು', 3566),
 ('ನನ್ನ', 3131),
 ('ನಾನು', 3075),
 ('ಆದರೆ', 2628),
 ('ಅವರ', 2550),
 ('ಅಂತ', 2414),
 ('ಅವನ', 2404),
 ('ಅವರು', 2384),
 ('…', 2212),
 ('ಅದು', 1945),
 ('ತಮ್ಮ', 1941),
 ('ಇಲ್ಲ', 1813),
 ('ಹೋಗಿ', 1773)]


# Comparing with fastText unique words

In [16]:
# Read the fastText word list: one entry per line, surrounding whitespace stripped.
with open('../Embeddings/all_words.txt', 'r', encoding='utf-8') as f:
    fastText_words = [line.strip() for line in f]
len(fastText_words)

1600000

In [17]:
# Peek at the first ten entries read from all_words.txt.
fastText_words[:10]

['.', ',', '</s>', "'", '-', 'ಮತ್ತು', ')', '(', '}', ':']

In [18]:
# Tokens of the delimited-corpus vocabulary that also appear in the fastText word list.
# The order of the resulting list is arbitrary (set iteration order).
fastText_raw_common = list(set(delimited_unique_words) & set(fastText_words))
len(fastText_raw_common)

91590

```
Size of fastText Embeddings: 1600000
Size of unique words in raw dataset: 186852

Common available embeddings among the two: 91_590.  (49.02% of the unique tokens in the raw dataset)
Need embeddings for: 95262 tokens

91_590 out of 1_86_852 have embeddings in fastText. 
```

```
What to do about the remaining unknown embeddings?
    1. Train new set of embeddings
    2. Stemming  (Prefix matching)
            Needs further investigation (recovering the correct word embedding from a stem requires the word's POS tag)
```

In [19]:
# Sample ten shared tokens; the list was built from a set, so which ten appear is arbitrary.
fastText_raw_common[:10]

['ದಿಟ್ಟಿಸುತ್ತಿದ್ದರು',
 'ಬಟ್ಟ',
 'ಪರಿಗಣಿಸಲಾಗುವುದೆಂದು',
 'ಭಾರವೆಲ್ಲ',
 'ಗುಳಿಗೆಯೊಂದನ್ನು',
 'ಮಟ್ಟಿನಲ್ಲಿ',
 'ಯಾರ್ನೂ',
 'ಹೊಳೆದಂತಾಗಿ',
 'ಕಡಲಲ್ಲಿ',
 'ದೊರಕಲಿಲ್ಲ']

In [20]:
# Out-of-vocabulary tokens: delimited-corpus vocabulary minus the tokens found in fastText.
# The order of the resulting list is arbitrary (set iteration order).
oov = list(set(delimited_unique_words) - set(fastText_raw_common))
len(oov)

95262

In [21]:
# Sample ten out-of-vocabulary tokens; the list was built from a set, so which ten appear is arbitrary.
oov[:10]

['ಪ್ರಾತಿನಿಧ್ಯವನ್ನೂ',
 'ಚಲಿಸುತ್ತಿದ್ದುವು',
 'ತಪ್ಪಿತಸ್ಥಳಾಗಿ',
 'ಆದುದೇಕೆಂದು',
 'ಬಂದಾರೇ',
 'ಇಂಬಳಗಳನ್ನೆಲ್ಲ',
 'ವಾಲಾಡುತ್ತಿದ್ದ',
 'ಉಮ್ಮತ್ತೂರಿನಿಂದ',
 'ಸರಿಕಾಣದೆ',
 'ಮಾಡ್ಸುಕ್']

#### What is the frequency distribution of words for which embeddings are unknown? (In the raw data)

In [None]:
# TODO: build the OOV frequency distribution by applying collections.Counter to
# delimited_single_sequence and looking up each word in oov.

In [23]:
# Sizes of the two inputs to the OOV frequency count:
# (number of distinct OOV tokens, total number of tokens in the delimited corpus).
len(oov), len(delimited_single_sequence)

(95262, 1082609)

In [None]:
from collections import Counter

# Map every OOV word to its number of occurrences in the delimited corpus.
# A single Counter pass over the token sequence replaces one list.count() scan
# per OOV word, which costs len(oov) * len(delimited_single_sequence) comparisons.
token_counts = Counter(delimited_single_sequence)
oov_word_counts = {word: token_counts[word] for word in oov}

len(oov_word_counts)

In [24]:
from collections import Counter
# Occurrence count of every token in the delimited corpus.
# NOTE(review): this displays the entire Counter (one entry per distinct token);
# consider .most_common(k) to keep the cell output readable.
Counter(delimited_single_sequence)

Counter({'.': 98070,
         ',': 43269,
         '?': 10246,
         '“': 9779,
         '”': 9357,
         '\u200c': 8811,
         '"': 6246,
         'ಈ': 6201,
         '’': 5600,
         'ಎಂದು': 5564,
         '‘': 5491,
         'ಆ': 5442,
         '!': 5030,
         '-': 4854,
         'ಒಂದು': 4641,
         'ತನ್ನ': 3765,
         'ಮೇಲೆ': 3583,
         'ಮತ್ತು': 3566,
         'ನನ್ನ': 3131,
         'ನಾನು': 3075,
         'ಆದರೆ': 2628,
         'ಅವರ': 2550,
         'ಅಂತ': 2414,
         'ಅವನ': 2404,
         'ಅವರು': 2384,
         '…': 2212,
         'ಅದು': 1945,
         'ತಮ್ಮ': 1941,
         'ಇಲ್ಲ': 1813,
         'ಹೋಗಿ': 1773,
         'ಬಂದು': 1745,
         'ನಿನ್ನ': 1692,
         'ನೀನು': 1689,
         'ಹೇಳಿ': 1683,
         'ಅವನು': 1658,
         'ಮಾಡಿ': 1633,
         'ಎಂಬ': 1575,
         ')': 1560,
         'ಅವಳ': 1535,
         'ನಮ್ಮ': 1532,
         ';': 1445,
         '(': 1444,
         'ಈಗ': 1392,
         'ಮತ್ತೆ': 1361,
         'ತಾನು': 1297,
         'ಮನೆ