In [40]:
import pickle

# Using simple tokenizer

In [41]:
# Read the pickled raw corpus from disk into `all_sentences`.
# NOTE: pickle.load can execute arbitrary code - only open pickles from a trusted source.
with open('raw_sentences.pickle', 'rb') as raw_pickle:
    all_sentences = pickle.load(raw_pickle)


In [42]:
# Number of top-level entries (sentences) loaded from raw_sentences.pickle.
len(all_sentences)

99766

```Total number of sentences: 99766```

In [43]:
# Flatten the corpus: every word of every sentence, in order, duplicates kept.
single_sequence = [
    token
    for sentence in all_sentences
    for token in sentence
]
# Total word count of the raw dataset.
len(single_sequence)

858434

In [44]:
# Deduplicate the flat word list; the list order is whatever the set yields, so it is arbitrary.
unique_words = list(set(single_sequence))
# Vocabulary size (distinct words) of the raw dataset.
len(unique_words)

230891

#### Raw Dataset
```
Total number of words: 858434
Total number of unique words: 230891
Percentage of unique words among all words: 26.9%
```

Lower percentage = more repetition

### Annotated Dataset

In [45]:
# Load the pickled annotated dataset.
# Forward slashes keep this relative path usable on Windows, Linux and macOS alike
# (the original backslash-separated path only resolved on Windows).
# NOTE: pickle.load can execute arbitrary code - only open pickles from a trusted source.
with open('../AnnotatedDatasetParsing/full_dataset_131.pickle', 'rb') as file:
    obj = pickle.load(file)

# Flatten each annotated sentence into a plain list of its words:
# sentence -> list_of_chunks -> list_of_words -> kannada_word.
ad_sentences = [
    [
        word.kannada_word
        for chunk in sentence.list_of_chunks
        for word in chunk.list_of_words
    ]
    for sentence in obj
]

# Number of annotated sentences.
len(ad_sentences)

6318

In [48]:
# Flatten the annotated sentences into one word list, duplicates kept.
single_sequence_annotated = [
    token
    for sentence in ad_sentences
    for token in sentence
]
# Total word count of the annotated dataset.
len(single_sequence_annotated)

103475

In [50]:
# Deduplicate the annotated word list; the list order comes from the set and is arbitrary.
unique_annotated_words = list(set(single_sequence_annotated))
# Vocabulary size (distinct words) of the annotated dataset.
len(unique_annotated_words)

14690

#### Annotated Dataset
```
Total number of words: 103475
Total number of unique words: 14690
Percentage of unique words among all words: 14.2%
```

### Number of common words between annotated dataset and raw dataset

In [52]:
# How many annotated-vocabulary words never occur in the raw vocabulary.
len(set(unique_annotated_words).difference(unique_words))

8210

In [54]:
# How many words appear in both the raw vocabulary and the annotated vocabulary.
len(set(unique_words).intersection(unique_annotated_words))

6480

# Using special character tokenizer

In [55]:
# Read the pickled corpus produced by the special-character tokenizer.
# NOTE: pickle.load can execute arbitrary code - only open pickles from a trusted source.
with open('delimited_sentences.pickle', 'rb') as delimited_pickle:
    delimited_sentences = pickle.load(delimited_pickle)
# Number of sentences in the delimited corpus.
len(delimited_sentences)

99766

In [56]:
# Flatten the delimited corpus into one token list, duplicates kept.
delimited_single_sequence = [
    token
    for sentence in delimited_sentences
    for token in sentence
]
# Total token count after delimiting special characters.
len(delimited_single_sequence)

1082609

In [57]:
# Deduplicate the delimited token list; the list order comes from the set and is arbitrary.
delimited_unique_words = list(set(delimited_single_sequence))
# Vocabulary size (distinct tokens) after delimiting special characters.
len(delimited_unique_words)

186852

```
Total words after also splitting on special characters: 858434 --> 1082609
Unique words after also splitting on special characters: 230891 --> 186852
```

In [59]:
# How many words appear in both the delimited vocabulary and the annotated vocabulary.
len(set(delimited_unique_words).intersection(unique_annotated_words))

6914

In [75]:
from collections import Counter
from pprint import pprint

# Tally occurrences of every token in the delimited corpus.
word_frequency = Counter(delimited_single_sequence)

# Keep the 20 most frequent (token, count) pairs, highest count first, and pretty-print them.
top_words = word_frequency.most_common(n=20)
pprint(top_words)

[('.', 98070),
 (',', 43269),
 ('?', 10246),
 ('“', 9779),
 ('”', 9357),
 ('\u200c', 8811),
 ('"', 6246),
 ('ಈ', 6201),
 ('’', 5600),
 ('ಎಂದು', 5564),
 ('‘', 5491),
 ('ಆ', 5442),
 ('!', 5030),
 ('-', 4854),
 ('ಒಂದು', 4641),
 ('ತನ್ನ', 3765),
 ('ಮೇಲೆ', 3583),
 ('ಮತ್ತು', 3566),
 ('ನನ್ನ', 3131),
 ('ನಾನು', 3075)]


# Comparing with fastText unique words

In [66]:
# Collect one entry per line of the word-list file, with surrounding whitespace removed.
fastText_words = []

with open('../Embeddings/all_words.txt', 'r', encoding='utf-8') as f:
    fastText_words.extend(raw_line.strip() for raw_line in f)
# Number of entries read from the file.
len(fastText_words)

1600000

In [76]:
# Words of the delimited vocabulary that also occur in the fastText word list.
# set.intersection accepts any iterable, so the fastText list is streamed through
# membership checks instead of first being copied into a second, much larger set.
# The resulting list order is arbitrary (it comes from a set).
fastText_raw_common = list(set(delimited_unique_words).intersection(fastText_words))
len(fastText_raw_common)

91590

```
Size of fastText vocabulary: 1600000
Size of unique words in raw dataset (special character tokenizer): 186852
Words common to both: 91590

91_590 out of 186_852 have embeddings in fastText. (49.02%)

TODO: check whether stemming recovers more matches (but getting the correct word embedding from a stem requires the word's POS tag)
```

```
What to do about the remaining unknown embeddings?
    1. Train new set of embeddings
    2. Stemming  (Prefix matching)
```

In [77]:
# Peek at ten of the words found in both the corpus vocabulary and the fastText list
# (the list was built from a set, so which ten appear is arbitrary).
fastText_raw_common[:10]

['ಶಾರದೆಗೆ',
 'ಒಪ್ಪಿಕೊಂಡಿಲ್ಲ',
 'ಕೇಂದ್ರದ',
 'ಪರಿಣಾಮವೇ',
 'ಘರ್ಷಣೆಗೂ',
 'ತಟ್ಟಿಕೊಂಡ',
 'ಎನ್ನುವವರು',
 'ಮುಸುಕೆಳೆದು',
 'ಸಂಪಾದಿಸಿದ್ದ',
 '೪೬']

In [84]:
# Out-of-vocabulary words: delimited-vocabulary words that are not among the
# words shared with the fastText list. List order is arbitrary (it comes from a set).
oov = list(set(delimited_unique_words) - set(fastText_raw_common))
# Number of out-of-vocabulary words.
len(oov)

95262

In [85]:
# Peek at ten out-of-vocabulary words (arbitrary selection, since the list came from a set).
oov[:10]

['ಆಳುಗಳೆಲ್ಲಾ',
 'ಕರ್ಚುಮಾಡಿ',
 'ಅರ್ತಿಯುಳ್ಳವರೆಲ್ಲರ',
 'ನೀವೇನಾಗ್ಬೇಕು',
 'ಮಾಡಾಕಾಗ್ತದಪ್ಪಾ',
 'ಕಲಿಸ್ತೆ',
 'ಸಹಾಯಕವಾಗುವುದೇ',
 'ಕಜ್ಜಾಯದೂಟವನ್ನೇ',
 'ಮಾಯ್ನಹಣ್',
 'ಕರೆದಮ್ಯಾಲೆ']