In [1]:
import csv

In [2]:
# Accumulates one dict per CSV row, across all languages.
stories = list()

In [3]:
# Names given to CSV fields 1..10 of every row; field 0 is not read.
story_fields = (
    'story_id',
    'title',
    'english_title',
    'reading_level_updated',
    'story_language',
    'synopsis',
    'content',
    'category_name',
    'tag_name',
    'story_original_title',
)

# encoding: the stories contain Devanagari text, so do not rely on the
#   platform default encoding (assumes the file is UTF-8 — TODO confirm).
# newline='': required by the csv module so that line breaks embedded in
#   quoted fields (story content) are handed to the parser untouched.
with open('stories.csv', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for story in reader:
        # Explicit indexing (rather than zip) keeps the IndexError on short rows.
        stories.append({name: story[position] for position, name in enumerate(story_fields, start=1)})
    

`stories` currently holds the stories in every language. Next, we keep only the stories written in Hindi.

In [4]:
# Keep only the rows whose language field is exactly 'Hindi'.
stories_hindi = [story for story in stories if story['story_language'] == 'Hindi']

In [5]:
# Number of stories whose language field is 'Hindi'.
len(stories_hindi)

920

In [6]:
from cltk.corpus.utils.importer import CorpusImporter

Using tokenizer based on rules written by LTRC.

In [7]:
# Corpus importer for Hindi. `c` is not referenced again in the cells shown here.
# NOTE(review): presumably kept for downloading CLTK Hindi data — confirm or remove.
c = CorpusImporter('hindi')

In [8]:
from cltk.tokenize.sentence import TokenizeSentence

Loading the Hindi tokenizer.

In [9]:
# Tokenizer instance reused for every Hindi story further down.
hindi_tokenizer = TokenizeSentence('hindi')

Testing the tokenizer.

In [10]:
# Raw text of one Hindi story (index 6), used as the tokenizer test input.
stories_hindi[6]['content']

'पहलवान जी।  दस किलो दूध पी जाते।  पचास रोटी खाते।  100 किलो वज़न उठाते।  खूब अकड़ कर चलते।  सब बच्चों पर हुक्म चलाते।   एक दिन गप्पू बोला, “मुझसे कुश्ती लड़ोगे?” \u2003 पहलवान हँसा। फिर मान गया।  दोनों ने ताल ठोंकी।  पहलवान ने गप्पू को दबोच लिया।  गप्पू ने उसके पेट में गुदगुदी लगायी।  ही...ही...ही... पहलवान उछल पड़ा।  गप्पू उसे नचाने लगा।  पहलवान की हालत खराब। मैदान छोड़ भाग लिया।  अब बच्चे उससे नहीं डरते। वह बच्चों से डरता है। '

In [11]:
# Tokenize the sample story. Despite the class name TokenizeSentence, the
# result displayed below is a flat list of word and punctuation tokens.
six_tokenized = hindi_tokenizer.tokenize(stories_hindi[6]['content'])

In [12]:
# Display the tokens so the tokenizer output can be checked by eye.
six_tokenized

['पहलवान',
 'जी',
 '।',
 'दस',
 'किलो',
 'दूध',
 'पी',
 'जाते',
 '।',
 'पचास',
 'रोटी',
 'खाते',
 '।',
 '100',
 'किलो',
 'वज़न',
 'उठाते',
 '।',
 'खूब',
 'अकड़',
 'कर',
 'चलते',
 '।',
 'सब',
 'बच्चों',
 'पर',
 'हुक्म',
 'चलाते',
 '।',
 'एक',
 'दिन',
 'गप्पू',
 'बोला',
 ',',
 '“मुझसे',
 'कुश्ती',
 'लड़ोगे',
 '?',
 '”',
 '\u2003',
 'पहलवान',
 'हँसा',
 '।',
 'फिर',
 'मान',
 'गया',
 '।',
 'दोनों',
 'ने',
 'ताल',
 'ठोंकी',
 '।',
 'पहलवान',
 'ने',
 'गप्पू',
 'को',
 'दबोच',
 'लिया',
 '।',
 'गप्पू',
 'ने',
 'उसके',
 'पेट',
 'में',
 'गुदगुदी',
 'लगायी',
 '।',
 'ही',
 '.',
 '.',
 '.',
 'ही',
 '.',
 '.',
 '.',
 'ही',
 '.',
 '.',
 '.',
 'पहलवान',
 'उछल',
 'पड़ा',
 '।',
 'गप्पू',
 'उसे',
 'नचाने',
 'लगा',
 '।',
 'पहलवान',
 'की',
 'हालत',
 'खराब',
 '।',
 'मैदान',
 'छोड़',
 'भाग',
 'लिया',
 '।',
 'अब',
 'बच्चे',
 'उससे',
 'नहीं',
 'डरते',
 '।',
 'वह',
 'बच्चों',
 'से',
 'डरता',
 'है',
 '।']

Working well!

In [13]:
# Attach the token list of every Hindi story to its dict under 'tok_content'.
for story in stories_hindi:
    story['tok_content'] = hindi_tokenizer.tokenize(story['content'])

In [14]:
# Maps a reading-level label to the tokenized stories of that level.
reading_levels = dict()

In [15]:
# Create (or reset) one empty bucket per reading level found in the Hindi stories.
reading_levels.update({story['reading_level_updated']: [] for story in stories_hindi})

In [16]:
# Show which reading-level labels were found (all buckets are still empty).
reading_levels

{'L1': [], 'L2': [], 'L3': [], 'L4': []}

This gives us the number of reading levels - in case it's not fixed.

In [17]:
# Number of distinct reading levels; not referenced again in the cells shown here.
num_levels = len(reading_levels)

In [18]:
# File every story's token list under its reading level.
for story in stories_hindi:
    level_bucket = reading_levels[story['reading_level_updated']]
    level_bucket.append(story['tok_content'])

# Now to see what differentiates each difficulty level from the other!

### Firstly, what are the most frequently occurring tokens in each difficulty level?

In [19]:
import nltk

In [31]:
# For each reading level, merge the token lists of all its stories into one
# flat list. Prints the story count and then the total token count per level.
# At this stage all_counts maps level -> flat token list; a later cell
# replaces each list with a Counter.
all_counts = {}
for level, tokenized_stories in reading_levels.items():
    print(len(tokenized_stories))
    all_tokens = []
    for story_tokens in tokenized_stories:
        # extend() grows the list in place; `all_tokens = all_tokens + ...`
        # re-copied the whole accumulated list for every story.
        all_tokens.extend(story_tokens)
    print(len(all_tokens))
    all_counts[level] = all_tokens

392
98659
303
171488
137
199805
88
251993


In [32]:
# Total number of tokens collected for level L1 (all_counts holds plain token lists here).
len(all_counts['L1'])

98659

In [34]:
from collections import Counter
# Replace every level's flat token list with a token -> frequency Counter.
# Only values of existing keys are reassigned, so iterating items() is safe.
for level, tokens in all_counts.items():
    all_counts[level] = Counter(tokens)

In case it was hard to follow, what has been done so far is that for each difficulty level - L1, L2, L3, and L4 - we have found the count of each word. Now we can see what the top 'x' words are for each level - this way we're checking the difficulty of the lexical items that appear at each level!

In [43]:
# Ten most frequent tokens of level L1, as (token, count) pairs.
all_counts['L1'].most_common(10)

[(';', 9882),
 ('&', 9251),
 ('nbsp', 9227),
 ('.', 3431),
 ('-', 2671),
 ('।', 2611),
 ('है', 1698),
 (',', 1644),
 ('!', 1180),
 ('"', 997)]

In [48]:
# Ten most frequent (token, count) pairs of every level, separated by a marker line.
for level in reading_levels:
    top_ten = all_counts[level].most_common(10)
    print(top_ten)
    print("xxxxxxxxxxxxxx")

[(';', 9882), ('&', 9251), ('nbsp', 9227), ('.', 3431), ('-', 2671), ('।', 2611), ('है', 1698), (',', 1644), ('!', 1180), ('"', 997)]
xxxxxxxxxxxxxx
[(';', 9684), ('&', 8693), ('nbsp', 8676), ('।', 7377), (',', 4054), ('.', 3470), ('-', 3039), ('"', 2681), ('है', 2478), ('के', 2244)]
xxxxxxxxxxxxxx
[('-', 8830), ('।', 7823), (';', 7577), (',', 5188), (':', 4110), ('&', 4064), ('\n', 4011), ('nbsp', 3902), ('.', 3136), ('के', 2880)]
xxxxxxxxxxxxxx
[('-', 13744), (';', 11740), ('।', 9153), (':', 6323), ('\n', 6233), ('&', 6036), ('nbsp', 5789), (',', 5162), ('के', 3934), ('.', 3771)]
xxxxxxxxxxxxxx


As can be seen, the first 10 tokens are pretty much the same across all 4 levels - this is because in any large text corpus, the stop words, punctuation and special signs will always rank highest in frequency. Note that `&`, `nbsp` and `;` are pieces of the HTML entity `&nbsp;` left over in the story text, not real words.

In [49]:
# Same listing as before, widened to the 20 most frequent tokens per level.
for level in reading_levels:
    level_counter = all_counts[level]
    print(level_counter.most_common(20))
    print("xxxxxxxxxxxxxx")

[(';', 9882), ('&', 9251), ('nbsp', 9227), ('.', 3431), ('-', 2671), ('।', 2611), ('है', 1698), (',', 1644), ('!', 1180), ('"', 997), ('के', 896), ('\n', 815), ('ने', 780), ('से', 761), ('?', 735), ('और', 727), ('में', 685), ('को', 679), (':', 679), ('हैं', 671)]
xxxxxxxxxxxxxx
[(';', 9684), ('&', 8693), ('nbsp', 8676), ('।', 7377), (',', 4054), ('.', 3470), ('-', 3039), ('"', 2681), ('है', 2478), ('के', 2244), ('और', 1931), ('!', 1915), ('से', 1895), ('में', 1871), ('ने', 1641), ('को', 1561), ('\n', 1472), ('की', 1446), (':', 1349), ('हैं', 1233)]
xxxxxxxxxxxxxx
[('-', 8830), ('।', 7823), (';', 7577), (',', 5188), (':', 4110), ('&', 4064), ('\n', 4011), ('nbsp', 3902), ('.', 3136), ('के', 2880), ('है', 2858), ('में', 2498), ('"', 2456), ('font', 2292), ('और', 2253), ('से', 2224), ('की', 1984), ('mso', 1864), ('हैं', 1648), ('को', 1631)]
xxxxxxxxxxxxxx
[('-', 13744), (';', 11740), ('।', 9153), (':', 6323), ('\n', 6233), ('&', 6036), ('nbsp', 5789), (',', 5162), ('के', 3934), ('.', 3771

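Due to the size of the corpus, as well as the nature of the Hindi language (lots of lexical items added for extra information (tha, hain, se, etc.)), even the top 20 tokens are dominated by function words and punctuation, so we need to look further down the frequency ranking.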

In [54]:
# Tokens ranked 61st to 70th by frequency in every level.
for level in reading_levels:
    ranked = all_counts[level].most_common(70)
    print(ranked[60:])
    print("xxxxxxxxxxxxxx")

[('उस', 143), ('ही', 140), ('2', 138), ('कुछ', 137), ('हम', 137), ('दिया', 136), ('रहा', 133), ('घर', 131), ('ये', 130), ('देखा', 125)]
xxxxxxxxxxxxxx
[('अब', 314), ('साथ', 312), ('mso', 307), ('माँ', 301), ('पास', 299), ('अपनी', 291), ('उसके', 290), ('फिर', 284), ('रहे', 276), ('family', 276)]
xxxxxxxxxxxxxx
[('रहा', 320), ('मैं', 317), ('साथ', 317), ('तरह', 316), ('फिर', 312), ('अपनी', 310), ('1', 306), ('रही', 301), ('इस', 300), ('@', 299)]
xxxxxxxxxxxxxx
[('मैं', 457), ('”', 456), ('रहा', 444), ('3', 442), ('इस', 426), ('कहा', 411), ('size', 399), ('रहे', 394), ('@', 390), ('साथ', 384)]
xxxxxxxxxxxxxx


On initial analysis, it does not look like non-stop words make a difference, at least those that are higher in frequency.

In [57]:
# Mid-frequency sample: tokens ranked 1591st to 1600th in every level.
for level in reading_levels:
    ranked = all_counts[level].most_common(1600)
    mid_frequency_slice = ranked[1590:]
    print(mid_frequency_slice)
    print("xxxxxxxxxxxxxx")

[('अंकल', 4), ('\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nएक', 4), ('पट्टी', 4), ('\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nमैं', 4), ('हँसती', 4), ('टुक', 4), ('मूँछ', 4), ('स\u200dे', 4), ('चिड़ियाघर', 4), ('इसी', 4)]
xxxxxxxxxxxxxx
[('भई', 9), ('बैंगन', 9), ('छी', 9), ('बचने', 9), ('हाल', 9), ('गड़बड़', 9), ('ऐनक', 9), ('आपका', 9), ('जमा', 9), ('the', 9)]
xxxxxxxxxxxxxx
[('टुकड़ा', 10), ('छा', 10), ('अणुओं', 10), ('परत', 10), ('अणु', 10), ('ख़याल', 10), ('वर्ष', 10), ('घूम', 10), ('टूट', 10), ('मना', 10)]
xxxxxxxxxxxxxx
[('Paragraph\\', 12), ('रुप', 12), ('जीत', 12), ('चलाने', 12), ('स्तर', 12), ('लंबी', 12), ('शोला', 12), ('गुब्बारा', 12), ('प्रदूषण', 12), ('बालू', 12)]
xxxxxxxxxxxxxx


Same with medium-frequency words. Let's move on to the cream of the language - the words with unit frequency. As can be seen from the trend in word frequency, this data very much follows Zipf's law too (https://simple.wikipedia.org/wiki/Zipf%27s_law).

In [61]:
# Number of distinct tokens in level L1 (all_counts['L1'] is a Counter at this point).
len(all_counts['L1'])

7587

In [62]:
# Low-frequency sample: tokens ranked 6991st to 7000th in every level.
# NOTE(review): the rank window is fixed while vocabulary sizes differ per
# level, so in the recorded output one level still shows count-2 tokens here.
for level in reading_levels:
    ranked = all_counts[level].most_common(7000)
    print(ranked[6990:])
    print("xxxxxxxxxxxxxx")

[('लगाये', 1), ('घुमाये', 1), ('मेगी', 1), ('\n\nमुस्कराती', 1), ('भी\n\nमुस्कराते', 1), ('\nम्याऊँ', 1), ('सूची1', 1), ('बड़ा2', 1), ('नीचे3', 1), ('चमकीला4', 1)]
xxxxxxxxxxxxxx
[('\n\nपेंसिल', 1), ('दिखला', 1), ('\n\nसकती', 1), ('है\n\n\n\nकई', 1), ('\n\nपत्ते', 1), ('गमले', 1), ('\n\nरेखा', 1), ('\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nरेल', 1), ('चली\n\n', 1), ('\n\nछुक', 1)]
xxxxxxxxxxxxxx
[('छोटे\nबच्चे', 1), ('पकड़ाए', 1), ('अब\nसब', 1), ('\n\nरोज़', 1), ('कुछ\nकाम', 1), ('बढ़ता\nही', 1), ('योग्यता', 1), ('समाते', 1), ('बजे\nउसे', 1), ('पतंगें\nमायूस', 1)]
xxxxxxxxxxxxxx
[('छोड़ना', 2), ('पारदर्शी', 2), ('अपमानित', 2), ('\n\nसर्प', 2), ('सिखाना', 2), ('सप्ताह', 2), ('पूर्णमासी', 2), ('खोए', 2), ('हटते', 2), ('बचाई', 2)]
xxxxxxxxxxxxxx


The difference in difficulty of words can be seen here, with words of lower frequency. From world knowledge, it can be said that words like 'मुस्कराती' are used way more commonly than words like 'अपमानित'.