In [27]:
"""ATTENTION:

This notebook relies on the cleaned English data produced by running clean_english_dataset.py. The only prerequisite is that the data contains no invalid or non-English tokens.
"""
import pandas as pd
import nltk
import numpy as np
import os
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from gensim.parsing.preprocessing import STOPWORDS
from gensim.test.utils import datapath, get_tmpfile
from gensim.utils import simple_preprocess, lemmatize



### Load data for story content and their tags
#### Note: Download the data from this Google Drive folder: https://drive.google.com/drive/u/0/folders/1vfCX_Zlnx1YyL_1uDBK6ZA6mdBPLUpaV

In [4]:
def print_rows(matrix, n_rows=5):
    """Print the first ``n_rows`` rows of ``matrix``, one row per line.

    Args:
        matrix: Sliceable sequence of rows (e.g. a list of token lists).
        n_rows: Maximum number of rows to print. Defaults to 5.

    If ``matrix`` holds fewer than ``n_rows`` rows, all of them are printed.
    A slice is used instead of ``matrix[i]`` so that a short (or empty) input
    does not raise ``IndexError``.
    """
    # max(..., 0) keeps a negative n_rows printing nothing, as range() did before.
    for row in matrix[:max(n_rows, 0)]:
        print(row)
        
def pos_tokenize_document(doc):
    """Turn a raw document into one flat list of lemmatized tokens.

    The text is split with gensim's ``simple_preprocess``. Tokens that are in
    ``STOPWORDS`` (or are empty strings) are dropped, and each remaining token
    is passed to gensim's ``lemmatize``. The per-token result lists are
    concatenated, in document order, into the returned list.
    """
    tagged_tokens = []
    for token in simple_preprocess(doc):
        if token in STOPWORDS or len(token) == 0:
            continue
        # lemmatize() yields a list per token; extend() flattens as we go
        tagged_tokens.extend(lemmatize(token))
    return tagged_tokens
    
    
# POS tags kept by get_noun_and_adjective(); built once instead of once per token.
_NOUN_TAGS = ('NN', 'NNS', 'NNP', 'NNPS')
_ADJECTIVE_TAGS = ('JJ', 'JJR', 'JJS')
_NOUN_AND_ADJECTIVE_TAGS = frozenset(_NOUN_TAGS + _ADJECTIVE_TAGS)


def get_noun_and_adjective(doc):
    """Return the nouns and adjectives of ``doc`` as a list of lemmatized words.

    Args:
        doc: Raw document text.

    Returns:
        The words, in document order, whose POS tag is one of the noun or
        adjective tags listed above. Duplicates are kept.

    Each token from ``pos_tokenize_document`` is a UTF-8 byte string of the
    form ``word/POS``.
    """
    # rsplit with maxsplit=1 always separates the trailing tag, so a word that
    # itself contains '/' cannot break the two-value unpacking below.
    tagged = (
        token.decode('utf-8').rsplit('/', 1)
        for token in pos_tokenize_document(doc)
    )
    return [word for word, pos in tagged if pos in _NOUN_AND_ADJECTIVE_TAGS]
    

In [5]:
# --- Configuration -----------------------------------------------------------
DATA_DIR = "../../data/raw/"
DATA_FILE = "stories_content_english_corrected.csv"
TAGS_FILE = "stories_tags_all.csv"

# English vocabulary from the NLTK `words` corpus (the corpus must be downloaded).
words = set(nltk.corpus.words.words())

# --- Load --------------------------------------------------------------------
# Stories: rows missing either the text or the publishing type are discarded.
stories_data = pd.read_csv(DATA_DIR + DATA_FILE).dropna(
    subset=['story_content', 'story_publishing_type']
)
# Tags: rows missing either the tag name or the story id are discarded.
tags_data = pd.read_csv(DATA_DIR + TAGS_FILE).dropna(
    subset=['story_tag_name', 'story_id']
)

# --- Split by publishing type ------------------------------------------------
is_publisher_story = stories_data['story_publishing_type'] == 'Publisher Story'
publisher_stories_data = stories_data.loc[is_publisher_story]
non_publisher_stories_data = stories_data.loc[~is_publisher_story]

# Tag rows belonging to each of the two story groups.
publisher_tags_data = tags_data[
    tags_data['story_id'].isin(publisher_stories_data['story_id'])
]
non_publisher_tags_data = tags_data[
    tags_data['story_id'].isin(non_publisher_stories_data['story_id'])
]

# --- Summary counts ----------------------------------------------------------
N_TOTAL_STORIES = len(stories_data)
N_PUBLISHER_STORIES = len(publisher_stories_data)
N_NON_PUBLISHER_STORIES = len(non_publisher_stories_data)
N_TAG_ROWS = len(tags_data)
DF_TAG_LABEL = 'story_tag_name'

print("Total stories: ", N_TOTAL_STORIES)
print("Publisher stories: ", N_PUBLISHER_STORIES)
print("Non-publisher stories: ", N_NON_PUBLISHER_STORIES)
print("Tags rows in tags-file: ", N_TAG_ROWS)

# Show the text of the first publisher story as a quick sanity check.
publisher_stories_data.iloc[0]['story_content']

Total stories:  2026
Publisher stories:  311
Non-publisher stories:  1715
Tags rows in tags-file:  35092


"A fawn was racing in the forest. He was ahead of the rabbit. He was ahead of the elephant. He leapt and cleared the stream. He ran past the crumbling wall. There was a large boulder on the grassy plain. He stumbled and fell down. He burst into tears. The monkey massaged his leg. Tears flowed from the fawn's eyes. Brother Bear picked him up. The fawn didn't stop crying. His mother came. She said, “Look, we’ll beat up this bad boulder!” The fawn said, “Oh, don’t do that or he will also start crying.” His mother laughed. So did the fawn."

### We will use non-publisher stories for training, and publisher stories for validation testing

In [6]:
#TODO: expand apostrophe words
# Non-publisher stories are stacked first and publisher stories after them,
# so the row order of the token list follows that same order.
all_stories_data = pd.concat([non_publisher_stories_data, publisher_stories_data], axis=0)

# One list of noun/adjective tokens per story, in row order.
all_tokens_data = [
    get_noun_and_adjective(story_text)
    for story_text in all_stories_data['story_content']
]

print_rows(all_tokens_data, 2)
print(len(all_tokens_data))

['dog', 'bird', 'dog', 'man', 'bird', 'man', 'swell', 'anger', 'poor', 'bird', 'lesson', 'lesson', 'understood', 'chasing', 'bird', 'good', 'dog']
['gulli', 'grandpa', 'able', 'newspaper', 'glass', 'gulli', 'run', 'little', 'brown', 'box', 'thing', 'clink', 'clonk', 'dadum', 'dum', 'glass', 'bigger', 'gulli', 'grandpa', 'newspaper', 'gulli', 'uncle', 'mangal', 'chacha', 'trouble', 'kitchen', 'pour', 'oil', 'pack', 'bottle', 'neck', 'small', 'clink', 'clonk', 'dadum', 'dum', 'gulli', 'time', 'big', 'wide', 'mouth', 'pour', 'drop', 'drop', 'mangal', 'chacha', 'thank', 'gulli', 'gulli', 'grandma', 'able', 'grandpa', 'monkey', 'cap', 'needle', 'floor', 'clink', 'clonk', 'dadum', 'dum', 'time', 'magnet', 'metal', 'stick', 'grandma', 'needle', 'stick', 'magnet', 'metal', 'thing', 'stick', 'grandma', 'monkey', 'cap', 'isn', 'gulli', 'brown', 'box', 'thing', 'nice']
2026


In [29]:
## Story data
# all_tokens_data follows the row order of
# concat([non_publisher_stories_data, publisher_stories_data]), so the first
# N_NON_PUBLISHER_STORIES entries are the training stories and the rest are the
# held-out publisher stories.
train_data = all_tokens_data[:N_NON_PUBLISHER_STORIES]
test_data = all_tokens_data[N_NON_PUBLISHER_STORIES:]

## Story-ids
# reset_index(drop=True) gives the id Series a 0..n-1 index so that position i
# in train_data / test_data corresponds to label i in the id Series. Without
# it the Series keeps the row labels of the original CSV and any lookup by
# position (e.g. Series.get(i)) returns the wrong story or None.
train_story_ids = (
    non_publisher_stories_data['story_id']
    .replace('', np.nan)
    .dropna()
    .reset_index(drop=True)
)
test_story_ids = (
    publisher_stories_data['story_id']
    .replace('', np.nan)
    .dropna()
    .reset_index(drop=True)
)

# If dropna() removed any id, ids and documents no longer line up one-to-one.
assert len(train_story_ids) == len(train_data), "train ids/documents are misaligned"
assert len(test_story_ids) == len(test_data), "test ids/documents are misaligned"


#Use training data to build the dictionary for LDA model
id2word = Dictionary(train_data)
print(id2word)

# Words that are not in the training dictionary are ignored by doc2bow.
train_data_bow = [id2word.doc2bow(doc) for doc in train_data]
test_data_bow = [id2word.doc2bow(doc) for doc in test_data]

##Save the dictionary too
DICTIONARY_FILE_LOCATION = "../../data/id2word_dictionary"
OS_PATH_TO_DICTIONARY_FILE = os.path.abspath(os.path.join(os.getcwd(), DICTIONARY_FILE_LOCATION))

# The path is already absolute. gensim.test.utils.datapath() is a helper for
# gensim's bundled test data and only worked here because os.path.join
# discards its prefix when given an absolute path, so it is not used.
id2word.save_as_text(OS_PATH_TO_DICTIONARY_FILE)


Dictionary(12253 unique tokens: ['anger', 'bird', 'chasing', 'dog', 'good']...)


In [8]:

# Sanity check: corpus sizes (train, then test) followed by the first training
# document in bag-of-words form.
print(len(train_data_bow), len(test_data_bow), sep='\n')
print(train_data_bow[:1])

1715
311
[[(0, 1), (1, 4), (2, 1), (3, 3), (4, 1), (5, 2), (6, 2), (7, 1), (8, 1), (9, 1)]]


In [9]:
story_tags_dict = {}  # story_id -> set of normalized tags for that story
tag_set = set()  # set of all unique tags in the dataset
tag_count = {}   # number of count for each tag

# Group the tag rows once instead of scanning the whole tags table per story.
tags_by_story = tags_data.groupby('story_id')['story_tag_name'].apply(list).to_dict()

for story_id in all_stories_data['story_id']:
    raw_tags = tags_by_story.get(story_id)
    if not raw_tags:
        continue #didn't have tags for this story, so go to next story

    # Normalize FIRST (strip '#', lowercase) and only then filter stop words.
    # Filtering the raw tag let values such as "The" or "#the" slip through.
    normalized_tags = [
        tag.replace('#', '').lower() for tag in raw_tags if isinstance(tag, str)
    ]
    tag_list = [tag for tag in normalized_tags if tag and tag not in STOPWORDS]
    if not tag_list:
        continue # every tag was a stop word or empty, so the story has no usable tags
    story_tags_dict[story_id] = set(tag_list)

    #Update global tag variables (tag_count )
    # A tag repeated within one story is counted once per occurrence.
    for tag in tag_list:
        tag_count[tag] = tag_count.get(tag, 0) + 1
    tag_set |= set(tag_list)

n_stories_with_tags = len(story_tags_dict)
# NOTE: despite the "more_than" names, the thresholds are inclusive (>=),
# which matches the "N times or more" wording printed below.
n_tags_freq_more_than_5 = sum(1 for count in tag_count.values() if count >= 5)
n_tags_freq_more_than_10 = sum(1 for count in tag_count.values() if count >= 10)
n_tags_freq_more_than_15 = sum(1 for count in tag_count.values() if count >= 15)
n_tags_freq_more_than_20 = sum(1 for count in tag_count.values() if count >= 20)

print("Number of stories with tags: ", n_stories_with_tags)
print("Number of unique tags: ", len(tag_set))
print("Number of Tags occuring 5 times or more: ", n_tags_freq_more_than_5)
print("Number of Tags occuring 10 times or more: ", n_tags_freq_more_than_10)
print("Number of Tags occuring 15 times or more: ", n_tags_freq_more_than_15)
print("Number of Tags occuring 20 times or more: ", n_tags_freq_more_than_20)



Number of stories with tags:  1273
Number of unique tags:  2565
Number of Tags occuring 5 times or more:  256
Number of Tags occuring 10 times or more:  92
Number of Tags occuring 15 times or more:  50
Number of Tags occuring 20 times or more:  35


In [10]:
print(publisher_tags_data.columns)

# Map every held-out story id to the Series of its tag names (the Series is
# empty when the tags file has no row for that story).
test_story_id_to_tags = {
    story_id: publisher_tags_data.loc[publisher_tags_data['story_id'] == story_id][DF_TAG_LABEL]
    for story_id in test_story_ids
}
test_story_id_to_tags

Index(['story_id', 'story_title', 'story_english_title', 'story_tag_name'], dtype='object')


{2: Series([], Name: story_tag_name, dtype: object),
 7: Series([], Name: story_tag_name, dtype: object),
 16: Series([], Name: story_tag_name, dtype: object),
 21: Series([], Name: story_tag_name, dtype: object),
 33: Series([], Name: story_tag_name, dtype: object),
 38: Series([], Name: story_tag_name, dtype: object),
 40: Series([], Name: story_tag_name, dtype: object),
 49: Series([], Name: story_tag_name, dtype: object),
 54: Series([], Name: story_tag_name, dtype: object),
 61: Series([], Name: story_tag_name, dtype: object),
 67: Series([], Name: story_tag_name, dtype: object),
 71: Series([], Name: story_tag_name, dtype: object),
 80: Series([], Name: story_tag_name, dtype: object),
 82: Series([], Name: story_tag_name, dtype: object),
 86: Series([], Name: story_tag_name, dtype: object),
 88: Series([], Name: story_tag_name, dtype: object),
 89: Series([], Name: story_tag_name, dtype: object),
 90: Series([], Name: story_tag_name, dtype: object),
 94: Series([], Name: story_ta

## LDA model : 50 topics

In [20]:
PRETRAINED_MODEL_FILE_LOCATION = "../models/lda_model"
OS_PATH_TO_MODEL_FILE = os.path.abspath(os.path.join(os.getcwd(), PRETRAINED_MODEL_FILE_LOCATION))
print(OS_PATH_TO_MODEL_FILE)

# Fixed seed so that retraining gives the same topics on every run.
LDA_RANDOM_SEED = 42

ldamodel_topics_50 = None
# Check for the file explicitly instead of using a bare `except:`. The bare
# except also swallowed KeyboardInterrupt and real load errors (corrupt file,
# version mismatch) and then silently retrained for 20 passes.
if os.path.exists(OS_PATH_TO_MODEL_FILE):
    print("Using pretrained model...")
    # NOTE(review): a cached model was trained against the dictionary that
    # existed at training time; if the training data or id2word has changed
    # since, delete the file to force a retrain - confirm.
    ldamodel_topics_50 = LdaMulticore.load(OS_PATH_TO_MODEL_FILE)
else:
    print("Pretrained model not found. Training the model now...")
    ldamodel_topics_50 = LdaMulticore(
        train_data_bow,
        num_topics=50,
        id2word=id2word,
        passes=20,
        workers=2,
        random_state=LDA_RANDOM_SEED,
    )
    ##Save this for future use
    print("\nSaving the trained model to location " + OS_PATH_TO_MODEL_FILE)
    # Load and save now use the same absolute path. gensim.test.utils.datapath()
    # is meant for gensim's bundled test data and is not needed here.
    os.makedirs(os.path.dirname(OS_PATH_TO_MODEL_FILE), exist_ok=True)
    ldamodel_topics_50.save(OS_PATH_TO_MODEL_FILE)


/Users/goelakas/goelakash/pratham_data/data_v3/PrathamBooks-Sprint-2018/python/models/lda_model
Using pretrained model...


In [21]:
# Topic distribution (list of (topic_id, weight) pairs) for every held-out story.
topic_predictions = [ldamodel_topics_50[bow] for bow in test_data_bow]
print(len(topic_predictions))
print(len(test_story_ids))

# Pair ids and predictions BY POSITION. The previous `test_story_ids.get(i)`
# was a label lookup on a Series that still carried the row labels of the
# original CSV, so it returned the wrong story id, or None when label i was
# absent. Iterating the Series yields its values in order, which is the order
# test_data_bow was built in.
assert len(topic_predictions) == len(test_story_ids), "ids/predictions are misaligned"
story_id_to_topic_predictions = dict(zip(test_story_ids, topic_predictions))



311
311


In [23]:
predicted_story_id_to_tags = {}

for story_id, pred in story_id_to_topic_predictions.items():
    # Score each candidate word by summing t_proba * w_proba over every
    # predicted topic whose show_topic() listing contains that word.
    word_to_proba = {}
    for topic_id, t_proba in pred:
        for word, w_proba in ldamodel_topics_50.show_topic(topic_id):
            word_to_proba[word] = word_to_proba.get(word, 0.0) + t_proba*w_proba

    # Highest score first; keep the ten best words as the predicted tags.
    ranked_words = sorted(word_to_proba, key=word_to_proba.get, reverse=True)
    predicted_story_id_to_tags[story_id] = ranked_words[:10]
    

In [24]:
def write_to_csv(dict_predictions, path='predictions.csv', n_tags=10):
    """Write predicted tags to a CSV file, one story per row.

    Args:
        dict_predictions: Mapping of story id -> list of predicted tag strings,
            best tag first.
        path: Output file path. Defaults to ``'predictions.csv'`` in the
            current working directory (the previously hard-coded location).
        n_tags: Number of tag columns to write. Defaults to 10.

    The file starts with the header ``story_id,tag1,...,tag<n_tags>``. Each row
    holds the story id followed by at most ``n_tags`` tags; stories with fewer
    tags are padded with empty cells so every row has the same width.
    """
    import csv  # local import keeps this helper self-contained

    header = ['story_id'] + ['tag' + str(i) for i in range(1, n_tags + 1)]
    # newline='' hands line-ending control to the csv module, and
    # lineterminator='\n' keeps the same line endings the file had before.
    # The `with` block closes the file, so no explicit close() is needed.
    with open(path, 'w', newline='', encoding='utf-8') as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerow(header)
        for story_id, tags in dict_predictions.items():
            row_tags = list(tags[:n_tags])
            row_tags += [''] * (n_tags - len(row_tags))
            # csv.writer quotes any value containing a comma, quote or newline,
            # which the previous ','.join() did not.
            writer.writerow([str(story_id)] + row_tags)

# Persist the predicted tags for every held-out story to disk.
write_to_csv(predicted_story_id_to_tags)