In [1]:
"""ATTENTION:

This notebook relies on the cleaned english data after running clean_english_dataset.py. The only prerequisite is that the data doesn't have invalid or non-english tokens.
"""
import pandas as pd
import nltk
import numpy as np
import os
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from gensim.parsing.preprocessing import STOPWORDS
from gensim.test.utils import datapath, get_tmpfile
from gensim.utils import simple_preprocess
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer


### Load data for story content and their tags
#### Note: Load data from the google drive link: https://drive.google.com/drive/u/0/folders/1vfCX_Zlnx1YyL_1uDBK6ZA6mdBPLUpaV

In [2]:
def print_rows(matrix, n_rows=5):
    for i in range(n_rows):
        print(matrix[i])
        
def lemmatized_stemming(token):
    return stemmmer.stem(WordNetLemmatizer().lemmatize(token))

def docstring_to_token_pos_tuples(doc):
    tokens = simple_preprocess(doc)
#     print(1, ' ', tokens)
    # lemmatizes, POS tags and remove stopwords (including empty strings) from the tokens list for stories
    tokens = [lemmatized_stemming(t) for t in tokens if t not in STOPWORDS and len(t)>0]
#     print(2, ' ', tokens)
    return nltk.pos_tag(tokens)
    
    
def get_noun_and_adjective(doc):
#     print(doc)
    token_pos_tuples = docstring_to_token_pos_tuples(doc)
#     print(token_pos_tuples)
    NOUNS = ['NN', 'NNS', 'NNP', 'NNPS']
    ADJECTIVES = ['JJ', 'JJR', 'JJS']
    return_words = [token for token, pos in token_pos_tuples if pos in NOUNS+ADJECTIVES]
#     print(return_words)
    return return_words
    

In [3]:
DATA_DIR = "../../data/raw/"
DATA_FILE = "spi_cleaned_new.csv"
# TAGS_FILE = "stories_tags_all.csv"

stemmmer = SnowballStemmer('english')
words = set(nltk.corpus.words.words())
# load stories-data and drop those rows where story_content or story_publishing_type are NaN
all_data = pd.read_csv(DATA_DIR + DATA_FILE).dropna(subset=['story_text', 'illustration_text'])
stories_data_english_only = all_data.loc[all_data['language_name']=='English']
illustration_data_all = all_data
illustration_data_all.loc[illustration_data_all['language_name'] != 'English', 'story_text'] = ''
illustration_data_all['combined_text'] = illustration_data_all['illustration_text'] + ' ' + illustration_data_all['story_text']
stories_data_english_only
# illustration_data_all

# load tag-data and drop those rows where tags are NaN
# tags_data = pd.read_csv(DATA_DIR + TAGS_FILE).dropna(subset=['story_tag_name','story_id'])

# publisher_stories_data = stories_data.loc[stories_data['story_publishing_type'] == 'Publisher Story']
# non_publisher_stories_data = stories_data.loc[stories_data['story_publishing_type'] != 'Publisher Story']

# publisher_tags_data = tags_data[tags_data['story_id'].isin(publisher_stories_data['story_id'])]
# non_publisher_tags_data = tags_data[tags_data['story_id'].isin(non_publisher_stories_data['story_id'])]

# N_TOTAL_STORIES = len(all_data)
# N_PUBLISHER_STORIES = len(publisher_stories_data)
# N_NON_PUBLISHER_STORIES = len(non_publisher_stories_data)
# N_TAG_ROWS = len(tags_data)
# DF_TAG_LABEL = 'story_tag_name'

# print("Total stories: ", N_TOTAL_STORIES)
# print("Publisher stories: ", N_PUBLISHER_STORIES)
# print("Non-publisher stories: ", N_NON_PUBLISHER_STORIES)
# print("Tags rows in tags-file: ", N_TAG_ROWS)

# publisher_stories_data.iloc[0]['story_content']

Unnamed: 0,story_id,title,language_name,story_text,illustration_text
1,2,Smile Please!,English,"His mother came. She said, “Look, we’ll beat u...",Mother and young fawn Mother and fawn laughing...
5,7,Fat King Thin Dog,English,The king catches the dog. Now the fat king is ...,King catching a dog Thin King and dog Fat King...
14,16,"Vayu, the Wind",English,The window curtains flutterAnd gently brush my...,Girl feeling the wind through an open window W...
16,21,Chuskit Goes to School!,English,Chuskit was born with legs&nbsp;that did not w...,"Man carrying girl on back Girl, man, cylindric..."
18,33,"Not Now, Not Now!",English,"Ajja said, “We can play cricket now.” Ma said,...",Family surrounding a birthday cake Children ho...
23,38,Counting on Moru,English,Moru made the children stand in a line - the s...,Children standing in order of height holding n...
24,40,Listen to My Body,English,Match&nbsp; the&nbsp; sounds&nbsp; with&nbsp; ...,"Hands clapping, lips puckered and feet tapping..."
31,49,"""My fish!"" ""No, my fish!""",English,Little Munia thought for sometime. “Please don...,Girl with cat talking to two boys carrying fis...
36,54,Susheela's Kolams,English,"That night, Susheela stood on her terrace and ...",Girl and bird looking at starry night sky with...
43,61,Pehelwaan Ji,English,"One day, Gappu said, “Will you wrestle with me...",Man talking to boy Man drinking milk from a ba...


### We will use non-publisher stories for training, and publisher stories for validation testing

In [4]:
# #TODO: expand apostrophe words
# all_stories_data = pd.concat([non_publisher_stories_data, publisher_stories_data], axis=0)
# all_tokens_data = []

# for i in range(len(all_stories_data)):
#     row = get_noun_and_adjective(all_stories_data.iloc[i]['story_content'])
#     all_tokens_data.append(row)
    
# # print(all_tokens_data)

In [5]:
# print(len(all_tokens_data))

In [6]:
# ## Story data
# train_data = all_tokens_data[:N_NON_PUBLISHER_STORIES]
# test_data = all_tokens_data[N_NON_PUBLISHER_STORIES:]

# ## Story-ids
# train_story_ids = non_publisher_stories_data['story_id'].replace('', np.nan).dropna()
# test_story_ids = publisher_stories_data['story_id'].replace('', np.nan).dropna()


# #Use training data to build the dictionary for LDA model
# id2word = Dictionary(train_data)
# print(id2word)

# train_data_bow = [id2word.doc2bow(doc) for doc in train_data]
# test_data_bow = [id2word.doc2bow(doc) for doc in test_data]

# ##Save the dictionary too
# DICTIONARY_FILE_LOCATION = "../../data/id2word_dictionary"
# OS_PATH_TO_DICTIONARY_FILE = os.path.abspath(os.path.join(os.getcwd(), DICTIONARY_FILE_LOCATION))

# id2word.save_as_text(datapath(OS_PATH_TO_DICTIONARY_FILE))


In [7]:
## Story data
stories_tokens = []
stories_and_illustration_tokens = []

for text in stories_data_english_only['story_text']:
    stories_tokens.append(get_noun_and_adjective(text))

for text in illustration_data_all['story_text']:
    stories_and_illustration_tokens.append(get_noun_and_adjective(text))

#Use training data to build the dictionary for LDA model
id2word_stories_and_illustration = Dictionary(stories_and_illustration_tokens)
print(id2word_stories_and_illustration)

stories_bow = [id2word_stories_and_illustration.doc2bow(doc) for doc in stories_tokens]
stories_and_illustrations_bow = [id2word_stories_and_illustration.doc2bow(doc) for doc in stories_and_illustration_tokens]

##Save the dictionary too
DICTIONARY_FILE_LOCATION = "../../data/id2word_dictionary"
OS_PATH_TO_DICTIONARY_FILE = os.path.abspath(os.path.join(os.getcwd(), DICTIONARY_FILE_LOCATION))

id2word_stories_and_illustration.save_as_text(datapath(OS_PATH_TO_DICTIONARY_FILE))


Dictionary(17278 unique tokens: ['bad', 'bear', 'beat', 'boulder', 'brother']...)


In [8]:
# story_tags_dict = {}
# tag_set = set()  # set of all unique tags in the dataset
# tag_count = {}   # number of count for each tag
# for story_id in all_stories_data['story_id']:
#     tag_list = tags_data['story_tag_name'].loc[tags_data['story_id'] == story_id]
#     if(len(tag_list) == 0):
#         continue #didn't have tags for this story, so go to next story
#     tag_list = [tag.replace('#','').lower() for tag in tag_list if (type(tag) == str and tag not in STOPWORDS)]
#     story_tags_dict[story_id] = set(tag_list)
    
#     #Update global tag variables (tag_count )
#     for tag in tag_list:
#         if tag not in tag_count:
#             tag_count[tag] = 1
#         else:
#             tag_count[tag] += 1
#     tag_set |= set(tag_list)
    
# n_stories_with_tags = len(story_tags_dict)
# n_tags_freq_more_than_5 = len([tag for tag in tag_count if tag_count[tag] >= 5])
# n_tags_freq_more_than_10 = len([tag for tag in tag_count if tag_count[tag] >= 10])
# n_tags_freq_more_than_15 = len([tag for tag in tag_count if tag_count[tag] >= 15])
# n_tags_freq_more_than_20 = len([tag for tag in tag_count if tag_count[tag] >= 20])

# print("Number of stories with tags: ", n_stories_with_tags)
# print("Number of unique tags: ", len(tag_set))
# print("Number of Tags occuring 5 times or more: ", n_tags_freq_more_than_5)
# print("Number of Tags occuring 10 times or more: ", n_tags_freq_more_than_10)
# print("Number of Tags occuring 15 times or more: ", n_tags_freq_more_than_15)
# print("Number of Tags occuring 20 times or more: ", n_tags_freq_more_than_20)



In [9]:
# test_story_id_to_tags = {}

# print(publisher_tags_data.columns)
# for idx in test_story_ids:
# #     print(idx, publisher_tags_data.loc[publisher_tags_data['story_id'] == idx][DF_TAG_LABEL])
#     test_story_id_to_tags[idx] = publisher_tags_data.loc[publisher_tags_data['story_id'] == idx][DF_TAG_LABEL]
# test_story_id_to_tags

## LDA model : 50 topics

In [11]:
PRETRAINED_STORIES_MODEL_FILE_LOCATION = "../models/lda_model_stories_only"
PRETRAINED_STORIES_AND_ILLUSTRATION_MODEL_FILE_LOCATION = "../models/lda_model_stories_and_illustration"
OS_PATH_TO_MODEL_FILE_STORIES = os.path.abspath(os.path.join(os.getcwd(), PRETRAINED_STORIES_MODEL_FILE_LOCATION))
OS_PATH_TO_MODEL_FILE_ILLUSTRATIONS = os.path.abspath(os.path.join(os.getcwd(), PRETRAINED_STORIES_AND_ILLUSTRATION_MODEL_FILE_LOCATION))
print(OS_PATH_TO_MODEL_FILE_STORIES)
print(OS_PATH_TO_MODEL_FILE_ILLUSTRATIONS)


ldamodel_topics_50_stories = None
ldamodel_topics_50_illustrations = None
# try:
#     print("Using pretrained model...")
#     ldamodel_topics_50_stories = LdaMulticore.load(PRETRAINED_STORIES_MODEL_FILE_LOCATION)
#     ldamodel_topics_50_illustrations = LdaMulticore.load(PRETRAINED_STORIES_AND_ILLUSTRATION_MODEL_FILE_LOCATION)
# except:
print("ERORR: Pretrained model not found. Training the model now...")
# print(stories_bow)
# print(stories_and_illustrations_bow)
ldamodel_topics_50_stories = LdaMulticore(stories_bow, num_topics=50, id2word=id2word_stories_and_illustration, passes=20, workers=2)
ldamodel_topics_50_illustrations = LdaMulticore(stories_and_illustrations_bow, num_topics=50, id2word=id2word_stories_and_illustration, passes=20, workers=2)
##Save this for future use
print("\nSaving the trained model to location " + OS_PATH_TO_MODEL_FILE_STORIES)
ldamodel_topics_50_stories.save(datapath(OS_PATH_TO_MODEL_FILE_STORIES))
print("\nSaving the trained model to location " + OS_PATH_TO_MODEL_FILE_ILLUSTRATIONS)
ldamodel_topics_50_illustrations.save(datapath(OS_PATH_TO_MODEL_FILE_ILLUSTRATIONS))


/Users/goelakas/goelakash/pratham_data/data_v3/PrathamBooks-Sprint-2018/python/models/lda_model_stories_only
/Users/goelakas/goelakash/pratham_data/data_v3/PrathamBooks-Sprint-2018/python/models/lda_model_stories_and_illustration
ERORR: Pretrained model not found. Training the model now...

Saving the trained model to location /Users/goelakas/goelakash/pratham_data/data_v3/PrathamBooks-Sprint-2018/python/models/lda_model_stories_only

Saving the trained model to location /Users/goelakas/goelakash/pratham_data/data_v3/PrathamBooks-Sprint-2018/python/models/lda_model_stories_and_illustration


In [21]:
topic_predictions = [ldamodel_topics_50[bow] for bow in test_data_bow]
print(len(topic_predictions))
print(len(test_story_ids))

story_id_to_topic_predictions = {}
for i in range(len(test_story_ids)):
    story_id = test_story_ids.get(i)
    topic = topic_predictions[i]
    story_id_to_topic_predictions[story_id] = topic



311
311


In [23]:
predicted_story_id_to_tags = {}

for story_id, pred in story_id_to_topic_predictions.items():
    word_to_proba = {}
    for topic_id, t_proba in pred:
        topic_words = ldamodel_topics_50.show_topic(topic_id)
        for word, w_proba in topic_words:
            if word not in word_to_proba.keys():
                word_to_proba[word] = 0.0
            word_to_proba[word] += t_proba*w_proba
    ranked_words_and_prob = sorted(word_to_proba.items(), key=lambda kv: kv[1], reverse=True)
    ranked_words = [word_tuple[0] for word_tuple in ranked_words_and_prob]
    predicted_story_id_to_tags[story_id] = ranked_words[:10]
    

In [24]:
def write_to_csv(dict_predictions):
    with open('predictions.csv', 'w') as output:
        output.write(','.join(['story_id'] + ['tag'+str(i) for i in range(1,11)])+'\n')
        for k,v in dict_predictions.items():
            output.write(','.join([str(k)]+v[:10])+'\n')
        output.close()

write_to_csv(predicted_story_id_to_tags)