### Loading Libraries

In [1]:
import os
import json

In [2]:
import random
import string

In [3]:
import pandas as pd

In [4]:
import requests

In [5]:
from bs4 import BeautifulSoup as bs

In [6]:
from urllib.parse import quote

In [7]:
from nltk.tag import StanfordPOSTagger, StanfordNERTagger

In [8]:
from nltk import word_tokenize

### Stanford POS tagger

In [9]:
## Setting the path to java.exe for nltk
java_path = "F:/java/java/bin/java.exe"
    
os.environ['JAVAHOME'] = java_path

In [10]:
## creating a StanfordPOStagger object
pos_base_path = "E:\Stanford Taggers\stanford-postagger-full-2015-04-20\stanford-postagger-full-2015-04-20"

st_pos = StanfordPOSTagger(model_filename = pos_base_path+"\models\english-bidirectional-distsim.tagger", path_to_jar = pos_base_path+"\stanford-postagger.jar")

In [11]:
## creating a StanfordPOStagger object
ner_base_path = "E:\Stanford Taggers\stanford-ner-4.2.0\stanford-ner-2020-11-17"
    
st_ner = StanfordNERTagger(model_filename = ner_base_path+"\classifiers\english.all.3class.distsim.crf.ser.gz", path_to_jar = ner_base_path+"\stanford-ner-4.2.0.jar")

In [287]:
# st_ner.tag(word_tokenize("Hyderabad is my favorate place"))

In [289]:
# st_pos.tag(word_tokenize("Hyderabad is my favorate place"))

## Functions

In [12]:
# function to return a random plot form data frame with Title and Genre
def sel_plot(df):
    # choosing a random value 
    num = random.choice([100, 150, 200, 250])
    val = random.randrange(1,num)

    print(f"Selected value : {val}")
    
    ## re-framing the data
    plots = {}
    for index,row in df.iterrows():
        plots[row["Title"]] = {"Genre":row["Genre"], "Plot": row["Plot"]}
        if index == val: break
            
    # choosing a random title
    titles = list(plots.keys())
    sel_movie = random.choice(titles)
    
    return plots[sel_movie]

In [13]:
## function to tokenize the plot into Parts of Speech
# func to get abb of pos
def get_abb(tag_obj):
    pos_dct = {}
    ner_dct = {}

    if type(tag_obj) == str: 
        pos_tags = st_pos.tag(tag_obj.split())
        ner_tags = st_ner.tag(tag_obj.split())
    elif type(tag_obj) == list: 
        pos_tags = st_pos.tag(tag_obj)
        ner_tags = st_ner.tag(tag_obj)
        
    for word,tag in pos_tags:
        try:
            pos_dct[word] = tag, abb[tag] 
        except:
            pos_dct[word] = tag 
    for word,tag in ner_tags:
        ner_dct[word] = tag

        
    return {"Parts of Speech": pos_dct, "Named Entities": ner_dct}

# func to get tokens
def tokenizer(plot_obj):
    plot = plot_obj["Plot"]
    get_dct = get_abb(plot)
      
    return plot_obj["Genre"],get_dct

In [377]:
# tokenizer(sel_plot(req_data))

### Fetching Data

In [193]:
## Fetching the movie data
movies_data = pd.read_csv("wiki_movie_plots_deduped.csv", encoding="utf-8")

req_data = movies_data[["Title", "Genre", "Plot"]]
req_data = req_data[req_data["Genre"] != "unknown"]

In [233]:
## Cleaning the data
# cleaning the plot encoding
def clean_str(st):
    for ch in st :
        if ch in '!#$%&\()*+-/:;<=>@[\\]^_`{|}~123456789':
            st = st.replace(ch,"")
    st = st.encode("ascii", "ignore").decode("utf-8")
    return st

req_data["Plot"] = req_data["Plot"].apply(lambda x:clean_str(x))

In [234]:
#saving the required data
req_data.to_csv("data_subset.csv")

In [14]:
#Loading the req_data as df
req_data = pd.read_csv("data_subset.csv")
req_data.head(3)

Unnamed: 0.1,Unnamed: 0,Title,Genre,Plot
0,6,The Great Train Robbery,western,The film opens with two bandits breaking into ...
1,7,The Suburbanite,comedy,The film is about a family who move to the sub...
2,10,Dream of a Rarebit Fiend,short,The Rarebit Fiend gorges on Welsh rarebit at a...


### POS tags Abrrevations

In [15]:
url = "https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html"
res = requests.get(url)
soup = bs(res.text)

In [16]:
table = soup.select("table tr")

abb = {}
for row in table[1:]:
    cols = row.find_all("td")
    col_lst = [col.get_text() for col in cols][1:]
    abb[col_lst[0].strip()] = col_lst[1].strip()

In [22]:
# saving the abb dictionary to a csv
abb_data = pd.DataFrame.from_dict(abb, orient="index", columns=["Full Form"]).transpose()
abb_data.to_csv("abbrevations.csv")

In [30]:
# loading abb_data as df
abb_data = pd.read_csv("abbrevations.csv")
abb_data

Unnamed: 0.1,Unnamed: 0,CC,CD,DT,EX,FW,IN,JJ,JJR,JJS,...,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB
0,Full Form,Coordinating conjunction,Cardinal number,Determiner,Existential there,Foreign word,Preposition or subordinating conjunction,Adjective,"Adjective, comparative","Adjective, superlative",...,"Verb, base form","Verb, past tense","Verb, gerund or present participle","Verb, past participle","Verb, non-3rd person singular present","Verb, 3rd person singular present",Wh-determiner,Wh-pronoun,Possessive wh-pronoun,Wh-adverb


## StoryGenerator

In [44]:
res = tokenizer(sel_plot(req_data))

Selected value : 85


In [45]:
genre = res[0]
pos_res = res[1]["Parts of Speech"]
ner_res = res[1]["Named Entities"]
words = list(pos_res.keys())

In [46]:
num = random.randrange(5, len(words)//5)
pos_choosen = []
for i in range(num):
    pos_choosen.append(random.choice(words))   
    
ner_choosen = [wrd for wrd,ner in list(ner_res.items()) if ner != "O"]

In [81]:
## Asking user for some words based on their
new_words = []
occurred = [] 
for wrd,tup in pos_res.items(): 
    if tup[1] not in occurred :
        if (wrd not in ner_choosen) & (wrd in pos_choosen):  
            occurred.append(tup[1])
            print (f"More on {tup[1]} at: https://www.google.com/search?q={quote(tup[1])}")
            new_word = input(f"Enter a {tup[1]}:")    
            if len(new_word) < 1: new_words.append(wrd)
            new_words.append(new_word)
            print("")
            
        elif wrd in ner_choosen:
            print (f"More on {ner_res[wrd]} at: https://www.google.com/search?q={quote(ner_res[wrd])}")
            new_word = input(f"Enter a {ner_res[wrd]}:")    
            if len(new_word) < 1: new_words.append(wrd)
            new_words.append(new_word)
            print("")
    else:
        new_words.append(wrd)

More on Verb, non-3rd person singular present at: https://www.google.com/search?q=Verb%2C%20non-3rd%20person%20singular%20present
Enter a Verb, non-3rd person singular present:

More on PERSON at: https://www.google.com/search?q=PERSON
Enter a PERSON:

More on Noun, singular or mass at: https://www.google.com/search?q=Noun%2C%20singular%20or%20mass
Enter a Noun, singular or mass:

More on Adjective at: https://www.google.com/search?q=Adjective
Enter a Adjective:

More on Verb, 3rd person singular present at: https://www.google.com/search?q=Verb%2C%203rd%20person%20singular%20present
Enter a Verb, 3rd person singular present:

More on Verb, gerund or present participle at: https://www.google.com/search?q=Verb%2C%20gerund%20or%20present%20participle
Enter a Verb, gerund or present participle:

More on Personal pronoun at: https://www.google.com/search?q=Personal%20pronoun
Enter a Personal pronoun:

More on Foreign word at: https://www.google.com/search?q=Foreign%20word
Enter a Foreign wo

In [88]:
print(" ".join(new_words))

are  Charlie  assistant.  work assistant room. have brief  squabble goes  waiting  room floor carpet sweeper. patient further starts. rear squabbling. dentist arrives, first in, pain. prepares nitrous oxide anaesthesic due unconsciousness. man unconscious he  pulls tooth, can't  him up. calls runs off.  tries eventually  hitting head mallet. revives starts laughing. knocks returns sent  drug store prescription. fighting from  Dr Pain's surgery Sunset Pharmacy. strikes standing newsstand outside. looks woman dentist's wife kicks stomach before chasing himself, incident occurs she loses skirt embarrassment. continues man, who  receives brick face, thus becoming patient. hits passerby equally losing tooth. Meanwhile, gets phone call maid has had "accident" home. empty. picks of female other lady leaves, leaving them alone. flirts very closely mouth, stealing kisses. struck by bricks  arrive. girl leaves. tall next. uses huge pair pliers noise victim enters final fight ensues.
