Aveek-Saha · copoer · Sep 18, 2021 · Sep 18, 2021 · Sep 18, 2021 · Sep 18, 2021
diff --git a/README.md b/README.md
@@ -201,6 +201,15 @@ A new metadata file is created with the following format:
         }
     }
 }
 ```
+
+### Get Word Occurrences in Dialogue
+
+Run
+```
+python get_occurances.py
+```
+
+
 
 ## Directory structure

diff --git a/get_all_titles.py b/get_all_titles.py
@@ -0,0 +1,18 @@
+import os
+import json
+from tqdm import tqdm
+
+# Build a lookup from each movie's name to the JSON file name derived from it
+# (spaces become hyphens, plus a ".json" suffix).
+meta_path = os.path.join(os.getcwd(), 'scripts/metadata/clean_parsed_meta.json')
+with open(meta_path, 'r', encoding='utf-8') as meta_file:
+    metadata = json.load(meta_file)
+
+titles = {}
+for entry in metadata.values():
+    title = entry['file']['name']
+    titles[title] = title.replace(' ', '-') + '.json'
+
+# Written to the current working directory, indented with compact separators.
+with open('all_titles.json', 'w') as out_file:
+    json.dump(titles, out_file, indent=4, separators=(',', ':'))
diff --git a/get_occurances.py b/get_occurances.py
@@ -0,0 +1,35 @@
+import os
+import json
+import glob
+from pathlib import Path
+from tqdm import tqdm
+
+WRITE_DIR = 'scripts/occurances/'
+# Characters removed from every token before it is counted.
+PUNCTUATION = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+# Translation table that deletes all of PUNCTUATION in a single pass.
+STRIP_PUNCTUATION = str.maketrans('', '', PUNCTUATION)
+
+# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
+os.makedirs(WRITE_DIR, exist_ok=True)
+
+# For each dialogue file, count how often every word occurs and write the
+# counts (least frequent first) to WRITE_DIR/<name>.json.
+for filename in tqdm(glob.glob('scripts/parsed/dialogue/*.txt')):
+    words = dict()
+    with open(os.path.join(os.getcwd(), filename), 'r', encoding='utf-8') as f:
+        for line in f:
+            # Only the text after the first ">" is counted; partition() yields an
+            # empty string for lines without ">" instead of raising IndexError.
+            dialogue = line.partition('>')[2].lower()
+            for token in dialogue.split():
+                word = token.translate(STRIP_PUNCTUATION)
+                # Tokens made only of punctuation (e.g. "--", "...") strip down to
+                # "" and must not be counted as a word.
+                if word:
+                    words[word] = words.get(word, 0) + 1
+    # sorted() is stable, so words with equal counts keep first-seen order.
+    sorted_words = dict(sorted(words.items(), key=lambda item: item[1]))
+    clean_filename = Path(filename).stem.replace('_dialogue', '')
+    with open(os.path.join(WRITE_DIR, clean_filename + '.json'), 'w') as json_file:
+        json.dump(sorted_words, json_file, indent=4, separators=(',', ':'))