In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re, pickle, os, sys, requests
from tqdm import tqdm
import shutil
import json
from glob import glob
from google.colab import output

In [None]:
# Install the scraping dependencies into the runtime (versions are not pinned).
!pip install scrapy beautifulsoup4
from bs4 import BeautifulSoup
# Delete the sample_data/ directory (presumably Colab's bundled example files).
!rm -rf sample_data/
# Wipe this cell's output so the pip log does not clutter the notebook.
output.clear()

In [None]:
def clean_path(path):
  """Reset `path` to a freshly created, empty directory.

  If a directory already exists at `path` it is removed first (removal
  errors are ignored); the directory, including any missing parents, is
  then created again.
  """
  directory_present = os.path.isdir(path)
  if directory_present:
    shutil.rmtree(path, ignore_errors=True)
  os.makedirs(path)

In [None]:
def _clean_quote_text(raw):
  """Normalise one spoken line: drop non-breaking spaces and mojibake, then remove bracketed asides."""
  text = (raw.replace('\xa0','')
             .replace('\u00e2\u20ac\u2122',"'")
             .replace('¦',' ')
             .replace('â\x80',"'")
             .replace('\x99','')
             .strip())
  # Remove "(...)" / "[...]" spans (non-greedy); raw string avoids invalid-escape warnings.
  return re.sub(r"([\(\[]).*?([\)\]])", "", text).strip()


def crawl_data():
  """Scrape officequotes.net and collect every spoken line per character.

  One page is requested per episode (seasons 1-9, episode counts hard-coded
  below). Each `<div class="quote">` is split on `<br/>`; fragments that
  contain a "Deleted Scene" marker or no `:</b>` speaker separator are skipped.

  Returns:
    dict mapping the lower-cased, whitespace-stripped speaker name to the
    list of that speaker's cleaned lines, in crawl order.

  Raises:
    requests.exceptions.RequestException: on connection failure or when a
      page does not answer within the timeout.
  """
  seasons = {1:6,2:22,3:23,4:14,5:26,6:24,7:24,8:24,9:23}
  dialogs = {}

  for season, episode_count in seasons.items():
    for episode in tqdm(range(1, episode_count+1)):
      # Episode numbers are zero-padded to two digits in the site's URLs.
      url = f"https://www.officequotes.net/no{season}-{episode:02d}.php"

      # A timeout prevents one stalled connection from hanging the whole crawl.
      r = requests.get(url, timeout=30)
      soup = BeautifulSoup(r.content, 'html.parser',)

      for quote in soup.find_all('div', class_='quote'):
        markup = (str(quote).replace('<div class="quote">','')
                            .replace('</div>','')
                            .replace('\t','')
                            .replace(' <b>','')
                            .replace('<b>','')
                            .strip())
        for fragment in markup.split('<br/>'):
          if "<u>Deleted Scene" in fragment:
            continue
          parts = fragment.split(":</b>")
          if len(parts) < 2:
            continue
          # Strip stray whitespace/newlines so the same speaker is not split
          # across several keys (e.g. "\nmichael" vs "michael").
          speaker = parts[0].strip().lower()
          dialogs.setdefault(speaker, []).append(_clean_quote_text(parts[1]))

  return dialogs

In [None]:
# Run the scraper; crawl_data() issues one HTTP request per episode.
dialogs = crawl_data()

# Saving raw data

In [None]:
# (Re)create data/raw/all_characters as an empty directory; clean_path removes any existing one first.
path = os.path.join("data","raw","all_characters")
clean_path(path)

In [None]:
def save_all_characters():
  """Write the module-level `dialogs` mapping to data/raw/all_characters/all_characters.json.

  The JSON is emitted with sorted keys and a 4-space indent. The target
  directory must already exist.
  """
  target = os.path.join("data","raw","all_characters","all_characters.json")
  serialized = json.dumps(dialogs, sort_keys=True, indent=4)
  with open(target, "w") as handle:
    handle.write(serialized)

In [None]:
def load_all_characters():
  """Read data/raw/all_characters/all_characters.json and return the parsed JSON object."""
  source = os.path.join("data","raw","all_characters","all_characters.json")
  with open(source, 'r') as handle:
    return json.load(handle)

In [None]:
# Persist the current `dialogs` dict as JSON under data/raw/all_characters/.
save_all_characters()

In [None]:
# Rebind `dialogs` to the JSON copy read back from disk.
dialogs = load_all_characters()

In [None]:
# The tokenizers are imported here because this cell runs before the NLP setup cell further down;
# without these lines a fresh "Restart & Run All" stops here with a NameError.
import nltk
from nltk import sent_tokenize, word_tokenize
for resource in ('punkt', 'punkt_tab'):  # tokenizer models; newer NLTK releases look for 'punkt_tab'
  nltk.download(resource, quiet=True)

# Sentence counts (dwight, michael); both are lower-cased, as export_sentences() does, so they are comparable.
print(len(sent_tokenize(' '.join(dialogs['dwight']).lower())), len(sent_tokenize(' '.join(dialogs['michael']).lower())))
# Word-token counts (dwight, michael).
print(len(word_tokenize(' '.join(dialogs['dwight']))), len(word_tokenize(' '.join(dialogs['michael']))))

## Save Michael and Dwight dialogues

In [None]:
# (Re)create data/raw/michael_dwight_dialogues as an empty directory.
path = os.path.join("data","raw","michael_dwight_dialogues")
clean_path(path)

In [None]:
# Keep only the two characters of interest.
characters = ['michael', 'dwight']
# NOTE: this rebinds `dialogs`; the full cast is only recoverable via load_all_characters() afterwards.
dialogs = {character: dialogs[character] for character in characters}

In [None]:
# Number of dialogue lines held for michael and dwight, in that order.
print(len(dialogs['michael']),len(dialogs['dwight']))

In [None]:
# Write one "<character>.txt" per character, one dialogue line per text line.
path = os.path.join("data","raw","michael_dwight_dialogues")
for character, dialogues in dialogs.items():
  target_file = os.path.join(path, f"{character}.txt")
  with open(target_file, "w") as txt_file:
    txt_file.writelines(f"{dialogue}\n" for dialogue in dialogues)

# Clean up data

In [None]:
# (Re)create data/clean as an empty directory; any earlier cleaned output is removed.
path = os.path.join("data","clean")
clean_path(path)

In [None]:
# transformers is not installed (line left disabled).
# !pip install transformers
# Install the text-normalisation helpers used by the cleaning functions below (versions are not pinned).
!pip install unidecode contractions
# Wipe the pip log from the cell output.
output.clear()

In [None]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy
import unidecode
import contractions
# Downloads every NLTK resource (large); the code below uses the tokenizers and the WordNet lemmatizer.
nltk.download('all')
lemmatizer = WordNetLemmatizer()
# spaCy v3 removed the 'en' shortcut link, so load the model by its package name
# and only fall back to the legacy shortcut on old installations.
try:
  nlp = spacy.load('en_core_web_sm')
except OSError:
  nlp = spacy.load('en')
# Wipe the download log from the cell output.
output.clear()

In [None]:
def save(character, folder, sents):
  """Write `sents` to data/clean/<folder>/<character>.txt, one item per line.

  Args:
    character: base name of the output file (".txt" is appended).
    folder: sub-directory of data/clean to write into; created if missing.
    sents: iterable of items; each is formatted with str() and followed by a newline.

  Raises:
    OSError: if the directory cannot be created or the file cannot be written
      (previously directory-creation errors were silently swallowed).
  """
  directory = os.path.join("data","clean", folder)
  # exist_ok tolerates an already-present directory without hiding real failures.
  os.makedirs(directory, exist_ok=True)
  with open(os.path.join(directory, f"{character}.txt"), "w") as txt_file:
    for sent in sents:
      txt_file.write(f"{sent}\n") # works with any number of elements in a line

In [None]:
def remove_accented_chars(text):
  """Return `text` transliterated by `unidecode.unidecode`."""
  return unidecode.unidecode(text)

def expand_contractions(text):
  """Return `text` after passing it through `contractions.fix`."""
  return contractions.fix(text)

In [None]:
def export_sentences(character):
  """Join all of `character`'s lines from the global `dialogs`, lower-case them and split into sentences."""
  joined = ' '.join(dialogs[character])
  lowered = joined.lower()
  return sent_tokenize(lowered)

In [None]:
def remove_accent_and_expand(character):
  """Clean, filter and lemmatize one character's sentences, saving both stages.

  Stage 1: every sentence is transliterated and has its contractions
  expanded; the result is saved under "cleaned_broken_sentences".
  Stage 2: sentences with fewer than 5 tokens are dropped, the rest are
  lemmatized token by token and saved under
  "cleaned_lemmatized_broken_sentences".

  Returns:
    (lemmatized sentences, number of lemmatized sentences, total token
    count of the kept sentences before lemmatization).
  """
  cleaned = []
  for raw_sentence in export_sentences(character):
    cleaned.append(expand_contractions(remove_accented_chars(raw_sentence)))
  save(character, "cleaned_broken_sentences", cleaned)

  # Tokenize once, then keep only sentences with at least 5 tokens.
  kept_tokens = [tokens for tokens in map(word_tokenize, cleaned) if len(tokens) >= 5]
  total_words = sum(len(tokens) for tokens in kept_tokens)
  lemmatized = [' '.join(lemmatizer.lemmatize(token) for token in tokens) for tokens in kept_tokens]

  save(character, "cleaned_lemmatized_broken_sentences", lemmatized)
  return lemmatized, len(lemmatized), total_words

In [None]:
# Clean both characters and collect their lemmatized sentences.
characters = ["michael","dwight"]
final_data = []
for character in characters:
  # Deliberately not named `output` / `word_count`: `output` is the google.colab module used by
  # output.clear(), and `word_count` is defined as a function in the statistics section.
  sents_lemmatized, total_sents, total_words = remove_accent_and_expand(character)
  final_data.append(sents_lemmatized)
  print(f"total number of {character} sentences and words: {total_sents} , {total_words}")

# Statistics

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Directory that save(..., "cleaned_lemmatized_broken_sentences", ...) writes into.
path = os.path.join('data','clean','cleaned_lemmatized_broken_sentences')

In [None]:
# Load every cleaned "<character>.txt" into data[<character>] as a list of stripped lines.
data = {}
characters = []
# glob() returns matches in arbitrary order; sorting keeps `characters` (and every chart
# built from it) in the same order on every run.
for file in sorted(glob(os.path.join(path,'*.txt'))):
  name = os.path.basename(file).split('.')[0]
  characters.append(name)
  with open(file,'r') as f:
    data[name] = [sent.strip() for sent in f.readlines()]

In [None]:
def plot(names, numerical_data, title, shift=False, is_large=False):
  """Show a bar chart of `numerical_data` labelled by `names`.

  Args:
    names: bar labels for the x axis.
    numerical_data: bar heights, aligned with `names`.
    title: chart title.
    shift: when True, x tick labels are rotated 90 degrees and set to size 10.
    is_large: when True, the figure is created with figsize (25, 25).
  """
  figure_options = {"figsize": (25,25)} if is_large else {}
  fig = plt.figure(**figure_options)
  # Axes span the full figure area.
  ax = fig.add_axes([0,0,1,1])
  if shift:
    plt.xticks(rotation=90)
    ax.xaxis.set_tick_params(labelsize=10)
  ax.bar(names,numerical_data)
  plt.title(title)
  plt.show()

In [None]:
# Total number of cleaned sentences over all loaded characters.
total_sentences = sum(len(data[char]) for char in characters)
num = [total_sentences]
print(num[0])
plot(["sentences"],num,"total sentence count")

In [None]:
# Whitespace-separated word total over all characters.
words = sum(len(' '.join(data[char]).split()) for char in characters)
# Sum of each character's own vocabulary size (a word used by both characters is counted twice).
distinct_words = sum(len(set(' '.join(data[char]).split())) for char in characters)
# num = [sentence count, word count, distinct word count]
num = [sum(len(data[char]) for char in characters), words, distinct_words]
# Index 0 is the sentence count, so words and distinct words are num[1] and num[2].
print(f"total number of words: {num[1]} and distinct words: {num[2]}")
plot(["sentences", "words", "distinct words"],num,"total word count")

In [None]:
# One count per loaded character, so labels and values always line up (not just for exactly two files).
sentence_count = [len(data[character]) for character in characters]
plot(characters,sentence_count,"sentence count based on each character")

In [None]:
# Vocabulary of each character (unique whitespace-separated tokens).
michael_distinct = list(set(' '.join(data["michael"]).split()))
dwight_distinct = list(set(' '.join(data["dwight"]).split()))

# Set algebra replaces the nested list scans (which were quadratic in vocabulary size).
michael_vocab = set(michael_distinct)
dwight_vocab = set(dwight_distinct)
both = len(michael_vocab & dwight_vocab)
only_michael = len(michael_vocab - dwight_vocab)
only_dwight = len(dwight_vocab - michael_vocab)

print(f"distinct words in both classes: {both} , dwight only: {only_dwight} , michael only: {only_michael}")

# Values are listed in the same order as the labels: both, dwight, michael.
plot(["both" , "dwight" , "michael"],[both,only_dwight,only_michael],"distinct words based on each group")

In [None]:
def word_count(str):
    """Count occurrences of each whitespace-separated token in `str`.

    Returns a dict mapping token -> count, with keys in order of first appearance.
    """
    tally = {}
    for token in str.split():
        tally[token] = tally.get(token, 0) + 1
    return tally

In [None]:
# Per-character word tallies, ordered from most to least frequent.
michael_distinct_count = dict(sorted(
    word_count(' '.join(data["michael"])).items(),
    key=lambda pair: pair[1],
    reverse=True,
))
dwight_distinct_count = dict(sorted(
    word_count(' '.join(data["dwight"])).items(),
    key=lambda pair: pair[1],
    reverse=True,
))

# Top 10 words each character uses that the other never uses.
michael_most_repeated = []
dwight_most_repeated = []

for word, count in michael_distinct_count.items():
  if word not in dwight_distinct_count:
    michael_most_repeated.append((word,count))
  if len(michael_most_repeated) >= 10:
    break

for word, count in dwight_distinct_count.items():
  if word not in michael_distinct_count:
    dwight_most_repeated.append((word,count))
  if len(dwight_most_repeated) >= 10:
    break

print(dwight_most_repeated)
print(michael_most_repeated)

In [None]:
# Bars: dwight-only words and their counts, with rotated labels.
plot([word for word, _count in dwight_most_repeated], [count for _word, count in dwight_most_repeated], "Dwight",True)

In [None]:
# Bars: michael-only words and their counts, with rotated labels.
plot([word for word, _count in michael_most_repeated], [count for _word, count in michael_most_repeated], "Michael",True)

In [None]:
# The explicit ' ' separator matters: without it michael's last word and dwight's first word
# are concatenated into a single bogus token.
word_frequency = word_count(' '.join(data["michael"]) + ' ' + ' '.join(data["dwight"]))

In [None]:
# Rebuild the tally ordered from most to least frequent (dicts keep insertion order).
word_frequency = dict(sorted(word_frequency.items(), key=lambda item: item[1], reverse=True))

In [None]:
# Ten most frequent (word, count) pairs. A plain loop is used so that IPython's `_`
# (last output) is not overwritten by a throwaway list of None values.
for x in list(word_frequency.items())[:10]:
  print(x)

In [None]:
# Bar chart of the first 180 entries of word_frequency, with rotated labels (shift=True) on the large figure (is_large=True).
plot(list(word_frequency.keys())[:180], list(word_frequency.values())[:180], "Histogram of Word Frequencies", True, True)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
# TF-IDF over two documents: row 0 is all of michael's text, row 1 all of dwight's.
vectorizer = TfidfVectorizer(analyzer='word' , stop_words='english')
vectors = vectorizer.fit_transform([' '.join(data['michael']), ' '.join(data['dwight'])])
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when it
# exists and fall back to the old method on older installations. Kept as a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
  feature_names = list(vectorizer.get_feature_names_out())
else:
  feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

In [None]:
# Map each vocabulary word to its own (michael score, dwight score) pair. dict.fromkeys()
# would bind every word to the same whole matrix, which makes sorting by value meaningless.
# NOTE(review): per-word score pairs are the presumed intent — confirm.
a = dict(zip(feature_names, zip(*denselist)))

In [None]:
# Reorder `a` by value, largest first.
a = dict(sorted(a.items(), key=lambda item: item[1], reverse=True))

In [None]:
# Prints the two parts (key, then value) of the single entry at index 100 of `a`;
# raises IndexError when `a` has fewer than 101 entries.
# NOTE(review): the slice `[:100]` (first hundred entries) may have been intended — confirm.
[print(x) for x in list(a.items())[100]]

In [None]:
# Display the scikit-learn TF-IDF table: row 0 is michael, row 1 is dwight (the order passed to fit_transform).
df

In [None]:
from nltk.corpus import stopwords
# stopwords.words('english')

In [None]:
def computeTF(wordDict, bagOfWords):
    """Return term frequencies: each count in `wordDict` divided by len(bagOfWords).

    Args:
        wordDict: mapping word -> raw count.
        bagOfWords: the token list the counts were taken from; only its length is used.
    """
    total_tokens = float(len(bagOfWords))
    return {word: count / total_tokens for word, count in wordDict.items()}

def computeIDF(documents):
    """Compute log(N / document_frequency) for every word in `documents`.

    Args:
        documents: list of dicts mapping word -> count, one dict per document.

    Returns:
        dict mapping each word seen in any document to its IDF. A word whose
        count is 0 in every document gets 0.0 instead of raising
        ZeroDivisionError. An empty `documents` list yields {}.
    """
    import math
    N = len(documents)

    # Document frequency per word. Keys are gathered from every document, not only
    # the first, so a word missing from documents[0] no longer raises KeyError.
    doc_freq = {}
    for document in documents:
        for word, val in document.items():
            doc_freq.setdefault(word, 0)
            if val > 0:
                doc_freq[word] += 1

    idfDict = {}
    for word, freq in doc_freq.items():
        idfDict[word] = math.log(N / float(freq)) if freq else 0.0
    return idfDict
  

def computeTFIDF(tfBagOfWords, idfs):
    """Multiply each term frequency by the matching IDF.

    Every key of `tfBagOfWords` must be present in `idfs` (KeyError otherwise).
    """
    return {word: tf * idfs[word] for word, tf in tfBagOfWords.items()}

In [None]:
# documentA is all of michael's cleaned text, documentB all of dwight's.
documentA, documentB = ' '.join(data['michael']), ' '.join(data['dwight'])

In [None]:
# Token lists, split on single spaces.
bagOfWordsA, bagOfWordsB = documentA.split(' '), documentB.split(' ')

In [None]:
# Combined vocabulary of both documents.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

In [None]:
# Start every vocabulary word at 0 in both tallies so the two dicts share the same keys.
numOfWordsA = {word: 0 for word in uniqueWords}
numOfWordsB = {word: 0 for word in uniqueWords}
for word in bagOfWordsA:
    numOfWordsA[word] = numOfWordsA[word] + 1
for word in bagOfWordsB:
    numOfWordsB[word] = numOfWordsB[word] + 1

In [None]:
# Term frequencies for michael (A) and dwight (B).
tfA, tfB = computeTF(numOfWordsA, bagOfWordsA), computeTF(numOfWordsB, bagOfWordsB)

In [None]:
# IDF over the two count dicts; both were initialised from uniqueWords, so they share the same keys.
idfs = computeIDF([numOfWordsA, numOfWordsB])

In [None]:
# Hand-computed TF-IDF per document, stacked into a two-row frame (row 0: A, row 1: B).
tfidfA, tfidfB = (computeTFIDF(term_freq, idfs) for term_freq in (tfA, tfB))
df = pd.DataFrame([tfidfA, tfidfB])

In [None]:
# Display the hand-computed TF-IDF table: row 0 comes from tfidfA (michael), row 1 from tfidfB (dwight).
df

In [None]:
# Pull each row of the frame back out as a word -> score dict (row 0, then row 1).
tfidf_1 = {column: df[column][0] for column in df.keys()}
tfidf_2 = {column: df[column][1] for column in df.keys()}

In [None]:
# Reorder both score dicts from highest to lowest TF-IDF.
tfidf_1 = dict(sorted(tfidf_1.items(), key=lambda item: item[1], reverse=True))
tfidf_2 = dict(sorted(tfidf_2.items(), key=lambda item: item[1], reverse=True))

In [None]:
# First ten (word, score) pairs of tfidf_1. A plain loop is used so that IPython's `_`
# (last output) is not overwritten by a throwaway list of None values.
for item in list(tfidf_1.items())[:10]:
  print(item)
plot(list(tfidf_1.keys())[:10], list(tfidf_1.values())[:10], "TF-IDF of Word Frequencies - Michael",True)

In [None]:
# First ten (word, score) pairs of tfidf_2. A plain loop is used so that IPython's `_`
# (last output) is not overwritten by a throwaway list of None values.
for item in list(tfidf_2.items())[:10]:
  print(item)
plot(list(tfidf_2.keys())[:10], list(tfidf_2.values())[:10], "TF-IDF of Word Frequencies - Dwight",True)

In [None]:
def compute_RNF(docA, docB):
  """Ratio of each shared word's relative frequency in `docA` to that in `docB`.

  For every word occurring in both documents the value is
  (count_A / total_A) / (count_B / total_B); words missing from either
  document are left out.

  Returns:
    dict of word -> ratio, ordered from largest to smallest ratio.
  """
  counts_a, counts_b = word_count(docA), word_count(docB)
  size_a = sum(counts_a.values())
  size_b = sum(counts_b.values())

  ratios = {
    word: (counts_a[word]/size_a)/(counts_b[word]/size_b)
    for word in counts_a
    if word in counts_b
  }

  return dict(sorted(ratios.items(), key=lambda item: item[1], reverse=True))

In [None]:
# RNF_A: michael relative to dwight; RNF_B: dwight relative to michael.
RNF_A, RNF_B = compute_RNF(documentA, documentB), compute_RNF(documentB, documentA)

In [None]:
# First ten (word, ratio) pairs of RNF_A. A plain loop is used so that IPython's `_`
# (last output) is not overwritten by a throwaway list of None values.
for item in list(RNF_A.items())[:10]:
  print(item)
plot(list(RNF_A.keys())[:10], list(RNF_A.values())[:10], "RNF - Michael",True)

In [None]:
# First ten (word, ratio) pairs of RNF_B. A plain loop is used so that IPython's `_`
# (last output) is not overwritten by a throwaway list of None values.
for item in list(RNF_B.items())[:10]:
  print(item)
plot(list(RNF_B.keys())[:10], list(RNF_B.values())[:10], "RNF - Dwight",True)