<a href="https://colab.research.google.com/github/Benjohn2001/The-Office-Predictor/blob/main/The_Office_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd

#Locations of the two raw quote datasets
parentURL = 'https://raw.githubusercontent.com/Benjohn2001/The-Office-Predictor/main/parent_reply.csv'
talkingURL = 'https://raw.githubusercontent.com/Benjohn2001/The-Office-Predictor/main/talking_head.csv'

#Parent/reply data: discard the parent columns and rename 'reply' to 'quote'
#so that both datasets share the same column name for the quote text
parentDS = (
    pd.read_csv(parentURL)
    .drop(columns=["parent_id", "parent"])
    .rename(columns={'reply': 'quote'})
)

#Talking-head data: only the quote_id column is not needed
talkingDS = pd.read_csv(talkingURL).drop(columns=['quote_id'])

#Stack both datasets; reset_index(drop=True) discards the old per-file indexes
#and renumbers the rows from 0
#We have 27899 quotes when combined
combinedDS = (
    pd.concat([parentDS, talkingDS])
    .reset_index(drop=True)
)

#Remove quotes with fewer than 5 words, removing 10575 quotes, now 17324 quotes
#A boolean mask filters all rows in one pass; dropping rows one at a time
#inside a Python loop rebuilt the whole frame on every drop
#A missing quote gives a NaN word count, which compares False and is removed
wordCounts = combinedDS['quote'].str.split().str.len()
combinedDS = combinedDS[wordCounts >= 5]

#Specification has limit of 10000 samples so now random sample to 10000
#random_state fixes the draw so the train/validation/test split is the same
#on every run of the notebook
sampled = combinedDS.sample(n=10000, random_state=42).reset_index(drop=True)

#60% training / 20% validation / 20% test, taken as consecutive positional
#slices of the already-shuffled sample
trainEnd = 6000
validationEnd = 8000
trainDS = sampled.iloc[:trainEnd]
validationDS = sampled.iloc[trainEnd:validationEnd]
testDS = sampled.iloc[validationEnd:]

print(trainDS.head())


                                               quote character
0  Yeah, that’s right. Fifteen years and three mo...   Michael
1            Let’s see, since I saw you an hour ago?       Jim
2                                No. We took a walk.       Jim
3                    I thought that was your corner.       Pam
4  Oh, nothing can hurt you now. You’re a man in ...    Dwight


In [3]:
#Number of quotes per character in the full sample and in each split
sampledCounts=sampled['character'].value_counts()
trainCounts=trainDS['character'].value_counts()
validationCounts=validationDS['character'].value_counts()
testCounts=testDS['character'].value_counts()

#Table layout: one column per character, one row per dataset
#Each row keeps its frame so percentages are taken over the frame's real
#length instead of hard-coded 10000/6000/2000 denominators
tableCharacters=['Michael','Dwight','Jim','Pam']
tableRows=[
    ('Initial',sampled,sampledCounts),
    ('Train',trainDS,trainCounts),
    ('Validation',validationDS,validationCounts),
    ('Test',testDS,testCounts),
]

tableDataCount={'Dataset':[label for label,frame,counts in tableRows]}
tableDataPercent={'Dataset':[label for label,frame,counts in tableRows]}
for character in tableCharacters:
    #.get(character,0) reports 0 rather than raising KeyError when a
    #character has no quotes in one of the datasets
    tableDataCount[character]=[
        counts.get(character,0) for label,frame,counts in tableRows
    ]
    tableDataPercent[character]=[
        str(round(counts.get(character,0)/len(frame)*100,2))
        for label,frame,counts in tableRows
    ]

tableFrameCount=pd.DataFrame(tableDataCount)
print('Table displaying the amount of quotes for each character in each dataset\n')
print(tableFrameCount.to_string(index=False))

tableFramePercent=pd.DataFrame(tableDataPercent)
print('\nTable displaying the percentage of character quotes in each dataset\n')
print(tableFramePercent.to_string(index=False))

Table displaying the amount of quotes for each character in each dataset

   Dataset  Michael  Dwight  Jim  Pam
   Initial     4136    2392 1959 1513
     Train     2505    1454 1173  868
Validation      805     475  405  315
      Test      826     463  381  330

Table displaying the percentage of character quotes in each dataset

   Dataset Michael Dwight   Jim   Pam
   Initial   41.36  23.92 19.59 15.13
     Train   41.75  24.23 19.55 14.47
Validation   40.25  23.75 20.25 15.75
      Test    41.3  23.15 19.05  16.5


In [12]:
#Using the english pipeline we tokenize each quote and add it to the dataframe
#The text_pipeline_spacy function is from the labs
import spacy
#Load the pipeline once here; text_pipeline_spacy reuses this nlp object on every call
nlp = spacy.load("en_core_web_sm")

def text_pipeline_spacy(text):
    """Tokenize text with the module-level nlp pipeline.

    Stop words, punctuation and whitespace tokens are discarded; every
    remaining token is represented by its lower-cased lemma.

    Args:
        text: the string to process.

    Returns:
        A list of lower-cased lemma strings, in document order.
    """
    return [
        token.lemma_.lower()
        for token in nlp(text)
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

#Token list for every training quote, in the same order as the rows of trainDS
tokensArr=[text_pipeline_spacy(quote) for quote in trainDS['quote']]

#assign returns a new frame with the extra 'tokens' column; trainDS itself is unchanged
trainDSwTokens = trainDS.assign(tokens=tokensArr)

In [13]:
#Preview the first rows, including the new tokens column
trainDSwTokens.head()

Unnamed: 0,quote,character,tokens
0,"Yeah, that’s right. Fifteen years and three mo...",Michael,"[yeah, right, year, month, wow, hire, think, r..."
1,"Let’s see, since I saw you an hour ago?",Jim,"[let, see, hour, ago]"
2,No. We took a walk.,Jim,"[take, walk]"
3,I thought that was your corner.,Pam,"[think, corner]"
4,"Oh, nothing can hurt you now. You’re a man in ...",Dwight,"[oh, hurt, man, love]"


In [16]:
#Generate the vocabulary for the quotes tokens
#make_vocabulary function is from the labs
def make_vocabulary(corpus):
  """Map every distinct token in corpus to an integer index.

  Indexes start at 0 and are handed out in order of first appearance.

  Args:
    corpus: an iterable of token lists (one list per document).

  Returns:
    A dict from token to its index.
  """
  vocabulary={}
  for document in corpus:
    for token in document:
      #len(vocabulary) is the next unused index; a token that is already
      #present keeps the index it was given first
      vocabulary.setdefault(token, len(vocabulary))
  return vocabulary

#Token -> index mapping built from the tokenized training quotes
vocabTrain = make_vocabulary(tokensArr)

In [20]:
#Generate the document frequency for the quotes tokens
#doc_frequency function is from the labs

def doc_frequency(corpus):
  """Count, for every token, how many documents of corpus contain it.

  A token that occurs several times in the same document is counted once
  for that document.

  Args:
    corpus: an iterable of token lists (one list per document). Tokens
      must be hashable.

  Returns:
    A dict from token to the number of documents it appears in, with keys
    in order of first appearance.
  """
  dic={}
  for l in corpus:
    #dict.fromkeys de-duplicates the document's tokens in constant time per
    #token while keeping their order, instead of scanning a growing list
    for t in dict.fromkeys(l):
      dic[t]=dic.get(t, 0)+1
  return dic

#Number of training quotes each token appears in
trainDocFreq=doc_frequency(trainDSwTokens.tokens)