In [1]:
import textract
import numpy as np
import scipy
import gensim
import os
import pandas as pd
import re
import math
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
#nltk.download('averaged_perceptron_tagger')
from collections import Counter
from matplotlib import pyplot as plt
from gensim import corpora, models
%matplotlib inline

In [2]:
class FocusGroup:
    """Transcript of one focus-group session, split into speaker turns.

    The transcript is read from ``FocusGroups/<filename>.docx``. Paragraphs
    that consist solely of a speaker label (a role name followed by a number
    and a colon, e.g. ``Parent 3:``) are collected as talkers; every other
    paragraph is collected as spoken text. The code assumes the two sequences
    line up one-to-one (label paragraph, then what that speaker said).

    A transcript may contain a second section, separated from the first by a
    larger gap, which is parsed the same way and stored in the
    ``*_only_moderators`` attributes.

    Attributes:
        raw_text: Extracted transcript text (see the note in ``__init__``).
        parent_moderator_discussion: Paragraphs of the first section.
        text_including_parents: numpy array of spoken paragraphs (first section).
        talkers_including_parents: numpy array of speaker names (first section).
        within_moderator_discussion: Paragraphs of the second section, or an
            empty list when the transcript has no second section.
        text_only_moderators: numpy array of spoken paragraphs (second section).
        talkers_only_moderators: numpy array of speaker names (second section).
        parent_list: Distinct talkers of the first section containing 'Parent'.
        moderator_list: Distinct talkers of the first section containing 'Moderator'.
    """

    # Role names that, followed by " <number>:", make a paragraph a speaker label.
    SPEAKER_ROLES=('Parent', 'Moderator', 'Administrator', 'Speaker')

    def __init__(self, filename):
        """Load the transcript named ``filename`` (without extension) and parse it."""
        # NOTE(review): the extracted value is passed through str(); the "b'"
        # stripping and the literal backslash-n separators used below indicate
        # this is the repr of a bytes object, so line breaks are the two
        # characters backslash + "n", non-ASCII characters appear as escape
        # sequences and every apostrophe is removed. Left unchanged on purpose:
        # the splitting below (and any downstream tuning) depends on this format.
        self.raw_text=str(textract.process('FocusGroups/' + filename + ".docx")).replace('b\'', '').replace('\'', '')

        # Sections are separated by three line breaks, paragraphs by two.
        sections=self.raw_text.split('\\n\\n\\n')

        self.parent_moderator_discussion=sections[0].split('\\n\\n')
        self.text_including_parents, self.talkers_including_parents=self._split_turns(
            self.parent_moderator_discussion)

        # Always define the second-section attributes (empty when the section is
        # absent) so that reading them never raises AttributeError.
        self.within_moderator_discussion=sections[1].split('\\n\\n') if len(sections)>1 else []
        self.text_only_moderators, self.talkers_only_moderators=self._split_turns(
            self.within_moderator_discussion)

        distinct_talkers=set(self.talkers_including_parents)
        self.parent_list=[participant for participant in distinct_talkers if 'Parent' in participant]
        self.moderator_list=[participant for participant in distinct_talkers if 'Moderator' in participant]

    @classmethod
    def _is_speaker_label(cls, paragraph):
        """Return True when ``paragraph`` is only a role name plus ' <number>:'."""
        # "[0-9]+" so that two-digit speakers ("Parent 10:") are recognised too;
        # a single-digit pattern would classify such labels as spoken text.
        return re.sub(r" [0-9]+:", "", paragraph) in cls.SPEAKER_ROLES

    @classmethod
    def _split_turns(cls, paragraphs):
        """Split paragraphs into (spoken_text_array, talker_name_array)."""
        spoken=[paragraph for paragraph in paragraphs if not cls._is_speaker_label(paragraph)]
        talkers=[paragraph.replace(':', '') for paragraph in paragraphs if cls._is_speaker_label(paragraph)]
        return np.array(spoken), np.array(talkers)

    def get_participant_text(self, participant):
        """Return the list of paragraphs spoken by ``participant``.

        Args:
            participant: Talker name without the colon, e.g. ``'Parent 2'``.

        Returns:
            For names containing 'Parent': their paragraphs from the first
            section. For any other name (moderators, administrators, speakers):
            their paragraphs from the first section followed by those from the
            second section, if any. An unknown name yields an empty list.
        """
        mask=[member==participant for member in self.talkers_including_parents]
        text_from_parent_discussion=list(self.text_including_parents[mask])
        if 'Parent' in participant:
            return text_from_parent_discussion

        mask=[member==participant for member in self.talkers_only_moderators]
        text_from_moderator_discussion=list(self.text_only_moderators[mask])
        return text_from_parent_discussion + text_from_moderator_discussion

In [3]:
# Extra words removed in addition to NLTK's English stopwords.
custom_stopwords=[
    'go', 'parent', 'say', '0', 'yeah', 'would', 'okay', 'start', 'also', 'well',
    'u', 'thank', 'inaudible', 'crosstalk', 'able', 'hear', 'actually', 'hi', 'oh',
    'definitely', 'part', 'anything', 'sure', 'anyone', 'yes', 'thanks', 'everything',
    'end', 'everybody', 'tand', 'administrator', 'whatever', 'sound', 'ti', 'moderator',
    'though', 'mute', 'speak', 'silence', 'finish', 'bye', 'audio',
]
# Base stopword list: NLTK English stopwords followed by the custom ones.
stopwords_list=stopwords.words('english')+custom_stopwords
def remove_stopwords_function(tokenized_text, stopwords):
    """Return the tokens of ``tokenized_text`` that are not in ``stopwords``.

    Args:
        tokenized_text: Iterable of tokens (strings).
        stopwords: Iterable of tokens to drop.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    # Build a set once so each membership test is O(1) instead of a linear
    # scan of the stopword list for every token.
    excluded=set(stopwords)
    return [word for word in tokenized_text if word not in excluded]
# First letter of a POS tag -> POS code passed to the lemmatizer.
pos_tags_lemmatize_mapping_dict=dict(N='n', V='v', J='a', R='r')
# Single lemmatizer object, used as the default by the lemmatization helpers.
lemmatizer_instance=WordNetLemmatizer()

def pos_mapping_function(pos_tag, dictionary=pos_tags_lemmatize_mapping_dict):
    """Map a POS tag to the POS code used for lemmatization.

    Args:
        pos_tag: POS tag string; only its first character is inspected.
        dictionary: Mapping from the tag's first letter to a POS code.

    Returns:
        The value of ``dictionary`` for the tag's first letter, or ``'n'``
        when that letter is not a key or the tag is empty.
    """
    # Look the letter up in ``dictionary`` itself rather than in a separate
    # hard-coded letter list, so a custom mapping is fully honoured. Slicing
    # (not indexing) lets an empty tag fall through to the default.
    return dictionary.get(pos_tag[:1], 'n')
    
def lemmatizer_function(text, dictionary=pos_tags_lemmatize_mapping_dict, pos_mapping_function=pos_mapping_function,
                       lemmatizer=lemmatizer_instance):
    """Lemmatize a list of tokens using their part-of-speech tags.

    Args:
        text: List of tokens, tagged as one sequence with ``nltk.pos_tag``.
        dictionary: Mapping handed to ``pos_mapping_function`` as its second
            argument.
        pos_mapping_function: Callable ``(pos_tag, dictionary)`` returning the
            POS code to pass to the lemmatizer.
        lemmatizer: Object exposing ``lemmatize(word, pos=...)``.

    Returns:
        List of lemmas, one per input token, in the same order.
    """
    tagged_tokens=nltk.pos_tag(text)
    # Use the injected ``lemmatizer`` and ``dictionary`` instead of the module
    # globals, so callers' overrides actually take effect.
    return [lemmatizer.lemmatize(word, pos=pos_mapping_function(pos_tag, dictionary))
            for word, pos_tag in tagged_tokens]

def text_processing_pipeline(text_list,additional_stopwords, min_token_count=1, stopwords_list=stopwords_list, 
                             lemmatizer_function=lemmatizer_function, dictionary=pos_tags_lemmatize_mapping_dict,
                             pos_mapping_function=pos_mapping_function, lemmatizer=lemmatizer_instance):
    """Clean, tokenize, filter and lemmatize a collection of texts.

    Steps per text: lowercase, replace every character that is not a letter
    or digit with a space, split on whitespace, drop the text unless it has
    MORE than ``min_token_count`` tokens (counted before stopword removal),
    remove stopwords, lemmatize, remove stopwords again.

    Args:
        text_list: Iterable of strings.
        additional_stopwords: Iterable of extra stopwords for this call.
        min_token_count: Texts with this many tokens or fewer are discarded.
        stopwords_list: Base stopwords.
        lemmatizer_function: Callable taking a token list plus the keyword
            arguments ``dictionary``, ``pos_mapping_function`` and
            ``lemmatizer``; returns the lemmatized token list.
        dictionary: Forwarded to ``lemmatizer_function``.
        pos_mapping_function: Forwarded to ``lemmatizer_function``.
        lemmatizer: Forwarded to ``lemmatizer_function``.

    Returns:
        Tuple ``(processed_tokens, kept_original_texts)``: the processed token
        list of every text that passed the length filter, and the matching
        unmodified input texts in the same order.
    """
    # One set for all membership tests instead of scanning a concatenated list.
    all_stopwords=set(stopwords_list).union(additional_stopwords)

    # Tokenize and apply the length filter in a single pass so the kept
    # originals and their token lists stay paired by construction.
    kept_original_texts=[]
    kept_token_lists=[]
    for original_text in text_list:
        tokens=re.sub(r"[^a-zA-Z0-9]", " ", original_text.lower()).split()
        if len(tokens)>min_token_count:
            kept_original_texts.append(original_text)
            kept_token_lists.append(tokens)

    processed_token_lists=[]
    for tokens in kept_token_lists:
        tokens=remove_stopwords_function(tokens, all_stopwords)
        lemmas=lemmatizer_function(tokens, dictionary=dictionary,
                                   pos_mapping_function=pos_mapping_function, lemmatizer=lemmatizer)
        # Second removal: a lemma can be a stopword even if the surface form was not.
        processed_token_lists.append(remove_stopwords_function(lemmas, all_stopwords))
    return processed_token_lists, kept_original_texts

In [6]:
file_list=['Gaming_Group1', 'Gaming_Group2', 'Gaming_Group3', 'Gaming_Group4',
           'LowPIU_Group1', 'LowPIU_Group2', 'LowPIU_Group3',
           'Media_Group1', 'Media_Group2', 'Media_Group3', 'Media_Group4',
           'Social_Group1', 'Social_Group2', 'Social_Group3', 'Social_Group4']
# Group name of each file = file name with its digits removed.
file_group_names=[re.sub('[0-9]', '', file) for file in file_list]
# Number of files per group, in order of first appearance in file_list.
additional_stopword_counts=list(Counter(file_group_names).values())
Gaming_group_stopwords=['like', 'get', 'school', 'hour', 'day', 'even', 'think', 'thing', 'way', 'know', 'year', 'week', 'really', 'one',
                       'kid', 'game', 'use', 'time', 'want', 'play', 'much', 'back']
Low_PIU_group_stopwords=['school', 'like', 'time', 'get', 'think', 'kid', 'really',
                        'thing', '00', 'technology', 'year', 'child', 'back', 'lot',
                        'even', 'know', 'want', 'old', 'one']
Media_group_stopwords=['like', 'thing', 'get', 'really', 'kid', 'time', 'want',
                      'school', 'think', 'know', 'one', 'use',
                      'year', 'much', 'back', 'work', 'person', 'pandemic',
                      'see', 'lot', 'good', 'little', 'day', 'old']
Social_group_stopwords=['like', 'get', 'think', 'know', 'thing', 'time', 'school',
                       'really', 'child', 'see', 'want',
                       'kid', 'one', 'lot', 'even']
# Explicit group -> stopwords mapping. Pairing by name (instead of relying on
# the groups appearing in file_list in one particular order) means reordering
# file_list cannot silently attach the wrong stopwords, and an unknown group
# fails loudly with a KeyError.
stopwords_by_group={'Gaming_Group': Gaming_group_stopwords,
                    'LowPIU_Group': Low_PIU_group_stopwords,
                    'Media_Group': Media_group_stopwords,
                    'Social_Group': Social_group_stopwords}
# One stopword list per file, aligned with file_list.
additional_stopwords_list=[stopwords_by_group[group_name] for group_name in file_group_names]
# Only texts with more than this many tokens are kept by the pipeline.
MIN_TOKEN_COUNT=60
all_focusgroup_text=[FocusGroup(focus_group_file) for focus_group_file in file_list]
all_focusgroup_processed_text=[text_processing_pipeline(focus_group.text_including_parents, additional_stopword_list,
                                                        min_token_count=MIN_TOKEN_COUNT)
                               for focus_group, additional_stopword_list in zip(all_focusgroup_text, additional_stopwords_list)]
# All processed tokens of all focus groups in one flat list (element [0] of each
# pipeline result is the list of processed token lists).
flattened_list=[token for focusgroup in all_focusgroup_processed_text for text in focusgroup[0] for token in text]

In [137]:
# Tally every processed token, then save the (word, count) rows ordered from
# most to least frequent.
word_counts=Counter(flattened_list)
most_frequent_words_df=pd.DataFrame(word_counts.most_common())
most_frequent_words_df.to_csv('Most_frequent_words_from_survey.csv')