In [3]:
import textract
import numpy as np
import scipy
import gensim
import os
import pandas as pd
import re
import math
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
#nltk.download('averaged_perceptron_tagger')
from collections import Counter
from matplotlib import pyplot as plt
from gensim import corpora, models
%matplotlib inline

In [4]:
# Baseline English stopwords from the NLTK stopwords corpus; used below as the
# default `stopwords_list` argument of text_processing_pipeline.
stopwords_list = stopwords.words('english')
def remove_stopwords_function(tokenized_text, stopwords):
    """Return the tokens of ``tokenized_text`` that are not in ``stopwords``.

    Parameters
    ----------
    tokenized_text : iterable of tokens, scanned in order.
    stopwords : any container supporting ``in`` (list, set, ...).

    Returns
    -------
    list
        The surviving tokens, in their original order (duplicates kept).
    """
    kept_tokens = []
    for word in tokenized_text:
        if word in stopwords:
            continue
        kept_tokens.append(word)
    return kept_tokens
# Single WordNet lemmatizer shared by the helpers below (their default
# `lemmatizer` argument).
lemmatizer_instance = WordNetLemmatizer()
# Maps the first letter of a POS tag to the one-letter POS code that is passed
# to the lemmatizer's `pos` argument.
pos_tags_lemmatize_mapping_dict={'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}

def pos_mapping_function(pos_tag, dictionary=pos_tags_lemmatize_mapping_dict):
    """Translate a POS tag into the POS code used for lemmatization.

    Parameters
    ----------
    pos_tag : str
        A POS tag such as those returned by ``nltk.pos_tag``; only its first
        character is inspected.
    dictionary : dict
        Mapping from the tag's first character to the lemmatizer POS code.

    Returns
    -------
    str
        ``dictionary[pos_tag[0]]`` when that key exists, otherwise ``'n'``.
        An empty tag also yields ``'n'`` instead of raising ``IndexError``.
    """
    # Slicing (rather than indexing) keeps an empty tag from raising, and
    # looking the key up in `dictionary` itself means a caller-supplied
    # mapping is honoured instead of a hard-coded list of letters.
    return dictionary.get(pos_tag[:1], 'n')
    
def lemmatizer_function(text, dictionary=pos_tags_lemmatize_mapping_dict, pos_mapping_function=pos_mapping_function,
                       lemmatizer=lemmatizer_instance):
    """Lemmatize a tokenized text using POS-aware lemmatization.

    Parameters
    ----------
    text : list of str
        Tokens of one document; passed as-is to ``nltk.pos_tag``.
    dictionary : dict
        POS mapping handed to ``pos_mapping_function`` as its second argument.
    pos_mapping_function : callable
        Called as ``pos_mapping_function(pos_tag, dictionary)``; must return
        the value used for the lemmatizer's ``pos`` argument.
    lemmatizer : object
        Anything exposing ``lemmatize(word, pos=...)``.

    Returns
    -------
    list
        One lemma per input token, in the original order.
    """
    # Use the `lemmatizer` and `dictionary` arguments rather than the module
    # globals, so that callers overriding them actually get what they passed.
    return [
        lemmatizer.lemmatize(word, pos=pos_mapping_function(pos_tag, dictionary))
        for word, pos_tag in nltk.pos_tag(text)
    ]

def text_processing_pipeline(text_list,additional_stopwords, min_token_count=1, stopwords_list=stopwords_list,
                             lemmatizer_function=lemmatizer_function, dictionary=pos_tags_lemmatize_mapping_dict,
                             pos_mapping_function=pos_mapping_function, lemmatizer=lemmatizer_instance):
    """Clean, tokenize, filter and lemmatize a list of raw texts.

    Steps applied to every text, in order:
      1. lowercase;
      2. replace every character outside ``[a-zA-Z0-9]`` with a space;
      3. split on whitespace;
      4. discard the text unless it has MORE than ``min_token_count`` tokens
         (the count is taken before stopword removal);
      5. remove stopwords, lemmatize, then remove stopwords again so that
         lemmas which match a stopword are dropped as well.

    Parameters
    ----------
    text_list : iterable
        Raw texts. Entries that are not ``str`` (e.g. NaN coming from a pandas
        column) are skipped instead of raising on ``.lower()``.
    additional_stopwords : iterable of str or None
        Extra stopwords added to ``stopwords_list`` for this call only.
    min_token_count : int
        Texts with this many tokens or fewer are discarded.
    stopwords_list : iterable of str
        Baseline stopwords.
    lemmatizer_function : callable
        Called as ``lemmatizer_function(tokens, dictionary=...,
        pos_mapping_function=..., lemmatizer=...)`` and must return a list of
        lemmas.
    dictionary, pos_mapping_function, lemmatizer
        Forwarded to ``lemmatizer_function``.

    Returns
    -------
    tuple (list of list of str, list of str)
        The processed token lists and the corresponding original texts, index
        aligned. A token list may be empty if every token was a stopword.
    """
    # A set gives constant-time membership tests; the union also avoids
    # mutating either input and tolerates any iterable (or None) for the extras.
    all_stopwords = set(stopwords_list) | set(additional_stopwords or ())

    processed_texts = []
    kept_original_texts = []
    for original_text in text_list:
        if not isinstance(original_text, str):
            continue  # missing / non-text entry: nothing to tokenize
        tokens = re.sub(r"[^a-zA-Z0-9]", " ", original_text.lower()).split()
        if len(tokens) <= min_token_count:
            continue
        tokens = [token for token in tokens if token not in all_stopwords]
        tokens = lemmatizer_function(tokens, dictionary=dictionary,
                                     pos_mapping_function=pos_mapping_function,
                                     lemmatizer=lemmatizer)
        tokens = [token for token in tokens if token not in all_stopwords]
        processed_texts.append(tokens)
        kept_original_texts.append(original_text)
    return processed_texts, kept_original_texts

In [16]:
crisis_logger=pd.read_csv('CrisisLogger/crisislogger.csv')
# Extra words removed from the CrisisLogger transcriptions on top of the
# English stopwords.
additional_stopwords=['time','get','go','school', 'work', 'know', 'like', 'really', 'think', 'home',
                     'family', 'feel', 'thing', 'kid', 'day', 'u', 'lot', 'child', 'also',
                     'make', 'people', 'one', 'see', 'take', 'friend', 'year', 'much',
                     'help', 'need', 'way', 'even',
                     'well', 'try', 'kind', 'daughter', 'would',
                     'able', 'old', 'say', 'back', 'want']
# Keep the global list extended as before, but de-duplicate (order preserved)
# so that re-running this cell does not append the same words again.
stopwords_list=list(dict.fromkeys(stopwords_list+additional_stopwords))
# Pass only the extra words: the pipeline already adds them to its own
# baseline stopword list, so passing the combined list would duplicate it.
processed_transcriptions=text_processing_pipeline(list(crisis_logger.transcriptions), additional_stopwords)
# processed_transcriptions[0] holds the token lists, [1] the kept original texts.
flattened_list=[token for crisislog in processed_transcriptions[0] for token in crisislog]
most_frequent_words_df=pd.DataFrame(Counter(flattened_list).most_common())
most_frequent_words_df.to_csv('Most_frequent_words_from_crisislogger.csv')

In [5]:
# Survey folders; the CSV file names use the folder name with spaces replaced
# by underscores.
folders = ['April 2020', 'May 2020', 'November 2020', 'April 2021']

# One DataFrame per folder for the "Adult" files ...
Prolific_academic_list_adult = [
    pd.read_csv(f"ProlificAcademic/{folder}/Data/CRISIS_Adult_{folder.replace(' ', '_')}.csv")
    for folder in folders
]
# ... and one per folder for the "Parent" files.
Prolific_academic_list_parent = [
    pd.read_csv(f"ProlificAcademic/{folder}/Data/CRISIS_Parent_{folder.replace(' ', '_')}.csv")
    for folder in folders
]

# Adult surveys first, then parent surveys.
Prolific_academic_list = [*Prolific_academic_list_adult, *Prolific_academic_list_parent]

# Raw `specifypositive` column of every survey, one list per survey.
specify_positive_data_orig = [
    list(survey_df['specifypositive']) for survey_df in Prolific_academic_list
]

  if (yield from self.run_code(code, result)):
  if (yield from self.run_code(code, result)):


In [7]:
# Survey folders; the CSV file names use the folder name with spaces replaced
# by underscores.
folders = ['April 2020', 'May 2020', 'November 2020', 'April 2021']
# No extra stopwords for the ProlificAcademic answers.
additional_stopwords_prolific_academic = []

Prolific_academic_list_adult = [
    pd.read_csv(f"ProlificAcademic/{folder}/Data/CRISIS_Adult_{folder.replace(' ', '_')}.csv")
    for folder in folders
]
Prolific_academic_list_parent = [
    pd.read_csv(f"ProlificAcademic/{folder}/Data/CRISIS_Parent_{folder.replace(' ', '_')}.csv")
    for folder in folders
]
# Adult surveys first, then parent surveys.
Prolific_academic_list = [*Prolific_academic_list_adult, *Prolific_academic_list_parent]

# Raw `specifypositive` column of every survey, one list per survey.
specify_positive_data_orig = [
    list(survey_df['specifypositive']) for survey_df in Prolific_academic_list
]


def keep_only_string(survey):
    """Return the answers of ``survey`` whose type is exactly ``str``."""
    return [answer for answer in survey if type(answer) is str]


# All string answers pooled across surveys, in survey order.
specify_positive_data = [
    answer
    for survey in specify_positive_data_orig
    for answer in keep_only_string(survey)
]

# Per survey: the pipeline's (token lists, kept original answers) tuple.
specify_positive_data_by_survey = [
    text_processing_pipeline(
        keep_only_string(survey),
        additional_stopwords=additional_stopwords_prolific_academic,
    )
    for survey in specify_positive_data_orig
]

# Pooled answers: keep only the token lists (first element of the tuple).
processed_specify_positive_data = text_processing_pipeline(
    specify_positive_data,
    additional_stopwords=additional_stopwords_prolific_academic,
)[0]

tokens_from_specify_positive_data = [
    token
    for answer_tokens in processed_specify_positive_data
    for token in answer_tokens
]

# Token frequencies, most common first, written out as CSV.
most_frequent_words_df = pd.DataFrame(
    Counter(tokens_from_specify_positive_data).most_common()
)
most_frequent_words_df.to_csv('Most_frequent_words_from_ProlificAcademic.csv')