# CREATE WORDCLOUD FROM MULTIPLE DOCUMENTS

In [2]:
import os
import string
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from wordcloud import WordCloud,STOPWORDS
import PyPDF2
import textract

# STEP 1: Read the PDF files

## METHOD 1: Using textract

NOTE: textract supports reading of other documents as well like DOCX, JPG etc
Refer: https://textract.readthedocs.io/en/stable/

In [3]:
def read_file_textract(filepath):
    text = textract.process(filepath)
    return text.decode("utf-8") 

## METHOD 2: Using PyPDF

In [4]:
def read_file_pypdf(filepath):
    pdfFileObj = open(filepath,'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    num_pages = pdfReader.numPages
    text = ""
    # Read all the pages
    for pg in range(num_pages):
        page = pdfReader.getPage(pg)
        text += page.extractText()
    return text  
        
        

## Read file using any of the pdf readers

In [5]:
def read_file(filepath, use_method = 'textract'):
    
    text = ""
    if not os.path.isfile(filepath):
        print(f'Invalid file:{filepath}')
    else:
        if use_method == 'textract':
            return read_file_textract(filepath)
        elif use_method == 'pypdf':
            return read_file_pypdf(filepath)
        else:
            print('Invalid method to read file. Supported formats: "textract" or "pypdf".')
    
    return text

# STEP 2: Identify the keywords in the document

In [6]:
def extract_keywords(text, ignore_words = [],
                     min_word_length = 0,
                     ignore_numbers = True,
                     ignore_case = True):
    # Remove words with special characters
    filtered_text = ''.join(filter(lambda x:x in string.printable, text))
    
    # Create word tokens from the text string
    tokens = word_tokenize(filtered_text)
    
    # List of punctuations to be ignored 
    punctuations = ['(',')',';',':','[',']',',','.','--','-','#','!','*','"','%']
    
    # Get the stopwords list to be ignored
    stop_words = stopwords.words('english')

    # Convert ignore words from user to lower case
    ignore_words_lower = [x.lower() for x in ignore_words]
    
    # Combine all the words to be ignored
    all_ignored_words = punctuations + stop_words + ignore_words_lower
    
    # Get the keywords list
    keywords = [word for word in tokens \
                    if  word.lower() not in all_ignored_words
                    and len(word) >= min_word_length]    

    # Remove keywords with only digits
    if ignore_numbers:
        keywords = [keyword for keyword in keywords if not keyword.isdigit()]

    # Return all keywords in lower case if case is not of significance
    if ignore_case:
        keywords = [keyword.lower() for keyword in keywords]
    
    return keywords
    
    

# STEP 3: Create the wordcloud

In [7]:
def create_word_cloud(keywords, maximum_words = 100, bg = 'white', cmap='Dark2',
                     maximum_font_size = 256, width = 3000, height = 2000,
                     random_state = 42, fig_w = 15, fig_h = 10, output_filepath = None):
    
    # Convert keywords to dictionary with values and its occurences
    word_could_dict=Counter(keywords)

    wordcloud = WordCloud(background_color=bg, max_words=maximum_words, colormap=cmap, 
                          stopwords=STOPWORDS, max_font_size=maximum_font_size,
                          random_state=random_state, 
                          width=width, height=height).generate_from_frequencies(word_could_dict)
    
    plt.figure(figsize=(fig_w,fig_h))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    if output_filepath:
        plt.savefig(output_filepath, bbox_inches='tight')
    plt.show()
    plt.close()

# PUTTING IT TOGETHER

## WORDCLOUD FOR A SINGLE FILE

In [17]:
import os 
filepath = os.path.join('sample_files', 'AI_Blockchain_Survey_2022.pdf')
file_text = read_file(filepath, pypdf)
print(file_text)
outputfile = 'outputfile.jpg'
#keywords = extract_keywords(file_text,min_word_length = 3)
#create_word_cloud(keywords, maximum_words=100, maximum_font_size = 400, output_filepath = outputfile)
            

NameError: name 'file_text' is not defined

## WORDCLOUD FOR ALL FILES IN A FOLDER

In [20]:
docs_path = 'C:\\Users\\Andrew\\Documents\\DSO_work\\AAS papers\\readings'
ignore_words = ['Fig','like','e.g.','i.e.','one']
all_keywords = []

for filename in os.listdir(docs_path):
    filepath = os.path.join(docs_path, filename)
    if os.path.isfile(filepath) and filename.endswith('.pdf'):
        print(f'Parsing file: {filename}')
        try:
            file_text = read_file(filepath)
            keywords = extract_keywords(file_text,min_word_length = 3, ignore_words = ignore_words)
            all_keywords.extend(keywords)
        except:
            print(f'ERROR!!! Unable to parse file: {filename}. Ignoring file!!')
        

print(f'Completed reading all pdf files in folder:{docs_path}')

create_word_cloud(all_keywords, bg = 'black', cmap = 'Set2',random_state = 100, width = 1000, height = 1000)

Parsing file: 3D Reconstruction of Non-Cooperative Resident Space Object using Instant NeRF and D-NeRF.pdf
ERROR!!! Unable to parse file: 3D Reconstruction of Non-Cooperative Resident Space Object using Instant NeRF and D-NeRF.pdf. Ignoring file!!
Parsing file: 3UCubed The IMAP Student Collaboration CubeSat Project.pdf
ERROR!!! Unable to parse file: 3UCubed The IMAP Student Collaboration CubeSat Project.pdf. Ignoring file!!
Parsing file: A Comparison of Deep Reinforcement Learning Algorithms for Earth-Observing Satellite Scheduling.pdf
ERROR!!! Unable to parse file: A Comparison of Deep Reinforcement Learning Algorithms for Earth-Observing Satellite Scheduling.pdf. Ignoring file!!
Parsing file: A hybrid stochastic-deterministic integrator for spacecraft dynamics with uncertainty.pdf
ERROR!!! Unable to parse file: A hybrid stochastic-deterministic integrator for spacecraft dynamics with uncertainty.pdf. Ignoring file!!
Parsing file: A Monte Carlo Analysis of Contingency Optimal Guidance

ValueError: We need at least 1 word to plot a word cloud, got 0.

# ADDON: LIST OF TOP KEYWORDS AS DATAFRAME

In [10]:
import pandas as pd 
pd.set_option("max_rows", None)
distinct_keywords_df = pd.DataFrame(all_keywords,columns=['keywords']).value_counts().rename_axis('keyword').reset_index(name='count')
distinct_keywords_df['word_len'] = distinct_keywords_df['keyword'].apply(lambda x: len(x))
distinct_keywords_df.head(50)




Unnamed: 0,keyword,count,word_len
