In [1]:
# Ben Caruso
# Pipeline to read in word documents

import docx

import os
import subprocess

## Functionality to read in docs, convert to docx, and then read them all into jupyter

**Step 1: Conversion of documents to .docx extension**

Initially, the documents I was given were transcripts of audio recordings saved as .doc files in Word. When I tried to read them with the python-docx library, I found that it could not open them, because python-docx only reads the newer .docx format. Thus, I needed to convert each one to a .docx file before reading it in. I used the subprocess module to automate the conversion rather than convert every file by hand. The function below checks the current directory for all .doc files and runs a textutil command (textutil is a file-conversion tool built into macOS) to save a copy of each document as a .docx file. For this to work, the current directory must be the directory that contains the .doc files you want to convert; change it with os.chdir(dir_name).

In [2]:
# Change current directory to desired directory
#os.chdir(dir_name)

# Script to convert docs within a given directory to .docx - makes a copy for each doc
def convert_doc_to_docx():
    '''
    Convert every .doc file in the current directory to .docx.

    Each file is passed to the textutil command (built into macOS), which creates
    a copy with the .docx extension. On Windows, soffice would be needed instead.
    The current working directory must already be the directory that holds the
    .doc files.

    Returns
        None. A warning is issued for every file whose conversion command exits
        with a non-zero status, so failed conversions do not go unnoticed.
    '''
    import warnings  # local import so this definition does not rely on other cells

    # os.listdir defaults to the current directory; sorted() makes the
    # conversion order the same on every run
    for filename in sorted(os.listdir()):
        # Word keeps a "~$name.doc" owner file next to any document that is open.
        # It is not a real document, so it is skipped rather than converted.
        if not filename.endswith('.doc') or filename.startswith('~$'):
            continue

        exit_code = subprocess.call(['textutil', '-convert', 'docx', filename])
        if exit_code != 0:
            warnings.warn(
                'textutil exited with status {} while converting {!r}'.format(exit_code, filename)
            )

**Step 2: Read in documents with .docx extension**

The read_in_docx() function defined further below reads all .docx files from the current directory using the python-docx module and calls my docx_to_text() function to convert each one into blocks of text. It assumes consistent formatting across all the .docx files passed in, which is important: docx_to_text() was written to handle the format of the interviews I've been working with and will only work with that exact format, which suffices for this problem but may not for others. Again, for this function to work properly, the current directory must contain the .docx files of interest.

In [3]:
def docx_to_text(docx):
    '''
    Split one interview document into the interviewer's text and the farmer's text.

    Input
        docx (docx.Document object): Word document loaded with python-docx

    Returns
        Interviewer (string): Lowercased block of text built from every paragraph
                              containing 'Interviewer' or 'Meredith'
        Farmer (string): Lowercased block of text built from all other paragraphs

    Every statement in a block is followed by a single space. Empty paragraphs
    are ignored.
    '''

    def spoken_part(line):
        # "label<TAB>statement" -> "statement" (the piece between the first and
        # second tab); lines without a tab are kept whole
        return line.split('\t')[1] if '\t' in line else line

    interviewer_lines = []
    farmer_lines = []

    for paragraph in docx.paragraphs:
        line = paragraph.text
        if line == '':
            continue

        # NOTE(review): this is a substring match on the whole paragraph, so a
        # farmer paragraph that mentions either word is filed under the interviewer.
        from_interviewer = 'Interviewer' in line or 'Meredith' in line
        target = interviewer_lines if from_interviewer else farmer_lines
        target.append(line)

    interviewer_block = ''.join(spoken_part(line) + " " for line in interviewer_lines)
    farmer_block = ''.join(spoken_part(line) + " " for line in farmer_lines)

    # Lowercasing is the only preprocessing applied at this stage
    return interviewer_block.lower(), farmer_block.lower()

In [4]:
def read_docx(docx):
    '''
    Split one interview document into indexed interviewer and interviewee statements.

    Input:
        docx : docx.Document object - Word document loaded with python-docx

    Output:
        Interviewer (list): (index, text) tuples for paragraphs containing
                            'Interviewer' or 'Meredith'
        Interviewee (list): (index, text) tuples for all other paragraphs

    Only paragraphs that contain a tab are kept, and the text is the piece between
    the first and second tab. The index is half the paragraph's position among the
    non-empty paragraphs (interviewee positions are shifted back by one first), so
    a question and the answer that directly follows it share an index when the two
    speakers strictly alternate.
    '''

    # Non-empty paragraphs, in document order
    spoken = [paragraph.text for paragraph in docx.paragraphs if paragraph.text != '']

    interviewer_turns = []
    interviewee_turns = []

    for position, line in enumerate(spoken):
        if '\t' not in line:
            continue
        statement = line.split('\t')[1]

        # int() truncates toward zero, so a shifted position of -1 maps to 0
        if 'Interviewer' in line or 'Meredith' in line:
            interviewer_turns.append((int(position / 2), statement))
        else:
            interviewee_turns.append((int((position - 1) / 2), statement))

    return interviewer_turns, interviewee_turns

In [5]:
# Function to read in all docx files from the current directory
# Make sure you are in the correct directory before proceeding with this function
def read_in_docx():
    '''
    Read every Word document in the current directory into blocks of text.

    Each file whose name ends in '.docx' is opened with python-docx and passed to
    docx_to_text(). The current working directory must already be the directory
    that holds the documents.

    Returns
        doc_texts (list): One docx_to_text() result per document, in filename
                          order - an (interviewer text, farmer text) tuple
    '''
    doc_texts = []

    # os.listdir() returns names in arbitrary order; sorting keeps the results
    # (and anything later concatenated from them) in the same order on every run
    for filename in sorted(os.listdir()):
        # Word keeps a "~$name.docx" owner file next to any document that is open.
        # It is not a real document and cannot be parsed, so skip it.
        if not filename.endswith('.docx') or filename.startswith('~$'):
            continue

        # Convert doc object to a block of text - ASSUMES CONSISTENT FORMATTING
        doc_texts.append(docx_to_text(docx.Document(filename)))

    return doc_texts

In [6]:
# Remember the starting directory so it is restored even if reading fails part-way;
# otherwise a re-run of this cell would no longer find the relative path.
start_dir = os.getcwd()
os.chdir('Vermont Transcripts/farmer_interviews')
try:
    # Works on the Vermont transcripts / farmer interviews.
    # Returns a list of tuples that consist of the entire interviewer text and farmer text
    vt_farmer_texts = read_in_docx()
finally:
    os.chdir(start_dir)

In [7]:
# Remember the starting directory so it is restored even if reading fails part-way;
# otherwise a re-run of this cell would no longer find the relative path.
start_dir = os.getcwd()
os.chdir('Vermont Transcripts/expert_interviews/')
try:
    vt_expert_texts = read_in_docx()
finally:
    os.chdir(start_dir)

In [8]:
# Remember the starting directory so it is restored even if reading fails part-way;
# otherwise a re-run of this cell would no longer find the relative path.
start_dir = os.getcwd()
os.chdir('UMaine Transcripts/farmer_interviews')
try:
    me_farmer_texts = read_in_docx()
finally:
    os.chdir(start_dir)

In [9]:
# Remember the starting directory so it is restored even if reading fails part-way;
# otherwise a re-run of this cell would no longer find the relative path.
start_dir = os.getcwd()
os.chdir('UMaine Transcripts/expert_interviews')
try:
    me_expert_texts = read_in_docx()
finally:
    os.chdir(start_dir)

**Step 3: Convert to texts**

Next, I convert the list of tuples returned by read_in_docx() with a simple text_process() function, which returns one large string containing only the interviewees' (farmers' or experts') responses, i.e. the second element of each tuple.

In [10]:
def text_process(text_param):
    '''
    Concatenate the second element of every tuple into one string.

    Input
        text_param (list): List of tuples of strings, e.g. the
                           (interviewer text, farmer text) tuples from read_in_docx()

    Returns
        (string): The element at position 1 of each tuple, joined with no
                  separator. For read_in_docx() output this is the farmer /
                  interviewee text, not the interviewer's.
    '''
    return "".join(pair[1] for pair in text_param)

In [11]:
# Collapse each collection's list of tuples into a single string, in the same
# order as the names on the left-hand side
vt_farmer_text, vt_expert_text, me_farmer_text, me_expert_text = map(
    text_process,
    (vt_farmer_texts, vt_expert_texts, me_farmer_texts, me_expert_texts),
)

In [12]:
# First 100 characters of the concatenated Vermont farmers' interview responses.
# The text is unprocessed apart from being converted to lowercase; it is the raw
# data that will be processed using spaCy's nlp() function.
vt_farmer_text[:100]

'sure. yes, i’m hannah doyle, i have boneyard farm here, and it is 10 acres total, though a lot of th'

In [13]:
# First 100 characters of the concatenated Vermont experts' interview responses.
# The text is unprocessed apart from being converted to lowercase; it is the raw
# data that will be processed using spaCy's nlp() function.
vt_expert_text[:100]

'cool. i didn’t prepare anything. i mean, i just –i made a couple notes this morning. okay, good. yea'

Write the texts out to .txt files so that I can perform preprocessing on the actual text data in another script.

In [33]:
# Write each combined text string out to its own text file
texts_by_filename = {
    "vt_farmer_text.txt": vt_farmer_text,
    "vt_expert_text.txt": vt_expert_text,
    "me_farmer_text.txt": me_farmer_text,
    "me_expert_text.txt": me_expert_text,
}

for out_name, out_text in texts_by_filename.items():
    # `with` closes the handle even if the write fails. UTF-8 is requested
    # explicitly because the transcripts contain non-ASCII punctuation (curly
    # apostrophes, dashes) that a platform-default encoding may not represent.
    with open(out_name, "w", encoding="utf-8") as out_file:
        out_file.write(out_text)

In [189]:
def read_in_docs_as_qna():
    '''
    Read every Word document in the current directory as indexed Q&A statements.

    Each file whose name ends in '.docx' is opened with python-docx and passed to
    read_docx(). The current working directory must already be the directory
    that holds the documents.

    Returns
        doc_texts (list): One read_docx() result per document, in filename order -
                          an (interviewer list, interviewee list) pair
    '''
    doc_texts = []

    # os.listdir() returns names in arbitrary order; sorting keeps the results
    # in the same order on every run
    for filename in sorted(os.listdir()):
        # Word keeps a "~$name.docx" owner file next to any document that is open.
        # It is not a real document and cannot be parsed, so skip it.
        if not filename.endswith('.docx') or filename.startswith('~$'):
            continue

        # Convert doc object to Q&A tuples - ASSUMES CONSISTENT FORMATTING
        doc_texts.append(read_docx(docx.Document(filename)))

    return doc_texts

In [197]:
def read_files(qna = True):
    """Read every transcript collection in one call.

    Visits four hard-coded directories in turn (Vermont farmer, Vermont expert,
    UMaine farmer, UMaine expert), reads the .docx files in each one, and moves
    back up two directory levels after each read. All paths are relative, so
    this must be run from the directory that contains 'Vermont Transcripts'
    and 'UMaine Transcripts'; double-check the path names first.

    Input:
        qna (bool): If True, each directory is read with read_in_docs_as_qna();
                    otherwise it is read with read_in_docx().

    NOTE(review): the original summary says this "Returns 4 items", but no
    return statement appears in the part of the function visible here -
    confirm that the four *_texts lists are actually returned to the caller.
    """
    
    # Vermont farmer interviews
    os.chdir('Vermont Transcripts/farmer_interviews')
    if qna:
        vt_farmer_texts = read_in_docs_as_qna()
    else:
        vt_farmer_texts = read_in_docx()
    # Step back up to the starting directory before entering the next collection
    os.chdir('../../')
    
    # Vermont expert interviews
    os.chdir('Vermont Transcripts/expert_interviews/')
    if qna:
        vt_expert_texts = read_in_docs_as_qna()
    else:
        vt_expert_texts = read_in_docx()
    os.chdir('../../')

    # UMaine farmer interviews
    os.chdir('UMaine Transcripts/farmer_interviews/')
    if qna:
        me_farmer_texts = read_in_docs_as_qna()
    else:
        me_farmer_texts = read_in_docx()
    os.chdir('../../')

    # UMaine expert interviews
    os.chdir('UMaine Transcripts/expert_interviews/')
    if qna:
        me_expert_texts = read_in_docs_as_qna()
    else:
        me_expert_texts = read_in_docx()
    os.chdir('../../')