In [3]:
#Goal: to practice using Python and Jupyter Notebooks to analyze text and then visualize the results. 

#For this crude project, I will look policy texts and see how their explicit use of key words of globalization, neoliberalism, and human capital theory change over time. I will be first define the key words I am looking for, then I will iterate over the policy text to count the frequency of the times these words are used. The policy documents will then be compared based on year published and the policy writing institution.

In [4]:
#Label and define key words

#Ideas and buzzwords were taken from skimming some articles on the topics. There isn't a great reason theoretical for choosing these specific articles, instead they serve as a quick guide to get a crude list of words for the main goal of improving coding and textual analysis. 

# Human Capital Theory: Holden and Biddle (2017), "The Introduction of Human Capital Theory into Education Policy in the United States", https://doi.org/10.1215/00182702-4296305
# Internationalization : Knight (1994), Internationalization-Elements and Checkpoints

human_capital_theory = ["economic growth", "economy", "economic" "poverty", "investment", "human capital", "capital", "unemployment", "employment", "jobs", "development", "careers"]

internationalization = ["international", "internationalization", "internationalisation", "internationalized" "global", "globalism", "globalization", "globalisation", "foreign", "cross-cultural",  ]

In [5]:
#importing libraries and Defining core functions, 

from collections import Counter
import os
import pandas as pd
import matplotlib.pylab as plt

def count_words (text):
    """
    Takes a text and returns a dict with word + number of times used
    """
    word_count = {}
    text = text.lower()
    skip_punctuation = [",", ".", "''", '""',";", ":", "-", "?", "!"]
    for char in skip_punctuation:
        text = text.replace(char,"")
    for word in text.split(" "):
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1
    return word_count

def python_counter(text):
    """
    Create a counting dictionary of unique words and word counts, from an inputted text (that removes puncuation and lowercases)
    """
    text = text.lower()
    skip_punctuation = [",", ".", "'", '"',";", ":"]
    for char in skip_punctuation:
        text = text.replace(char,"")
    word_counts = Counter(text.split(" "))
    return word_counts

def read_book(title_path):
    """
    Read a book and return it as a string.
    """
    with open(title_path, "r", encoding="utf8") as current_file:
        text = current_file.read()
        text = text.replace("\n", "").replace("\r","")
    return text

def word_stats(python_counter):
    num_unique = len(python_counter)
    counts = python_counter.values()
    return (num_unique, counts)

def wordgroup_stats(python_counter, wordlist):
    """
    Takes dic of unique words and count, read through pythoncounter, and a wordlist.
    Outputs a dictionary of matched words and their count.
    """
    unique_textwords = python_counter
    matchedwords = {}
    for word, value in unique_textwords.items():
        if word in wordlist:
            matchedwords[word] = value
    no_of_matched = len(matchedwords.keys())
    return matchedwords, no_of_matched

    

In [6]:
#texts to upload and use
ADBtextpaths = ["/Users/samschmoker/Desktop/Coding/Projects/MasterPython/DataPractice/Policy/ADB/ADBpdftotext/ADB Education Policy 2002.txt", "/Users/samschmoker/Desktop/Coding/Projects/MasterPython/DataPractice/Policy/ADB/ADBpdftotext/ADB, 2008-Education is a Core Operation Area of ADB.txt", "/Users/samschmoker/Desktop/Coding/Projects/MasterPython/DataPractice/Policy/ADB/ADBpdftotext/ADB, 2013a-ADB’s Support for Achieving the Millennium Development Goals.txt"]



In [7]:
#Lets see what words are commonly used in all documents

wordsindocs = {}
wordsindoc1 = {}
wordsindoc2 = {}
wordsinalldocs= {}
for document in ADBtextpaths[0:1]:
    title = document.rsplit("/", 1)[-1]
    text = read_book(document)
    wordsindoc1 = count_words(text)

for document in ADBtextpaths[1:2]:
    title = document.rsplit("/", 1)[-1]
    text = read_book(document)
    wordsindoc2 = count_words(text)
    for word in wordsindoc1.keys():
        if word in wordsindoc2.keys():
            wordsinalldocs[word] = 1
 
for document in ADBtextpaths[2:]:
    title = document.rsplit("/", 1)[-1]
    text = read_book(document)
    wordsinthisdoc = count_words(text)
    for word in wordsinalldocs.copy():
        if word in wordsinthisdoc:
            wordsinalldocs[word] += 1
        else:
            del wordsinalldocs[word]

for word in wordsinalldocs.copy():
    if len(word) < 4:
        del wordsinalldocs[word]

wordsgreaterthan4 = sorted (list(wordsinalldocs.keys()))


print (len(wordsgreaterthan4))



219


In [9]:
#Clean it!
#Inspect the Word list for words greater than 4. Perhaps there are some generic words that should be taken out. 

#print (wordsgreaterthan4)

wordsto_remove = ["adb's'", "areas", "beyond", "billion", "despite", "numbers", "about", "also","basic", "become", "every", "forthe", "from", "have", "high", "higher","important", "given", "good", "includes", "inthe", "less", "levelsof", "made", "many", "meet", "ofadb", "(adb)","other", "plays", "plan", "plans", "refers", "reflect","response", "role","strong","than", "that", "their", "this", "total", "tothe", "uses", "well", "will", "while", "with", "further", "example", "especially"]

edited_wordsinalldoc = []
for word in wordsgreaterthan4:
    if word not in wordsto_remove:
        edited_wordsinalldoc.append(word)

print (len(edited_wordsinalldoc))


170


In [10]:
#Now compare each doc to the overall wordmatching list

table = pd.DataFrame(columns=("Doc Title", "Unique_Words", "Total_Words", "No. of Matched","words", "% matched to total"))
i = 1
file_dict = {}
for document in ADBtextpaths:
    title = document.rsplit("/", 1)[-1]
    text = read_book(document)
    (num_unique, counts) = word_stats(count_words(text))    
    matchedwords, no_of_matched = wordgroup_stats(count_words(text), edited_wordsinalldoc)
    percent_match = (sum(matchedwords.values()) / sum(counts)) * 100

    #input text stats into table      
    table.loc[i] = title, num_unique, sum(counts), sum(matchedwords.values()), matchedwords, percent_match
    i += 1

print (table)

                                           Doc Title Unique_Words Total_Words  \
1                      ADB Education Policy 2002.txt         3934       17861   
2  ADB, 2008-Education is a Core Operation Area o...          887        1511   
3  ADB, 2013a-ADB’s Support for Achieving the Mil...         7518       38587   

  No. of Matched                                              words  \
1           2844  {'framework': 26, 'policy': 115, 'asian': 17, ...   
2            381  {'education': 46, 'core': 2, 'operation': 2, '...   
3           4311  {'support': 388, 'development': 296, 'knowledg...   

   % matched to total  
1           15.922961  
2           25.215089  
3           11.172156  


In [11]:
#Analyze and categorize 

#Look at edited_wordinalldoc list. Then manually sort into sub-lists

place_words = ["asia", "asian", "bangladesh", "cambodia", "china", "east", "indonesia", "kyrgyz", "manila", "nepal", "pacific", "pakistan","republic", "rural","south", "southeast","theasian", "viet", "countries", "region", "region's", "regional", "national"]

verbs = ["access", "collaboration", "contribute", "contribution", "cooperation", "delivery", "develop", "developing", "enroll", "ensuring", "expand", "expanding", "facilitate", "financed", "financing", "focus", "growing", "guarantees", "help", "helped", "helping", "impact", "improve", "improved", "improving", "increase", "increasing", "increasingly", "leverage", "monitoring", "need", "needs", "performance", "practice", "provide", "provides", "providing", "pursue", "reduce", "reduced", "reflects", "sharing", "strengthening", "support", "supports", "supported", "supporting", "toimprove", "study"]

misc = ["alone", "especially", "example", "further", "decades", "individual", "million", "publication", "policy", "significant", "specific", "studies", "through", "critical", "core", "completion"]

other = []

for word in edited_wordsinalldoc:
    if word not in place_words and word not in verbs and word not in misc:
        other.append(word)

print (other)


['adb’s', 'aims', 'andvocational', 'assessment', 'assistance', 'bank', 'basiceducation', 'capacity', 'community', 'comprehensive', 'cost', 'decentralized', 'democratic', 'development', 'developmentadb', 'developmentand', 'dmcs', 'economic', 'education', 'effective', 'efficiency', 'equity', 'expansion', 'female', 'financial', 'framework', 'generation', 'government', 'grants', 'information', 'infrastructure', 'initiatives', 'innovation', 'institutions', 'knowledge', 'labor', 'lessons', 'loans', 'longterm', 'market', 'operation', 'operations', 'opportunities', 'participation', 'partnerships', 'past', 'people', 'people’s', 'poor', 'portfolio', 'poverty', 'private', 'programs', 'progress', 'project', 'projects', 'public', 'quality', 'region’s', 'relevance', 'remote', 'secondary', 'secondaryeducation', 'sector', 'service', 'services', 'share', 'skills', 'social', 'stakeholders', 'strategy', 'subsectors', 'substantial', 'substantially', 'success', 'successfully', 'sustainable', 'system', 'sys

In [None]:
#Quick Recapping thoughts:

#This was a pretty quick play-around that practiced some different things in Pythong:
    #Practiced looping over .txts in a folder (so really easy to scale up with a lot more documents!)

    #First, I practiced searching the texts for specific words I thought of apriori of reading the documents. 
#this was just a fun hunch that didn't end up showing anything useful in this case. I think I could improve on this in two ways: first by first reading through a subset of the texts to get more familiar with the language used and so what words might crop up and second by looping through a much larger set of texts (ideally to show how the use of certain words increased/decreased over time). Overall, the Python part worked fine, I just didn't put enough time into the specific question to make it worth while. 

    #Second, I looped over the texts to find all the words present in all texts. This was kind of cool and useful,though the second document had considerably less words than the other two so most likely considerably skewed the word list to this document. I think if I was to scale this with more documents I wouldn't look at words that matched 100% to all documents. Instead I would set up a Python dictionary who's values would count how many documents the words are in. That would mean it would be easy to see that, for example, the word "rural" was present in 75% of documents looked at. 

    #After finding all of the similiar words in the texts the next step is to display these in a clear way. I am not doing that now because that wasn't the real goal of this practice. The results that I did find--most easily seen in the last lists of categorized words--have some interesting tidbits. Notably the emphasis on certain countries in ASEAN, and China, without mentioning others. Mentioning Manila makes sense since that is the homebase of ADB, and China is obviously a huge player. Curious that other powers, like Japan, SK, Thailand and others are mentioned. Also it doesn't seem that other international institutions are included--but maybe that is a formatting issue that should be looked at more closely. 

    #Just skimming through this word list at the end is a better indicator of my original goal of assuming certain neoliberal/human capital word use. Work, skills, development, infastructure, poverty, private are all seen in all documents and all (most likely) represent a human capital justification of the education policies. Clearly I would need to dive more into each document and loop through more documents to get a clearer picture. 