# Importing and Working with multiple texts in Python

Jeremy Mikecz

Research Data Services

Dartmouth College

In [1]:
from pathlib import Path

In [7]:
sotu_dir = Path("../../texts/sotu/txt")
pathlist = list(sotu_dir.glob("*.txt"))
print([path.name for path in pathlist])
print([path.name for path in pathlist])

['Adams_1797.txt', 'Adams_1798.txt', 'Adams_1799.txt', 'Adams_1800.txt', 'Adams_1825.txt', 'Adams_1826.txt', 'Adams_1827.txt', 'Adams_1828.txt', 'Arthur_1881.txt', 'Arthur_1882.txt', 'Arthur_1883.txt', 'Arthur_1884.txt', 'Biden_2021.txt', 'Biden_2022.txt', 'Biden_2023.txt', 'Buchanan_1857.txt', 'Buchanan_1858.txt', 'Buchanan_1859.txt', 'Buchanan_1860.txt', 'Buren_1837.txt', 'Buren_1838.txt', 'Buren_1839.txt', 'Buren_1840.txt', 'Bush_1989.txt', 'Bush_1990.txt', 'Bush_1991.txt', 'Bush_1992.txt', 'Bush_2001.txt', 'Bush_2002.txt', 'Bush_2003.txt', 'Bush_2004.txt', 'Bush_2005.txt', 'Bush_2006.txt', 'Bush_2007.txt', 'Bush_2008.txt', 'Carter_1978.txt', 'Carter_1979.txt', 'Carter_1980.txt', 'Carter_1981.txt', 'Cleveland_1885.txt', 'Cleveland_1886.txt', 'Cleveland_1887.txt', 'Cleveland_1888.txt', 'Cleveland_1893.txt', 'Cleveland_1894.txt', 'Cleveland_1895.txt', 'Cleveland_1896.txt', 'Clinton_1993.txt', 'Clinton_1994.txt', 'Clinton_1995.txt', 'Clinton_1996.txt', 'Clinton_1997.txt', 'Clinton_1998

In [21]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm")

def get_text_data(txt: str):
    numtokens_split = len(txt.split())
    doc = nlp(txt)
    tokens = [tok.text for tok in doc]
    numtokens_spacy = len(tokens)

    #remove stop words
    filtered_tokens = [tok.text for tok in doc if not tok.is_stop]
    gpelist, personlist = [], []
    for ent in doc.ents:
        if ent.label_ == "GPE":
            gpelist.append(ent.text)
        elif ent.label_ == "PERSON":
            personlist.append(ent.text)

    return(numtokens_split, numtokens_spacy, gpelist, personlist, filtered_tokens)

In [22]:
import pandas as pd
txtlist = []
for path in pathlist:
    with open(path, encoding='utf-8') as f:
        print(path.stem)
        txt = f.read()
    numtoks_split, numtoks_spacy, gpes, people, filtered_tokens = get_text_data(txt)
    presname = path.stem.split("_")[0]
    presyear = path.stem.split("_")[1]
    txtlist.append([presyear, presname, path.name, numtoks_split, numtoks_spacy, gpes, people, txt, filtered_tokens])    

Adams_1797
Adams_1798
Adams_1799
Adams_1800
Adams_1825
Adams_1826
Adams_1827
Adams_1828
Arthur_1881
Arthur_1882
Arthur_1883
Arthur_1884
Biden_2021
Biden_2022
Biden_2023
Buchanan_1857
Buchanan_1858
Buchanan_1859
Buchanan_1860
Buren_1837
Buren_1838
Buren_1839
Buren_1840
Bush_1989
Bush_1990
Bush_1991
Bush_1992
Bush_2001
Bush_2002
Bush_2003
Bush_2004
Bush_2005
Bush_2006
Bush_2007
Bush_2008
Carter_1978
Carter_1979
Carter_1980
Carter_1981
Cleveland_1885
Cleveland_1886
Cleveland_1887
Cleveland_1888
Cleveland_1893
Cleveland_1894
Cleveland_1895
Cleveland_1896
Clinton_1993
Clinton_1994
Clinton_1995
Clinton_1996
Clinton_1997
Clinton_1998
Clinton_1999
Clinton_2000
Coolidge_1923
Coolidge_1924
Coolidge_1925
Coolidge_1926
Coolidge_1927
Coolidge_1928
Eisenhower_1954
Eisenhower_1955
Eisenhower_1956
Eisenhower_1957
Eisenhower_1958
Eisenhower_1959
Eisenhower_1960
Eisenhower_1961
Fillmore_1850
Fillmore_1851
Fillmore_1852
Ford_1975
Ford_1976
Ford_1977
Grant_1869
Grant_1870
Grant_1871
Grant_1872
Grant_1873


In [25]:
sotu_df = pd.DataFrame(txtlist, columns = ["year", "pres", "pathname", "numtoks_split", "numtoks_spacy", "Geopolitical_entities", "people", "fulltext", "filtered_tokens"])
sotu_df.head()

Unnamed: 0,year,pres,pathname,numtoks_split,numtoks_spacy,Geopolitical_entities,people,fulltext,filtered_tokens
0,1797,Adams,Adams_1797.txt,2057,2415,"[Philadelphia, Holland, Holland, Paris, the Un...",[],Gentlemen of the Senate and Gentlemen of the H...,"[Gentlemen, Senate, Gentlemen, House, Represen..."
1,1798,Adams,Adams_1798.txt,2218,2558,"[States, the United States, France, United Sta...","[Hitherto, Natchez, Croix, Croix]",Gentlemen of the Senate and Gentlemen of the H...,"[Gentlemen, Senate, Gentlemen, House, Represen..."
2,1799,Adams,Adams_1799.txt,1505,1754,"[the United\nStates of America, Pennsylvania, ...",[Majesty],Gentlemen of the Senate and Gentlemen of the H...,"[Gentlemen, Senate, Gentlemen, House, Represen..."
3,1800,Adams,Adams_1800.txt,1372,1614,"[Philadelphia, the United States, United State...",[],Gentlemen of the Senate and Gentlemen of the H...,"[Gentlemen, Senate, Gentlemen, House, Represen..."
4,1825,Adams,Adams_1825.txt,9003,10633,"[Great Britain, the United States, Great Brita...","[Monroe Hampton, Barbary States, Greeks, Warri...",Fellow Citizens of the Senate and of the House...,"[Fellow, Citizens, Senate, House, Representati..."


In [26]:
from collections import Counter
sotu_df["mostfreqwords"] = sotu_df["filtered_tokens"].apply(lambda x: Counter(x).most_common(10))

In [27]:
sotu_df.head()

Unnamed: 0,year,pres,pathname,numtoks_split,numtoks_spacy,Geopolitical_entities,people,fulltext,filtered_tokens,mostfreqwords
0,1797,Adams,Adams_1797.txt,2057,2415,"[Philadelphia, Holland, Holland, Paris, the Un...",[],Gentlemen of the Senate and Gentlemen of the H...,"[Gentlemen, Senate, Gentlemen, House, Represen...","[(\n, 158), (,, 110), (., 58), (\n\n, 22), (Un..."
1,1798,Adams,Adams_1798.txt,2218,2558,"[States, the United States, France, United Sta...","[Hitherto, Natchez, Croix, Croix]",Gentlemen of the Senate and Gentlemen of the H...,"[Gentlemen, Senate, Gentlemen, House, Represen...","[(\n, 168), (,, 81), (., 54), (\n\n, 26), (Sta..."
2,1799,Adams,Adams_1799.txt,1505,1754,"[the United\nStates of America, Pennsylvania, ...",[Majesty],Gentlemen of the Senate and Gentlemen of the H...,"[Gentlemen, Senate, Gentlemen, House, Represen...","[(\n, 117), (,, 75), (., 35), (\n\n, 14), (Uni..."
3,1800,Adams,Adams_1800.txt,1372,1614,"[Philadelphia, the United States, United State...",[],Gentlemen of the Senate and Gentlemen of the H...,"[Gentlemen, Senate, Gentlemen, House, Represen...","[(\n, 102), (,, 71), (., 37), (\n\n, 19), (pub..."
4,1825,Adams,Adams_1825.txt,9003,10633,"[Great Britain, the United States, Great Brita...","[Monroe Hampton, Barbary States, Greeks, Warri...",Fellow Citizens of the Senate and of the House...,"[Fellow, Citizens, Senate, House, Representati...","[(\n, 747), (,, 492), (., 201), (\n\n, 63), (C..."
