In [28]:
import pandas as pd
from tqdm import tqdm
import os
import re
from pathlib import Path

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk_stopwords = stopwords.words("english")

[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
article_type = 'research-article'

In [30]:
file_dir = Path(os.getcwd()).resolve()
unigram_dir = file_dir / 'ngram1'

In [31]:
def make_unigram_table(atype, a_ids=None):
    """Makes a dataframe by concatenating all unigrams from
        articles specified by a_ids.

    Arguments:
        atype {String} -- the article type to source unigrams
                          from: 'book-review', 'research-article',
                          or 'both'.

    Keyword Arguments:
        a_id {List} -- List of article IDs to include, by default
                       all articles are included. (default: {None})
    """
    if atype == "both":
        raise ValueError("Feature not yet supported!\n")
    curr_path = unigram_dir / atype
    dfs = []
    for f in tqdm(curr_path.iterdir()):
        f_id = re.search(f"{atype}/(.*)-n", str(f)).group(1)
        if a_ids is not None:
            if f_id in a_ids:
                # print("Match!")  # for debugging
                unigrams = pd.read_csv(f, sep='\t', names=["word", "count"])
                # print(f"Just read {f_id}")  # for debugging
                dfs.append(unigrams)
            else:
                # print(f"{f_id} not in a_ids list")
                continue
        else:
            unigrams = pd.read_csv(f, sep='\t', names=["word", "count"])
            # print(f"Just read {f_id}")  # for debugging
            dfs.append(unigrams)
    print(f"\nCollected {len(dfs)} articles\n")
    if not dfs:
        raise ValueError("Unable to collect dataframes")
    concat_df = pd.concat(dfs)
    summed_df = concat_df.groupby(["word"]).sum().reset_index()
    sorted_df = summed_df.sort_values(by=["count"], ascending=False)
    return sorted_df

In [32]:
unigrams = make_unigram_table(article_type)
print(f"The unigram dataframe has size: {unigrams.shape}")
unigrams.head()

392it [00:02, 162.15it/s]

Collected 392 articles

The unigram dataframe has size: (209491, 2)


Unnamed: 0,word,count
153833,s,35062
3270,1,28182
76686,his,26923
114170,n,23915
78330,i,22514


Removing any unigrams with counts below a predetermined threshold can result in quicker processing and more succint analysis.

In [33]:
def drop_counts(df, threshold):
    """Removes rows from an ngram table with counts fewer than threshold.

    Arguments:
        df {Pandas dataframe} -- ngram table with columns "word" and "count"
        threshold {Integer} -- minimum count to keep rows
    """
    dropped_df = df[df['count'] >= threshold]
    return dropped_df

In [36]:
truncated_unigrams = drop_counts(unigrams, 10)
print(f"The truncated unigrams dataframe has size: {truncated_unigrams.shape}")
truncated_unigrams.head()

The truncated unigrams dataframe has size: (24863, 2)


Unnamed: 0,word,count
153833,s,35062
3270,1,28182
76686,his,26923
114170,n,23915
78330,i,22514


In [37]:
def lemmatize_table(df):
    """Lemmatizes the words in the word column of an ngram table. Sums
       frequencies of words that map to the same lemma.

    Arguments:
        df {Pandas dataframe} -- ngram table with columns 'word' and 'count'.
    """
    if df is None:
        print("The passed object is of None type")
        return "Error"
    lemmatizer = WordNetLemmatizer()

    def lemmatize_word(word):
        """Uses lemmatization from spacy to map words to lemmas.
           (Could do with some tweaking to better group lemmas).

        Arguments:
            word {String} -- lowercase word to lemmatize

        Returns:
            String -- lemmatized word
        """
        return lemmatizer.lemmatize(word)
    df_lemma = df.copy()
    df_lemma['word'] = df['word'].map(lemmatize_word, na_action="ignore")
    summed_df = df_lemma.groupby(["word"]).sum().reset_index()
    sorted_df = summed_df.sort_values(by=["count"], ascending=False)
    return sorted_df

In [38]:
lemmatized_unigrams = lemmatize_table(truncated_unigrams)
print(f"The lemmatized unigram dataframe has size: {lemmatized_unigrams.shape}")
lemmatized_unigrams.head()

The lemmatized unigram dataframe has size: (22739, 2)


Unnamed: 0,word,count
17225,s,35250
85,1,28275
9516,his,26923
13307,n,24388
9750,i,22514


Use the following function to include custom stop words into the stop word list for eventual removal.

In [39]:
def make_custom_stopword_list(df):
    """Creates and saves a custom stopword list as a plain text file in the data directory.

    Arguments:
        df {Pandas Dataframe} -- A dataframe of with columns 'word', 'count', 'stopword'
        which is a table of ngram frequencies that has been manually labeled with an 'x'
        to indicate inclusion. Words marked with 'r' have already been reviewed and will
        not be added to the stop list. All other words are added to the stop list.
    """
    stopwords = []
    for i, row in tqdm(df.iterrows(), desc="Creating stopwords\n"):
        stopword_status = row['stopword']
        if stopword_status == 'r': # if 'r', word has already been reviewed, so ignore
            continue
        if stopword_status == 'x': # if 'x', word is marked for inclusion, so ignore
            df.loc[i,'stopword'] = 'r' # update 'x' to 'r'
        if stopword_status == "unchecked": # if 'unchecked', wait for manual review
            continue
        else:
            stopwords.append(row['word'])
    with open(data_dir / "custom_stopwords.txt", "w+") as filehandle:
        for word in stopwords:
            filehandle.write(f"{word}\n")

In [40]:
def is_year(string):
    """Returns a boolean value if the input string likely represents a year.
       Only valid for years 1000-2999.
    Arguments:
        string {String} -- candidate string
    """
    return bool(re.match("(1|2)\d{3}", string))


def remove_numerals(df, remove_mixed_strings=True):
    """Removes rows from an ngram table with words that are numerals. This
       does not include 4-digit numbers which are interpreted as years.

    Arguments:
        df {Pandas dataframe} -- A dataframe of with columns 'word', 'count'.

    Keyword Arguments:
        remove_mixed_strings {bool} -- Whether to remove rows with words that
        are mixtures of numerals and letters. (default: {True})
    """
    no_numerals_df = df.copy().reset_index()
    for i, row in tqdm(no_numerals_df.iterrows()):
        word = row['word']
        if remove_mixed_strings:
            if any([c.isnumeric() for c in word]) and \
               not is_year(word):
                no_numerals_df.drop(i, axis=0, inplace=True)
        else:
            if word.isnumeric() and len(word) != 4:
                no_numerals_df.drop(i, axis=0, inplace=True)
    return no_numerals_df

In [41]:
no_nums_df = remove_numerals(lemmatized_unigrams)
print(f"The unigram dataframe with numerals removed has size: {no_nums_df.shape}")
no_nums_df.head()

22739it [00:11, 1964.61it/s]The unigram dataframe with numerals removed has size: (20992, 3)



Unnamed: 0,index,word,count
0,17225,s,35250
2,9516,his,26923
3,13307,n,24388
4,9750,i,22514
5,9286,he,20488


In [48]:
def remove_stopwords(df, include_custom=False):
    """Removes rows from an ngram table with words
       in a custom stopword list or in NLTK list.

    Arguments:
        df {Pandas dataframe} -- ngram table with columns "word" and "stopword"
    """
    custom_stop = set()
    if include_custom:
        with open(data_dir / "custom_stopwords.txt") as f:
            custom_stop = f.readlines()
    custom_stop = [w.strip() for w in custom_stop]
    no_stops_df = df.copy()
    for i, row in tqdm(no_stops_df.iterrows()):
        word = row['word']
        if include_custom:
            if word in nltk_stopwords or word in custom_stop:
                no_stops_df.drop(i, axis=0, inplace=True)
        else:
            if word in nltk_stopwords:
                no_stops_df.drop(i, axis=0, inplace=True)
    return no_stops_df

In [50]:
no_stops_df = remove_stopwords(no_nums_df)
print(f"The unigram dataframe with stopwords removed has size: {no_stops_df.shape}")
no_stops_df.head()

20992it [00:04, 4334.15it/s]The unigram dataframe with stopwords removed has size: (20874, 3)



Unnamed: 0,index,word,count
3,13307,n,24388
7,10909,jewish,17390
9,8993,ha,15356
10,17571,see,15304
11,14554,p,13117
