In [1]:
import pandas as pd
from tqdm import tqdm
import gender_guesser.detector as gender

In [2]:
def remove_misc_articles(df):
    """Splits a dataframe into non-misc articles and misc articles.

    Rows whose 'type' column equals 'misc' are separated from the rest.
    The input dataframe is left unmodified; both returned frames are
    independent copies that keep the original index labels.

    Args:
        df (Pandas dataframe): Dataframe from which to remove misc rows.
            Must contain a 'type' column.

    Returns:
        [Tuple]: (copy of df with the misc rows removed, misc dataframe)
    """
    # A boolean mask selects rows by position rather than by index label,
    # so rows that happen to share an index label with a misc row are not
    # dragged along into the misc frame or dropped from the clean frame.
    is_misc = df['type'] == 'misc'
    misc_df = df[is_misc].copy()
    clean_df = df[~is_misc].copy()
    return (clean_df, misc_df)

In [3]:
# Load the raw article metadata; the relative path is resolved against the
# notebook's working directory.
raw_article_df = pd.read_csv("articles_raw.csv")
# Print (rows, columns) as a quick sanity check on the load.
print(f'Size of raw article dataframe: {raw_article_df.shape}')
# Last expression of the cell, so the first rows are rendered as a table.
raw_article_df.head()

Size of raw article dataframe: (1644, 8)


Unnamed: 0,id,type,title,author,year,lang,file_x,file_y
0,journal-article-10.2307_1486423,research-article,A Sociological Portrait of German Jewish Immig...,Stephen G. Mostov,1978,eng,journal-article-10.2307_1486423.xml,journal-article-10.2307_1486423-ngram1.txt
1,journal-article-10.2307_1486465,research-article,"Lawrence Perlman's ""Buber's Anti-Kantianism"": ...",Steven T. Katz,1990,eng,journal-article-10.2307_1486465.xml,journal-article-10.2307_1486465-ngram1.txt
2,journal-article-10.2307_1486694,misc,Collected Studies,,1996,eng,journal-article-10.2307_1486694.xml,journal-article-10.2307_1486694-ngram1.txt
3,journal-article-10.2307_4131787,book-review,,Gordon M. Freeman,2003,eng,journal-article-10.2307_4131787.xml,journal-article-10.2307_4131787-ngram1.txt
4,journal-article-10.2307_1566672,book-review,\n,James S. Diamond,2000,eng,journal-article-10.2307_1566672.xml,journal-article-10.2307_1566672-ngram1.txt


In [4]:
# Separate the rows whose 'type' is 'misc'. The first element of the returned
# tuple is the frame without those rows, the second holds the removed rows.
no_misc_df, misc_df = remove_misc_articles(raw_article_df)
# Print (rows, columns) of the remaining articles for comparison with the raw frame.
print(f'Size of article dataframe with miscellaneous articles removed: {no_misc_df.shape}')
# Last expression of the cell, so the first rows are rendered as a table.
no_misc_df.head()

Size of article dataframe with miscellaneous articles removed: (1469, 8)


Unnamed: 0,id,type,title,author,year,lang,file_x,file_y
0,journal-article-10.2307_1486423,research-article,A Sociological Portrait of German Jewish Immig...,Stephen G. Mostov,1978,eng,journal-article-10.2307_1486423.xml,journal-article-10.2307_1486423-ngram1.txt
1,journal-article-10.2307_1486465,research-article,"Lawrence Perlman's ""Buber's Anti-Kantianism"": ...",Steven T. Katz,1990,eng,journal-article-10.2307_1486465.xml,journal-article-10.2307_1486465-ngram1.txt
3,journal-article-10.2307_4131787,book-review,,Gordon M. Freeman,2003,eng,journal-article-10.2307_4131787.xml,journal-article-10.2307_4131787-ngram1.txt
4,journal-article-10.2307_1566672,book-review,\n,James S. Diamond,2000,eng,journal-article-10.2307_1566672.xml,journal-article-10.2307_1566672-ngram1.txt
5,journal-article-10.2307_27564391,book-review,,Alisa Braun,2008,eng,journal-article-10.2307_27564391.xml,journal-article-10.2307_27564391-ngram1.txt


In [5]:
def name_to_gender(row, detector=None):
    """Estimates the gender of a person based on their given name.

    The given name is taken to be the first whitespace-separated token
    of the value stored under the 'author' key of the row.

    Args:
        row (Series): Row of a dataframe (or any mapping) with an
            'author' field holding a person's full name.
        detector (optional): Object exposing get_gender(name). When
            omitted, a single gender_guesser Detector is created on the
            first call and reused for every later call.

    Returns:
        [str]: The label produced by the detector for the given name,
        or 'unknown' when the author value is not a string (e.g. a
        missing value) or contains no name at all.

    Raises:
        KeyError: If the row has no 'author' field.
    """
    if detector is None:
        # Detector() loads its name dictionary on construction, so building
        # one per row is wasteful; cache one instance on the function object.
        detector = getattr(name_to_gender, '_detector', None)
        if detector is None:
            detector = gender.Detector()
            name_to_gender._detector = detector

    author = row['author']
    # Missing authors show up as non-string values (NaN/None).
    if not isinstance(author, str):
        return 'unknown'

    # split() without an argument ignores leading/repeated whitespace, so a
    # name such as '  Jane Doe' still yields 'Jane' rather than ''.
    name_parts = author.split()
    if not name_parts:
        return 'unknown'
    return detector.get_gender(name_parts[0])

def infer_gender(df):
    """Returns a copy of df extended with an 'auth_gender' column.

    Each row is passed to name_to_gender, which reads the row's
    'author' field, and the resulting labels are stored in a new
    column. The input dataframe itself is not modified.

    Args:
        df (Pandas dataframe): Must include a column with authors'
        names and have at least four columns, because the new column
        is inserted at position 4.

    Returns:
        [Pandas dataframe]: A copy of df with the new gender column
    """
    # Row-wise estimation; one label per row of df.
    estimated = df.apply(name_to_gender, axis=1)

    result = df.copy()
    # Position 4 puts the column directly after 'author' for the column
    # layout displayed in this notebook.
    result.insert(loc=4, column='auth_gender', value=estimated)
    return result

In [6]:
# Build a copy of the misc-free frame with an added 'auth_gender' column
# estimated from each author's first name.
inferred_gender_df = infer_gender(no_misc_df)
# Last expression of the cell, so the first rows are rendered as a table.
inferred_gender_df.head()

Unnamed: 0,id,type,title,author,auth_gender,year,lang,file_x,file_y
0,journal-article-10.2307_1486423,research-article,A Sociological Portrait of German Jewish Immig...,Stephen G. Mostov,male,1978,eng,journal-article-10.2307_1486423.xml,journal-article-10.2307_1486423-ngram1.txt
1,journal-article-10.2307_1486465,research-article,"Lawrence Perlman's ""Buber's Anti-Kantianism"": ...",Steven T. Katz,male,1990,eng,journal-article-10.2307_1486465.xml,journal-article-10.2307_1486465-ngram1.txt
3,journal-article-10.2307_4131787,book-review,,Gordon M. Freeman,male,2003,eng,journal-article-10.2307_4131787.xml,journal-article-10.2307_4131787-ngram1.txt
4,journal-article-10.2307_1566672,book-review,\n,James S. Diamond,male,2000,eng,journal-article-10.2307_1566672.xml,journal-article-10.2307_1566672-ngram1.txt
5,journal-article-10.2307_27564391,book-review,,Alisa Braun,female,2008,eng,journal-article-10.2307_27564391.xml,journal-article-10.2307_27564391-ngram1.txt


Please inspect the inferred genders, correct any mistakes, and fill in any values marked 'unknown' wherever possible. Ambiguous labels such as 'andy', 'mostly_male' and 'mostly_female' should be reviewed as well. The tool used here is only a heuristic and is by no means perfect.

In [7]:
# Write the frame with inferred genders to disk; index=False keeps the row
# index out of the file.
# NOTE(review): re-running this cell overwrites 'articles_gender.csv'. If the
# manual corrections are made directly in that file they would be lost --
# confirm where corrections are meant to be applied.
inferred_gender_df.to_csv('articles_gender.csv', index=False)