# Features Extraction

This notebook performs feature extraction on preprocessed journal data.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/data_cleaned.csv')
df.head()

Unnamed: 0,Year,Year_Scaled,Year_STD,Month,Keywords,Abstract,Abstract_Cleaned,Abstract Length,Keywords_Cleaned,Number of Keywords,Month_Cleaned
0,2020,1.243352,169.971142,March,"['Capital structure', 'Corporate taxation', 'D...","Absent theoretical guidance, empiricists have ...",absent theoret guidance empiricist forc reli u...,1047,"['capit structur', 'corpor taxat', 'difference...",5,3
1,2020,1.243352,169.971142,March,"['Credit spreads', 'LBO risk', 'Structural mod...",Recent decades have witnessed several waves of...,recent decad wit sever wave buyout activity fi...,580,"['credit spread', 'lbo risk', 'structur model'...",4,3
2,2020,1.243352,169.971142,March,"['Fire sales', 'Liquidity management', 'Mutual...",We develop three novel measures of the incenti...,develop three novel measur incent equiti mutua...,586,"['fire sale', 'liquid manag', 'mutual fund']",3,3
3,2020,1.243352,169.971142,March,"['Asset pricing', 'Leverage constraints', 'Lot...",We test whether the low-risk effect is driven ...,test whether lowrisk effect driven leverag con...,861,"['asset price', 'leverag constraint', 'lotteri...",5,3
4,2020,1.243352,169.971142,March,"['Gender gap', 'Entrepreneurship', 'Angel inve...",We study whether early stage investors have ge...,studi whether earli stage investor gender bias...,742,"['gender gap', 'entrepreneurship', 'angel inve...",4,3


## Keywords

In [3]:
# Build keywords dataframe
years_list = list(df['Year'].unique())
keywords_list = []
kw_count = []
kw_uniques = []
kw_uniques_count = []

# iterate through each year
for year in years_list:
    # get all keywords for the selected year
    keywords = []
    years_kws_temp = df.loc[(df['Year'] == year) & (df['Number of Keywords'] > 0)]['Keywords_Cleaned']
    for kw in years_kws_temp:
        if len(kw) == 2:
            continue
            
        # kw's format: ["'keyword_1', ..., 'keywords_xx']
        # remove apostrophe, double quotation marks and the square bracket
        kw_str = kw.replace('\'', "")[1:-1].replace('“', '').replace('”', '').replace('"', '').strip()
        keywords.extend(kw_str.split(', '))
    
    # store all keywords for selected year
    keywords_str = ', '.join(keywords)
    keywords_list.append(keywords_str)
    kw_count.append(len(keywords))
    
    # get a list of unique keywords
    unique_kw = list(set(keywords))
    kw_uniques.append(', '.join(unique_kw))
    kw_uniques_count.append(len(unique_kw))

# create dataframe of keywords per year
years_keywords = pd.DataFrame(dict({'Year': years_list, 
                                    'Keywords': keywords_list,
                                    'Unique Keywords': kw_uniques,
                                    'Number of Keywords': kw_count,
                                    'Number of Unique Keywords': kw_uniques_count}))

# sort the dataframe by year
years_keywords.sort_values(by='Year', inplace=True)

# look at data
years_keywords.tail()

Unnamed: 0,Year,Keywords,Unique Keywords,Number of Keywords,Number of Unique Keywords
4,2016,"option exercis, friction, short-sal cost, tran...","taylor rule residu, financi specul, financi cr...",560,459
3,2017,"offshor oper, oper hedg, financi hedg, risk ma...","loan fee, communic, fragment, stagger board, b...",523,423
2,2018,"financi crisi, subprim mortgag, financi fraud,...","dividend-to-pric ratio, execut retir, mortgag ...",516,436
1,2019,"mortgag market, asymmetr inform, signal, volat...","ipo underpr, financi crisi, negoti, hous marke...",581,478
0,2020,"capit structur, corpor taxat, difference-in-di...","public financ, bitcoin, hedg funds,inform envi...",274,258


### Preprocess Keywords

In [4]:
def tokenize(text):
    return text.split(', ')

In [5]:
import nltk

# get unique English stopwords
stopwords = set(nltk.corpus.stopwords.words("english"))    

def remove_stop(tokens):
    # remove stopwords from tokens
    return [t for t in tokens if t not in stopwords]

In [6]:
# create a data pipeline to perform tokenization, stopwords removal, and stemming
pipeline = [tokenize, remove_stop]

def prepare(text, pipeline):
    tokens = text
    
    for transform in pipeline:
        tokens = transform(tokens)
    return tokens

In [7]:
from nltk.stem.snowball import SnowballStemmer

# create a Snowball Stemmer object for stemming
stemmer = SnowballStemmer("english")       

# tokenize keywords
years_keywords['Keyword Tokens'] = years_keywords['Keywords'].apply(prepare, pipeline=pipeline)
years_keywords['Keyword Tokens'] = years_keywords['Keyword Tokens'].map(lambda x:
                                                                        [stemmer.stem(token) for token in x])
years_keywords.tail()

Unnamed: 0,Year,Keywords,Unique Keywords,Number of Keywords,Number of Unique Keywords,Keyword Tokens
4,2016,"option exercis, friction, short-sal cost, tran...","taylor rule residu, financi specul, financi cr...",560,459,"[option exerci, friction, short-sal cost, tran..."
3,2017,"offshor oper, oper hedg, financi hedg, risk ma...","loan fee, communic, fragment, stagger board, b...",523,423,"[offshor op, oper hedg, financi hedg, risk man..."
2,2018,"financi crisi, subprim mortgag, financi fraud,...","dividend-to-pric ratio, execut retir, mortgag ...",516,436,"[financi crisi, subprim mortgag, financi fraud..."
1,2019,"mortgag market, asymmetr inform, signal, volat...","ipo underpr, financi crisi, negoti, hous marke...",581,478,"[mortgag market, asymmetr inform, signal, vola..."
0,2020,"capit structur, corpor taxat, difference-in-di...","public financ, bitcoin, hedg funds,inform envi...",274,258,"[capit structur, corpor taxat, difference-in-d..."


### Keyword Frequency Distribution per Year

In [8]:
from collections import Counter

# a list of dataframe freq_kw
freq_kw_list = []

# get a list of years that have keywords
years_list = years_keywords[years_keywords['Number of Keywords'] > 0]['Year'].to_list()

# iterate through years and get the top count tokens
for year in years_list:
    token_kw_list = years_keywords.loc[(years_keywords['Year'] == year) & \
                                       (years_keywords['Number of Keywords'] > 0)]['Keyword Tokens'].values[0]
    counter = Counter(token_kw_list)
    
    # transform counter into a DataFrame
    freq_kw = pd.DataFrame.from_dict(counter, orient='index', columns=['Frequency'])
    freq_kw['Year'] = year
    freq_kw.index.name = 'Token'
    freq_kw.reset_index(inplace=True)
    freq_kw.sort_values(by='Frequency', ascending=False, inplace=True)
    
    freq_kw_list.append(freq_kw)

print('Length of freq_kw_list:', len(freq_kw_list))

# merge all freq_kw dataframes 
years_freq_kw = pd.concat(freq_kw_list, axis=0, ignore_index=True)
years_freq_kw.sort_values(by='Year', inplace=True)
print('Shape of dataframe:', years_freq_kw.shape)
years_freq_kw.head()

Length of freq_kw_list: 28
Shape of dataframe: (8086, 3)


Unnamed: 0,Token,Frequency,Year
0,initi public off,4,1993
20,monitor,1,1993
21,partial adjust,1,1993
22,ipo,1,1993
23,condit asset price model,1,1993


### Total Count per Keyword

In [9]:
# get keyword counts
keyword_counts = pd.DataFrame(years_freq_kw.groupby('Token')['Frequency'].sum().sort_values(ascending=False))
keyword_counts.reset_index(inplace=True)
keyword_counts.rename(columns={'Token':'Keyword'}, inplace=True)

print(keyword_counts.shape)
keyword_counts.head()

(4124, 2)


Unnamed: 0,Keyword,Frequency
0,corpor govern,152
1,liquid,88
2,capit structur,80
3,asset pric,74
4,mutual fund,74


### Keywords Distribution per Year

In [10]:
keywords_df_list = []       # a list of data frame of keyword distribution
keywords_timeline = []      # a list of year that keyword occurs

for token in keyword_counts['Keyword'].values:
    # get keywords distribution per year
    temp_kw_df = years_freq_kw[years_freq_kw['Token'] == token]
    keywords_df_list.append(temp_kw_df)
    keywords_timeline.append(list(temp_kw_df['Year'].values))
    
# merge all dataframes 
kw_dist = pd.concat(keywords_df_list, axis=0, ignore_index=True)
print('Data shape:', kw_dist.shape)
kw_dist.head()

Data shape: (8086, 3)


Unnamed: 0,Token,Frequency,Year
0,corpor govern,2,1994
1,corpor govern,1,1995
2,corpor govern,2,1996
3,corpor govern,4,1997
4,corpor govern,1,1998


### Keywords Timeline

In [11]:
# build timeline list for each Top 20 Keywords
keywords_timeline_str = []


for years_list in keywords_timeline:
    min_year = []      # lower bound of timeline
    max_year = []      # upper bound of timeline
    
    # build lower bound and upper bound for timeline
    for i, year in enumerate(years_list):
        if i == 0: 
            # set value for lower bound if this is the first item in years_list
            min_year.append(year)
        else:
            if len(max_year) == 0:
                if year == min_year[len(min_year) - 1] + 1:
                    # if current year equals previous min year + 1
                    # set value for upper bound
                    max_year.append(year)
                else:
                    # if current year is not an increment of previous min year (non-consecutive year)
                    max_year.append(0)      # set upper bound to zero to indicate a gap in the timeline
                    min_year.append(year)   # set value for the next sequence in the timeline (lower bound)
            elif len(min_year) > len(max_year) and year == min_year[len(min_year) - 1] + 1:
                # if there exists a lower bound but no upper bound value
                # and current year is an increment of the previous min_year
                # set upper bound value to current year
                max_year.append(year)
            elif len(min_year) > len(max_year) and year > min_year[len(min_year) - 1] + 1:
                # if there exists a lower bound but no upper bound value
                # and current year is NOT an increment of the previous min_year
                # append 0 to max_year to indicate there is a gap in year
                max_year.append(0)
                min_year.append(year)
            elif len(min_year) == len(max_year) and year == max_year[len(max_year) - 1] + 1:
                # if there exists a timeline for current year
                # update the upper bound to current year
                max_year[len(max_year) - 1] = year
            elif len(min_year) == len(max_year) and year > max_year[len(max_year) - 1] + 1:
                # if current year is not an increment of the upper bound of the current timeline
                # add year to the new lower bound timeline
                min_year.append(year)
    
    # if len of lower bound and len of upper bound are not equal
    # set the last item in upper to zero to signify the end of timeline
    if len(min_year) > len(max_year):
        max_year.append(0)
    
    # iterate through min_year
    text = ''
    for j, yr in enumerate(min_year):
        if len(text) == 0:
            text = str(yr)
        else:
            text = text + ', ' + str(yr)

        if max_year[j] > 0:
            text = text + '-' + str(max_year[j])
            
    keywords_timeline_str.append(text)


# add timeline to keyword_counts dataframe
keyword_counts['Timeline'] = keywords_timeline_str
keyword_counts.head()

Unnamed: 0,Keyword,Frequency,Timeline
0,corpor govern,152,1994-2020
1,liquid,88,"1995-1998, 2000-2017, 2019"
2,capit structur,80,"1994-1996, 1998-2020"
3,asset pric,74,"1993-2001, 2003-2020"
4,mutual fund,74,"1995-2005, 2007-2020"


## Export Data

In [12]:
# keywords per year
years_keywords.to_csv('data/keywords_per_year.csv', index=False)

# keywords distribution per year
years_freq_kw.to_csv('data/keywords_dist_per_year.csv', index=False)

# keyword counts (all years)
keyword_counts.to_csv('data/keyword_counts.csv', index=False)