# Feature Engineering

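This notebook turns the cleaned abstracts into numeric features for model training: a dense, normalized 10-component vector per abstract and a sparse TF-IDF term matrix, both saved under `data/`.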

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from mlxtend.preprocessing import DenseTransformer
from sklearn.cluster import KMeans
from tqdm import tqdm

In [2]:
# Read the pre-processed dataset into a DataFrame.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a row
# index written by an earlier to_csv(); passing index_col=0 would drop it — confirm
# before changing, since it alters the frame's columns.
df = pd.read_csv('data/data_cleaned.csv')

# preview the first five rows
df.head(5)

Unnamed: 0,Year,Year_Scaled,Year_STD,Month,Keywords,Abstract,Abstract_Cleaned,Abstract Length,Keywords_Cleaned,Number of Keywords,Month_Cleaned
0,2020,1.243352,169.971142,March,"['Capital structure', 'Corporate taxation', 'D...","Absent theoretical guidance, empiricists have ...",absent theoret guidance empiricist forc reli u...,1047,"['capit structur', 'corpor taxat', 'difference...",5,3
1,2020,1.243352,169.971142,March,"['Credit spreads', 'LBO risk', 'Structural mod...",Recent decades have witnessed several waves of...,recent decad wit sever wave buyout activity fi...,580,"['credit spread', 'lbo risk', 'structur model'...",4,3
2,2020,1.243352,169.971142,March,"['Fire sales', 'Liquidity management', 'Mutual...",We develop three novel measures of the incenti...,develop three novel measur incent equiti mutua...,586,"['fire sale', 'liquid manag', 'mutual fund']",3,3
3,2020,1.243352,169.971142,March,"['Asset pricing', 'Leverage constraints', 'Lot...",We test whether the low-risk effect is driven ...,test whether lowrisk effect driven leverag con...,861,"['asset price', 'leverag constraint', 'lotteri...",5,3
4,2020,1.243352,169.971142,March,"['Gender gap', 'Entrepreneurship', 'Angel inve...",We study whether early stage investors have ge...,studi whether earli stage investor gender bias...,742,"['gender gap', 'entrepreneurship', 'angel inve...",4,3


In [3]:
# Feature pipeline: cleaned abstract text -> 10-component vector.
# Steps run in the order listed; the two seeded steps use random_state=42
# so repeated runs give the same result.
pipeline = Pipeline(steps=[
    # bag-of-words token counts
    ('vect', CountVectorizer()),
    # topic modeling with 50 topics
    ('lda', LatentDirichletAllocation(n_components=50,
                                      random_state=42,
                                      topic_word_prior=None)),
    # TF-IDF re-weighting, applied here to the LDA output rather than raw counts
    # NOTE(review): unusual ordering — confirm this is intended
    ('tfidf', TfidfTransformer()),
    # reduce to 10 components
    ('svd', TruncatedSVD(n_components=10, random_state=42)),
    # final conversion to a dense array
    # NOTE(review): presumably a no-op after TruncatedSVD — confirm
    ('to_dense', DenseTransformer()),
])

# fit the pipeline on the cleaned abstracts and keep one vector per abstract
x_vector = pipeline.fit_transform(df['Abstract_Cleaned'])

In [4]:
# Two-stage scaling of the abstract vectors:
#   1. each row is rescaled by sklearn's Normalizer (default settings)
#   2. each column is divided by its standard deviation (pandas' default std())
# Note that step 2 changes the row norms produced by step 1.
df_x_vector = pd.DataFrame(Normalizer().fit_transform(x_vector))
df_x_vector = df_x_vector.div(df_x_vector.std(), axis='columns')

# back to a plain NumPy array for downstream use
x_vector = df_x_vector.to_numpy()

# show the scaled vectors
x_vector

array([[ 1.22002264,  2.2219857 , -1.01177663, ..., -0.73937831,
        -1.06955482, -1.67230182],
       [ 1.02046173,  2.82636214,  0.48123842, ..., -0.16985951,
         1.05417153,  0.10407414],
       [ 0.30568027,  0.80914701, -0.70299143, ...,  4.07895096,
        -0.01572586,  0.83661459],
       ...,
       [ 3.05213089, -0.84898762,  0.13323143, ..., -0.03871609,
         0.00791277, -0.12780114],
       [ 0.27029913,  0.63435036, -1.07016025, ...,  0.27322576,
        -0.32526024, -0.50543736],
       [ 1.23598864,  1.26528023, -0.87203966, ..., -0.28934377,
         1.11214526,  1.32729393]])

### Build Terms Sparse Matrix

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
import json

# Build a TF-IDF term matrix over the cleaned abstracts:
# one row per abstract, one column per vocabulary term.
tfidf = TfidfVectorizer()
terms_sparse_matrix = tfidf.fit_transform(df.Abstract_Cleaned)

# Column labels (vocabulary terms) in matrix column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    # returns a NumPy array; convert to a plain list so json can serialize it
    terms_label = tfidf.get_feature_names_out().tolist()
else:
    terms_label = tfidf.get_feature_names()

# save the sparse matrix
# to load it back: sparse_matrix = scipy.sparse.load_npz('data/terms_sparse_matrix.npz')
scipy.sparse.save_npz('data/terms_sparse_matrix.npz', terms_sparse_matrix)

# save term labels (the file content is a JSON list despite the .txt extension)
# to load them back: terms_label = json.load(open('data/terms_label.txt'))
with open("data/terms_label.txt", "w") as fp:
    json.dump(terms_label, fp)

## Save Data for Model Development

In [6]:
# Persist the scaled abstract vectors in NumPy's .npy format.
# to load them back: x_vector = np.load('data/x_vector.npy')
np.save('data/x_vector.npy', x_vector)