In [76]:
import pandas as pd
from tqdm import tqdm
import nlpaug.augmenter.word as naw

In [92]:
df = pd.read_csv('../data/train.csv', encoding='cp1252')
print("Shape of dataframe: ", df.shape)
df.head()

Shape of dataframe:  (10692, 19)


Unnamed: 0,ID,TITLE,ABSTRACT,Activist_Investors,Cost_Reduction,Covid_19,Digital_capabilities,Diversity___Inclusiveness,Headquarters_Relocation,International_Expansions,M_A,Management_changes,Other,Restructuring,Rewards___benefits,Spin_offs__Split_offs,Tax_Risk,Upskilling__reskilling,Wage_Dispute
0,1,"Chevron, Total others slash production, cut a...",EnergyMix The world's five largest oil product...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Survey shows 4% of companies in Singapore sla...,[Source: TODAY] SINGAPORE - The livelihoods of...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,"""Dark day for Brighouse"" as two bank branches ...",[Source: examiner.co.uk] Barclays Bank and Yor...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,"""Helping Hands"": BASF donates 40,000 liters of...","BASF is supplying 40,000 liters of hand saniti...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,$2B Chicago area transit packaging group will ...,A $2 billion transit packaging company will re...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


Since 'Other' category is a mutually exclusive category and has no actual semantic patterns to it, we will remove the column altogether from the training data. During prediction, if the model cannot assign the text to any of the trained categories, we will assign it to 'Other' category.

In [93]:
df = df.drop(columns=['Other'])
df

Unnamed: 0,ID,TITLE,ABSTRACT,Activist_Investors,Cost_Reduction,Covid_19,Digital_capabilities,Diversity___Inclusiveness,Headquarters_Relocation,International_Expansions,M_A,Management_changes,Restructuring,Rewards___benefits,Spin_offs__Split_offs,Tax_Risk,Upskilling__reskilling,Wage_Dispute
0,1,"Chevron, Total others slash production, cut a...",EnergyMix The world's five largest oil product...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Survey shows 4% of companies in Singapore sla...,[Source: TODAY] SINGAPORE - The livelihoods of...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,3,"""Dark day for Brighouse"" as two bank branches ...",[Source: examiner.co.uk] Barclays Bank and Yor...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,"""Helping Hands"": BASF donates 40,000 liters of...","BASF is supplying 40,000 liters of hand saniti...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,5,$2B Chicago area transit packaging group will ...,A $2 billion transit packaging company will re...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10427,10688,‘This is the final straw’: Elon Musk threatens...,Tesla CEO has been outspoken about stay-at-hom...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
10428,10689,‘Upskilling Workforce Critical to Organisation...,"The Head, Innovation and Sustainability at Hon...",0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
10429,10690,‘We’re not falling for it’: AOC tells McDonald...,Fast-food workers on strike in 15 cities to pu...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
10430,10691,£60m for wind farms... to close,[Source: Scottish Daily Mail] WIND farm operat...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [94]:
print("Value Counts of Categories:\n")
for category in df.columns[3:]:
    print(f"{category}: {df[category].sum()}")

Value Counts of Categories:

Activist_Investors: 218
Cost_Reduction: 2834
Covid_19: 2120
Digital_capabilities: 594
Diversity___Inclusiveness: 625
Headquarters_Relocation: 461
International_Expansions: 539
M_A: 265
Management_changes: 1112
Restructuring: 705
Rewards___benefits: 352
Spin_offs__Split_offs: 524
Tax_Risk: 546
Upskilling__reskilling: 608
Wage_Dispute: 615


The data is highly imbalanced and some of the categories seem to have a lot of data while some of the others don't. We will increase the amount of data for some of the categories by using backtranslation text augmentation method. 

In [95]:
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en'
)
    
def generate_augmented_text(text):
    augmented_text = back_translation_aug.augment(text)
    return augmented_text

In [98]:
## Example of Backtranslation
text = '''Canada's competition laws should be changed to prohibit cartel-like practices and wage-fixing deals 
in the country's grocery sector, a new report by the House of Commons industry committee says. The report comes 
a year after Canada's big three grocers.'''

print("Original Text: ", text)
print("Augmented Text: ", generate_augmented_text(text))

Original Text:  Canada's competition laws should be changed to prohibit cartel-like practices and wage-fixing deals 
in the country's grocery sector, a new report by the House of Commons industry committee says. The report comes 
a year after Canada's big three grocers.
Augmented Text:  Canada's competition laws should be amended to prohibit cartel-like practices and wage bargaining in the country's food sector, according to a new report by the House of Commons industry committee, one year after Canada's big three grocers.


In [96]:
def augment_category(category_label: str, df: pd.DataFrame):
    indexes = df[df[category_label] == 1].index

    for idx in tqdm(indexes):
        text = df.loc[idx]['ABSTRACT']
        df = df.append(df.loc[[idx]].assign(**{'ABSTRACT': generate_augmented_text(text)}), 
                  ignore_index=True)
    return df

In [97]:
categories_to_augment = ['Activist_Investors', 'M_A', 'Rewards___benefits', 'Digital_capabilities', 'Headquarters_Relocation',
                         'International_Expansions', 'Spin_offs__Split_offs', 'Tax_Risk']

for category in categories_to_augment:
    df = augment_category(category, df=df)

100%|██████████| 218/218 [10:19<00:00,  2.84s/it]
100%|██████████| 286/286 [14:03<00:00,  2.95s/it]
100%|██████████| 359/359 [17:50<00:00,  2.98s/it]
100%|██████████| 603/603 [27:49<00:00,  2.77s/it]
100%|██████████| 464/464 [21:31<00:00,  2.78s/it]
100%|██████████| 692/692 [32:52<00:00,  2.85s/it]
100%|██████████| 540/540 [25:45<00:00,  2.86s/it]
100%|██████████| 710/710 [35:42<00:00,  3.02s/it]


In [99]:
df

Unnamed: 0,ID,TITLE,ABSTRACT,Activist_Investors,Cost_Reduction,Covid_19,Digital_capabilities,Diversity___Inclusiveness,Headquarters_Relocation,International_Expansions,M_A,Management_changes,Restructuring,Rewards___benefits,Spin_offs__Split_offs,Tax_Risk,Upskilling__reskilling,Wage_Dispute
0,1,"Chevron, Total others slash production, cut a...",EnergyMix The world's five largest oil product...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Survey shows 4% of companies in Singapore sla...,[Source: TODAY] SINGAPORE - The livelihoods of...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,3,"""Dark day for Brighouse"" as two bank branches ...",[Source: examiner.co.uk] Barclays Bank and Yor...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,"""Helping Hands"": BASF donates 40,000 liters of...","BASF is supplying 40,000 liters of hand saniti...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,5,$2B Chicago area transit packaging group will ...,A $2 billion transit packaging company will re...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14299,9633,UPDATE 1-U.S. hedge fund launches activist cam...,"[Source: Reuters News] (Adds background, quote...",1,0,0,0,0,0,0,1,0,0,0,1,1,0,0
14300,4054,GE HEALTHCARE ACQUIRES PRISMATIC SENSORS.,GE Healthcare has acquired Prismatic Sensors A...,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0
14301,5614,LG Chem Plans to Spin off Battery Unit,[Source: Dow Jones Newswires Chinese] (MORE TO...,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0
14302,9633,UPDATE 1-U.S. hedge fund launches activist cam...,"[Source: Reuters News] (Adds background, quote...",1,0,0,0,0,0,0,1,0,0,0,1,1,0,0


After Augmentation, we will take a look at the value counts of each categories

In [103]:
df = df.drop_duplicates()
print("Value Counts of Categories:\n")
for category in df.columns[3:]:
    print(f"{category}: {df[category].sum()}")

Value Counts of Categories:

Activist_Investors: 454
Cost_Reduction: 3061
Covid_19: 2331
Digital_capabilities: 1204
Diversity___Inclusiveness: 637
Headquarters_Relocation: 933
International_Expansions: 1173
M_A: 671
Management_changes: 1191
Restructuring: 723
Rewards___benefits: 713
Spin_offs__Split_offs: 1058
Tax_Risk: 1162
Upskilling__reskilling: 634
Wage_Dispute: 616


In [118]:
df.to_csv('../data/train_augmented.csv', index=False)