#Import necessary library and other files

In [1]:
# Basic imports for data handling
import numpy as np
import pandas as pd
import os
import time
import gc
import random
# Imports for progress bar
from tqdm import tqdm_notebook as tqdm
# Keras imports for the model
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
# PyTorch imports for neural network functionality
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
# Gensim for loading word vectors
from gensim.models import KeyedVectors
# Google Colab import for drive mounting
from google.colab import drive
import matplotlib.pyplot as plt


In [2]:
from tqdm import tqdm
tqdm.pandas()


In [3]:
#Mount to the dir of dataset
drive = drive.mount('/content/drive')
data_dir = 'drive/MyDrive/kaggle_data'

Mounted at /content/drive


In [4]:
# to gpu
import torch
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
device = torch.device('cuda')

CUDA available: True


In [5]:
NUM_MODELS = 2
BATCH_SIZE = 256
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 4
MAX_LEN = 220


In [6]:
import zipfile

# 定义压缩文件的路径
zip_file_paths = [
    '/content/drive/MyDrive/kaggle_data/crawl-300d-2M.vec.zip',
    '/content/drive/MyDrive/kaggle_data/glove.840B.300d.txt.zip'
]

# 解压缩文件
for zip_file_path in zip_file_paths:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall('/content/drive/MyDrive/kaggle_data/')


In [7]:
#预训练的词向量文件

EMBEDDING_FILES = [
    '/content/drive/MyDrive/kaggle_data/crawl-300d-2M.vec',
    '/content/drive/MyDrive/kaggle_data/glove.840B.300d.txt'
]

CRAWL_EMBEDDING_PATH = '/content/drive/MyDrive/kaggle_data/crawl-300d-2M.vec'
GLOVE_EMBEDDING_PATH = '/content/drive/MyDrive/kaggle_data/glove.840B.300d.txt'

In [8]:
# 使用文件路径读取 CSV 文件
train_x = pd.read_csv(os.path.join(data_dir, 'train_x.csv'))
train_y = pd.read_csv(os.path.join(data_dir, 'train_y.csv'))
val_x = pd.read_csv(os.path.join(data_dir, 'val_x.csv'))
val_y = pd.read_csv(os.path.join(data_dir, 'val_y.csv'))
test_x = pd.read_csv(os.path.join(data_dir, 'test_x.csv'))

In [11]:
symbols_to_isolate = '.,?!-;*"…:—()%#$&_/@＼・ω+=”“[]^–>\\°<~•≠™ˈʊɒ∞§{}·τα❤☺ɡ|¢→̶`❥━┣┫┗Ｏ►★©―ɪ✔®\x96\x92●£♥➤´¹☕≈÷♡◐║▬′ɔː€۩۞†μ✒➥═☆ˌ◄½ʻπδηλσερνʃ✬ＳＵＰＥＲＩＴ☻±♍µº¾✓◾؟．⬅℅»Вав❣⋅¿¬♫ＣＭβ█▓▒░⇒⭐›¡₂₃❧▰▔◞▀▂▃▄▅▆▇↙γ̄″☹➡«φ⅓„✋：¥̲̅́∙‛◇✏▷❓❗¶˚˙）сиʿ✨。ɑ\x80◕！％¯−ﬂﬁ₁²ʌ¼⁴⁄₄⌠♭✘╪▶☭✭♪☔☠♂☃☎✈✌✰❆☙○‣⚓年∎ℒ▪▙☏⅛ｃａｓǀ℮¸ｗ‚∼‖ℳ❄←☼⋆ʒ⊂、⅔¨͡๏⚾⚽Φ×θ￦？（℃⏩☮⚠月✊❌⭕▸■⇌☐☑⚡☄ǫ╭∩╮，例＞ʕɐ̣Δ₀✞┈╱╲▏▕┃╰▊▋╯┳┊≥☒↑☝ɹ✅☛♩☞ＡＪＢ◔◡↓♀⬆̱ℏ\x91⠀ˤ╚↺⇤∏✾◦♬³の｜／∵∴√Ω¤☜▲↳▫‿⬇✧ｏｖｍ－２０８＇‰≤∕ˆ⚜☁'
symbols_to_delete = '\n🍕\r🐵😑\xa0\ue014\t\uf818\uf04a\xad😢🐶️\uf0e0😜😎👊\u200b\u200e😁عدويهصقأناخلىبمغر😍💖💵Е👎😀😂\u202a\u202c🔥😄🏻💥ᴍʏʀᴇɴᴅᴏᴀᴋʜᴜʟᴛᴄᴘʙғᴊᴡɢ😋👏שלוםבי😱‼\x81エンジ故障\u2009🚌ᴵ͞🌟😊😳😧🙀😐😕\u200f👍😮😃😘אעכח💩💯⛽🚄🏼ஜ😖ᴠ🚲‐😟😈💪🙏🎯🌹😇💔😡\x7f👌ἐὶήιὲκἀίῃἴξ🙄Ｈ😠\ufeff\u2028😉😤⛺🙂\u3000تحكسة👮💙فزط😏🍾🎉😞\u2008🏾😅😭👻😥😔😓🏽🎆🍻🍽🎶🌺🤔😪\x08‑🐰🐇🐱🙆😨🙃💕𝘊𝘦𝘳𝘢𝘵𝘰𝘤𝘺𝘴𝘪𝘧𝘮𝘣💗💚地獄谷улкнПоАН🐾🐕😆ה🔗🚽歌舞伎🙈😴🏿🤗🇺🇸мυтѕ⤵🏆🎃😩\u200a🌠🐟💫💰💎эпрд\x95🖐🙅⛲🍰🤐👆🙌\u2002💛🙁👀🙊🙉\u2004ˢᵒʳʸᴼᴷᴺʷᵗʰᵉᵘ\x13🚬🤓\ue602😵άοόςέὸתמדףנרךצט😒͝🆕👅👥👄🔄🔤👉👤👶👲🔛🎓\uf0b7\uf04c\x9f\x10成都😣⏺😌🤑🌏😯ех😲Ἰᾶὁ💞🚓🔔📚🏀👐\u202d💤🍇\ue613小土豆🏡❔⁉\u202f👠》कर्मा🇹🇼🌸蔡英文🌞🎲レクサス😛外国人关系Сб💋💀🎄💜🤢َِьыгя不是\x9c\x9d🗑\u2005💃📣👿༼つ༽😰ḷЗз▱ц￼🤣卖温哥华议会下降你失去所有的钱加拿大坏税骗子🐝ツ🎅\x85🍺آإشء🎵🌎͟ἔ油别克🤡🤥😬🤧й\u2003🚀🤴ʲшчИОРФДЯМюж😝🖑ὐύύ特殊作戦群щ💨圆明园קℐ🏈😺🌍⏏ệ🍔🐮🍁🍆🍑🌮🌯🤦\u200d𝓒𝓲𝓿𝓵안영하세요ЖљКћ🍀😫🤤ῦ我出生在了可以说普通话汉语好极🎼🕺🍸🥂🗽🎇🎊🆘🤠👩🖒🚪天一家⚲\u2006⚭⚆⬭⬯⏖新✀╌🇫🇷🇩🇪🇮🇬🇧😷🇨🇦ХШ🌐\x1f杀鸡给猴看ʁ𝗪𝗵𝗲𝗻𝘆𝗼𝘂𝗿𝗮𝗹𝗶𝘇𝗯𝘁𝗰𝘀𝘅𝗽𝘄𝗱📺ϖ\u2000үսᴦᎥһͺ\u2007հ\u2001ɩｙｅ൦ｌƽｈ𝐓𝐡𝐞𝐫𝐮𝐝𝐚𝐃𝐜𝐩𝐭𝐢𝐨𝐧Ƅᴨןᑯ໐ΤᏧ௦Іᴑ܁𝐬𝐰𝐲𝐛𝐦𝐯𝐑𝐙𝐣𝐇𝐂𝐘𝟎ԜТᗞ౦〔Ꭻ𝐳𝐔𝐱𝟔𝟓𝐅🐋ﬃ💘💓ё𝘥𝘯𝘶💐🌋🌄🌅𝙬𝙖𝙨𝙤𝙣𝙡𝙮𝙘𝙠𝙚𝙙𝙜𝙧𝙥𝙩𝙪𝙗𝙞𝙝𝙛👺🐷ℋ𝐀𝐥𝐪🚶𝙢Ἱ🤘ͦ💸ج패티Ｗ𝙇ᵻ👂👃ɜ🎫\uf0a7БУі🚢🚂ગુજરાતીῆ🏃𝓬𝓻𝓴𝓮𝓽𝓼☘﴾̯﴿₽\ue807𝑻𝒆𝒍𝒕𝒉𝒓𝒖𝒂𝒏𝒅𝒔𝒎𝒗𝒊👽😙\u200cЛ‒🎾👹⎌🏒⛸公寓养宠物吗🏄🐀🚑🤷操美𝒑𝒚𝒐𝑴🤙🐒欢迎来到阿拉斯ספ𝙫🐈𝒌𝙊𝙭𝙆𝙋𝙍𝘼𝙅ﷻ🦄巨收赢得白鬼愤怒要买额ẽ🚗🐳𝟏𝐟𝟖𝟑𝟕𝒄𝟗𝐠𝙄𝙃👇锟斤拷𝗢𝟳𝟱𝟬⦁マルハニチロ株式社⛷한국어ㄸㅓ니͜ʖ𝘿𝙔₵𝒩ℯ𝒾𝓁𝒶𝓉𝓇𝓊𝓃𝓈𝓅ℴ𝒻𝒽𝓀𝓌𝒸𝓎𝙏ζ𝙟𝘃𝗺𝟮𝟭𝟯𝟲👋🦊多伦🐽🎻🎹⛓🏹🍷🦆为和中友谊祝贺与其想象对法如直接问用自己猜本传教士没积唯认识基督徒曾经让相信耶稣复活死怪他但当们聊些政治题时候战胜因圣把全堂结婚孩恐惧且栗谓这样还♾🎸🤕🤒⛑🎁批判检讨🏝🦁🙋😶쥐스탱트뤼도석유가격인상이경제황을렵게만들지않록잘관리해야합다캐나에서대마초와화약금의품런성분갈때는반드시허된사용🔫👁凸ὰ💲🗯𝙈Ἄ𝒇𝒈𝒘𝒃𝑬𝑶𝕾𝖙𝖗𝖆𝖎𝖌𝖍𝖕𝖊𝖔𝖑𝖉𝖓𝖐𝖜𝖞𝖚𝖇𝕿𝖘𝖄𝖛𝖒𝖋𝖂𝕴𝖟𝖈𝕸👑🚿💡知彼百\uf005𝙀𝒛𝑲𝑳𝑾𝒋𝟒😦𝙒𝘾𝘽🏐𝘩𝘨ὼṑ𝑱𝑹𝑫𝑵𝑪🇰🇵👾ᓇᒧᔭᐃᐧᐦᑳᐨᓃᓂᑲᐸᑭᑎᓀᐣ🐄🎈🔨🐎🤞🐸💟🎰🌝🛳点击查版🍭𝑥𝑦𝑧ＮＧ👣\uf020っ🏉ф💭🎥Ξ🐴👨🤳🦍\x0b🍩𝑯𝒒😗𝟐🏂👳🍗🕉🐲چی𝑮𝗕𝗴🍒ꜥⲣⲏ🐑⏰鉄リ事件ї💊「」\uf203\uf09a\uf222\ue608\uf202\uf099\uf469\ue607\uf410\ue600燻製シ虚偽屁理屈Г𝑩𝑰𝒀𝑺🌤𝗳𝗜𝗙𝗦𝗧🍊ὺἈἡχῖΛ⤏🇳𝒙ψՁմեռայինրւդձ冬至ὀ𝒁🔹🤚🍎𝑷🐂💅𝘬𝘱𝘸𝘷𝘐𝘭𝘓𝘖𝘹𝘲𝘫کΒώ💢ΜΟΝΑΕ🇱♲𝝈↴💒⊘Ȼ🚴🖕🖤🥘📍👈➕🚫🎨🌑🐻𝐎𝐍𝐊𝑭🤖🎎😼🕷ｇｒｎｔｉｄｕｆｂｋ𝟰🇴🇭🇻🇲𝗞𝗭𝗘𝗤👼📉🍟🍦🌈🔭《🐊🐍\uf10aლڡ🐦\U0001f92f\U0001f92a🐡💳ἱ🙇𝗸𝗟𝗠𝗷🥜さようなら🔼'
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
small_caps_mapping = {
    "ᴀ": "a", "ʙ": "b", "ᴄ": "c", "ᴅ": "d", "ᴇ": "e", "ғ": "f", "ɢ": "g", "ʜ": "h", "ɪ": "i",
    "ᴊ": "j", "ᴋ": "k", "ʟ": "l", "ᴍ": "m", "ɴ": "n", "ᴏ": "o", "ᴘ": "p", "ǫ": "q", "ʀ": "r",
    "s": "s", "ᴛ": "t", "ᴜ": "u", "ᴠ": "v", "ᴡ": "w", "x": "x", "ʏ": "y", "ᴢ": "z"}
contraction_mapping = {
    "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
    "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've":
    "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
    "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
    "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
    "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's":"this is","that'd": "that would",
    "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have",
    "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
    "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will",
    "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have",
    "trump's": "trump is", "obama's": "obama is", "canada's": "canada is", "today's": "today is"}
specail_signs = { "…": "...", "₂": "2"}
specials = ["’", "‘", "´", "`"]
from nltk.tokenize.treebank import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()


isolate_dict = {ord(c):f' {c} ' for c in symbols_to_isolate}
remove_dict = {ord(c):f'' for c in symbols_to_delete}


def handle_punctuation(x):
    x = x.translate(remove_dict)
    x = x.translate(isolate_dict)
    return x

def handle_contractions(x):
    x = tokenizer.tokenize(x)
    return x

def fix_quote(x):
    x = [x_[1:] if x_.startswith("'") else x_ for x_ in x]
    x = ' '.join(x)
    return x

def preprocess(x):
    x = handle_punctuation(x)
    x = handle_contractions(x)
    x = fix_quote(x)
    return x

In [12]:
train_x['string'] = train_x['string'].progress_apply(lambda x:preprocess(x))
val_x['string'] = val_x['string'].progress_apply(lambda x:preprocess(x))

test_x['string'] = test_x['string'].progress_apply(lambda x:preprocess(x))

100%|██████████| 269038/269038 [01:34<00:00, 2848.10it/s]
100%|██████████| 45180/45180 [00:12<00:00, 3557.36it/s]
100%|██████████| 133782/133782 [00:39<00:00, 3354.77it/s]


In [13]:
train_x.head(), train_x.shape

(   index                                             string
 0      0                even up here . . . . . . . BLACKS !
 1      1  Blame men . There s always an excuse to blame ...
 2      2  You have no business making any comments on th...
 3      3  `` Let s get the black folks and the white fol...
 4      4  I guess the issue is people not willing to put...,
 (269038, 2))

#EDA

#Prepare the model

In [14]:
def seed_everything(seed=10086):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [15]:
NUM_MODELS = 2
BATCH_SIZE = 512
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 1
MAX_LEN = 200
MAX_FEATURES = 120000

IDENTITY_COLUMNS = ['male',	'female',	'LGBTQ',	'christian',	'muslim',	'other_religions',	'black',	'white']
AUX_COLUMNS = ['severe_toxicity',	'obscene',	'threat',	'insult',	'identity_attack',	'sexual_explicit','y']
TEXT_COLUMN = 'string'
TARGET_COLUMN = 'y'


In [None]:
# Overall
weights = np.ones((len(train_x),)) / 4
# Subgroup
weights += (train_y[IDENTITY_COLUMNS].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) / 4
# Background Positive, Subgroup Negative
weights += (( (train_y['y'].values>=0.5).astype(bool).astype(np.int) +
   (train_y[IDENTITY_COLUMNS].fillna(0).values<0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
# Background Negative, Subgroup Positive
weights += (( (train_y['y'].values<0.5).astype(bool).astype(np.int) +
   (train_y[IDENTITY_COLUMNS].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
loss_weight = 1.0 / weights.mean()

In [17]:
#preprocessing the data for later usage
x_train = train_x[TEXT_COLUMN].astype(str)

#y_aux_train = train_y[AUX_COLUMNS].values
#y_train = np.vstack([(train_y['y'].values>=0.5).astype(np.int),weights]).T

x_val = val_x[TEXT_COLUMN].astype(str)

x_test = test_x[TEXT_COLUMN].astype(str)

In [18]:
#tokenizing the corpus, limiting the tokenizer to 120000 words
#for column in IDENTITY_COLUMNS + [TARGET_COLUMN]:
#train_y[column] = np.where(train_y[column] >= 0.5, True, False)

tokenizer = text.Tokenizer(num_words=MAX_FEATURES, filters='',lower=False)
tokenizer.fit_on_texts(list(x_train) + list(x_val) + list(x_test))


In [19]:
#Sequence Bucketing
x_train = tokenizer.texts_to_sequences(x_train)
x_val = tokenizer.texts_to_sequences(x_val)
x_test = tokenizer.texts_to_sequences(x_test)

In [20]:
#making sure that every sentence is of equal length by adding padding
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_val = sequence.pad_sequences(x_val, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

In [21]:
train_y = pd.read_csv(os.path.join(data_dir, 'train_y.csv'))
train_y = train_y.drop('from_source_domain', axis=1)

In [22]:
# 将序列拆分为200列，并创建DataFrame
df_x = pd.DataFrame(x_train, columns=['feature_' + str(i) for i in range(200)])

# 检查 DataFrame 结构
print(df_x.shape)

(269038, 200)


In [23]:
df_x.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_190,feature_191,feature_192,feature_193,feature_194,feature_195,feature_196,feature_197,feature_198,feature_199
0,0,0,0,0,0,0,0,0,0,0,...,143,1,1,1,1,1,1,1,26470,38
1,0,0,0,0,0,0,0,0,0,0,...,1538,4,661,135,14,2,4610,6,75,1
2,0,0,0,0,0,0,0,0,0,0,...,72,334,335,10,8,2641,4,51,2302,1
3,0,0,0,0,0,0,0,0,0,0,...,439,5,2,104,439,50,429,225,7248,11
4,0,0,0,0,0,0,0,0,0,0,...,27,17,895,4,293,26,2033,5,2505,1


In [24]:
def get_tail_label(df):
    """
    Give tail label colums of the given target dataframe

    args
    df: pandas.DataFrame, target label df whose tail label has to identified

    return
    tail_label: list, a list containing column name of all the tail label
    """
    columns = df.columns
    n = len(columns)
    irpl = np.zeros(n)
    for column in range(n):
        irpl[column] = df[columns[column]].value_counts()[1]
    irpl = max(irpl)/irpl
    mir = np.average(irpl)
    tail_label = []
    for i in range(n):
        if irpl[i] > mir:
            tail_label.append(columns[i])
    return tail_label

tail_labels =['black','white', 'LGBTQ','muslim']
def get_index(df):
  """
  give the index of all tail_label rows
  args
  df: pandas.DataFrame, target label df from which index for tail label has to identified

  return
  index: list, a list containing index number of all the tail label
  """
  tail_labels = ['black','white', 'LGBTQ','muslim']
  index = set()
  for tail_label in tail_labels:
    sub_index = set(df[df[tail_label]==1].index)
    index = index.union(sub_index)
  return list(index)

def get_minority_instace(X, y):
    """
    Give minority dataframe containing all the tail labels

    args
    X: pandas.DataFrame, the feature vector dataframe
    y: pandas.DataFrame, the target vector dataframe

    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    """
    index = get_index(y)
    X_sub = X[X.index.isin(index)].reset_index(drop = True)
    y_sub = y[y.index.isin(index)].reset_index(drop = True)
    return X_sub, y_sub


In [25]:
from sklearn.neighbors import NearestNeighbors

def nearest_neighbour(X):
    """
    Give index of 5 nearest neighbor of all the instance

    args
    X: np.array, array whose nearest neighbor has to find

    return
    indices: list of list, index of 5 NN of each element in X
    """
    nbs=NearestNeighbors(n_neighbors=5,metric='euclidean',algorithm='kd_tree').fit(X)
    euclidean,indices= nbs.kneighbors(X)
    return indices

def MLSMOTE(X,y, n_sample):
    """
    Give the augmented data using MLSMOTE algorithm

    args
    X: pandas.DataFrame, input vector DataFrame
    y: pandas.DataFrame, feature vector dataframe
    n_sample: int, number of newly generated sample

    return
    new_X: pandas.DataFrame, augmented feature vector data
    target: pandas.DataFrame, augmented target vector data
    """
    indices2 = nearest_neighbour(X)
    n = len(indices2)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0,n-1)
        neighbour = random.choice(indices2[reference,1:])
        all_point = indices2[reference]
        nn_df = y[y.index.isin(all_point)]
        ser = nn_df.sum(axis = 0, skipna = True)
        target[i] = np.array([1 if val>2 else 0 for val in ser])
        ratio = random.random()
        gap = X.loc[reference,:] - X.loc[neighbour,:]
        new_X[i] = np.array(X.loc[reference,:] + ratio * gap)
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    new_X = pd.concat([X, new_X], axis=0)
    target = pd.concat([y, target], axis=0)
    return new_X, target


In [26]:
X_sub, y_sub = get_minority_instace(df_x,train_y)#Getting minority instance of that datframe

In [27]:
y_sub.shape

(42930, 16)

In [28]:
X_res,y_res =MLSMOTE(X_sub, y_sub, 50000) #Applying MLSMOTE to augment the dataframe

In [30]:
# 合并特征数据帧
df_x_augmented = pd.concat([df_x, X_res], ignore_index=True)

# 合并标签数据帧
train_y_augmented = pd.concat([train_y, y_res], ignore_index=True)

df_x_augmented.shape, train_y_augmented.shape

((361968, 200), (361968, 16))

In [31]:
x_train_augmented = df_x_augmented.to_numpy()

In [33]:
x_train_augmented.shape

(361968, 200)

In [None]:

from imblearn.over_sampling import SMOTE

# 初始化SMOTE实例
smote = SMOTE()

# 应用SMOTE于主要目标特征列
X_train_smote, y_train_smote = smote.fit_resample(df_x, train_y['y'])

# 过采样后的样本数量
n_samples = len(y_train_smote)

# 创建一个空的DataFrame来存储过采样后的辅助特征
new_aux_features = pd.DataFrame(index=range(n_samples), columns=AUX_COLUMNS+IDENTITY_COLUMNS)

# 对每个辅助特征列进行过采样处理
for column in (AUX_COLUMNS+IDENTITY_COLUMNS):
    # 过采样每个辅助特征列
    # 注意：我们使用相同的SMOTE实例，这将确保每个特征列与主要目标列的过采样同步
    _, new_aux_feature = smote.fit_resample(np.arange(len(train_y)).reshape(-1, 1), train_y[column])

    # 将过采样结果存储到新的DataFrame中
    new_aux_features[column] = new_aux_feature

# 现在我们有了一个包含所有过采样辅助特征的DataFrame
# 将原始的'y'列替换为过采样后的'y'列
#new_aux_features['y'] = y_train_smote

# 更新train_y DataFrame
#train_y = new_aux_features


In [None]:
new_aux_features['y'] = y_train_smote
train_y = new_aux_features

In [None]:
train_y.head()

Unnamed: 0,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit,y,male,female,LGBTQ,christian,muslim,other_religions,black,white
0,0,0,0,0,1,0,1,0,0.0,0,0,0,0,1,0
1,0,0,0,0,1,0,1,1,1.0,0,0,0,0,0,0
2,0,0,0,1,0,0,1,0,0.0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0.0,0,0,0,0,1,1
4,0,0,0,1,0,0,1,0,0.0,0,0,0,0,0,0


In [None]:
# Overall
weights = np.ones((len(x_train_augmented),)) / 4
# Subgroup
weights += (train_y_augmented[IDENTITY_COLUMNS].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) / 4
# Background Positive, Subgroup Negative
weights += (( (train_y_augmented['y'].values>=0.5).astype(bool).astype(np.int) +
   (train_y_augmented[IDENTITY_COLUMNS].fillna(0).values<0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
# Background Negative, Subgroup Positive
weights += (( (train_y_augmented['y'].values<0.5).astype(bool).astype(np.int) +
   (train_y_augmented[IDENTITY_COLUMNS].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
loss_weight = 1.0 / weights.mean()

In [37]:

y_aux_train = train_y_augmented[AUX_COLUMNS].values
y_train = np.vstack([(train_y_augmented['y'].values>=0.5).astype(np.int),weights]).T


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_train = np.vstack([(train_y_augmented['y'].values>=0.5).astype(np.int),weights]).T


In [None]:
#可视化train_y各个特征的分布情况

label_sums = y_res.sum()
total_samples = len(y_res)
label_proportions = label_sums / total_samples

# 创建一个新的 DataFrame，包含数量和比例
label_distribution = pd.DataFrame({
    'Counts': label_sums,
    'Proportions': label_proportions
})
# 可视化数量
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

label_distribution['Counts'].plot(kind='bar', ax=ax[0], color='skyblue')
ax[0].set_title('Counts of Each Category')
ax[0].set_ylabel('Number of Occurrences')
ax[0].set_xlabel('Category')

# 可视化比例
label_distribution['Proportions'].plot(kind='bar', ax=ax[1], color='lightgreen')
ax[1].set_title('Proportions of Each Category')
ax[1].set_ylabel('Proportion of Total Samples')
ax[1].set_xlabel('Category')

plt.tight_layout()
plt.show()


In [39]:
#functions to build our embedding matrix

#接收一个单词及其对应的嵌入向量，返回一个包含单词和它的 NumPy 数组形式的向量的元组。
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

#打开一个预训练的嵌入向量文件（如GloVe或Word2Vec），逐行读取并使用 get_coefs 函数来建立一个字典，字典中每个单词对应一个嵌入向量。
def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)

#它初始化一个全零的嵌入矩阵，矩阵的行数比单词总数多一个（包括填充索引），列数是嵌入向量的大小（这里是300）。
#然后函数遍历单词索引，尝试将预训练的向量分配到嵌入矩阵中相应的行。如果单词索引中的单词在嵌入索引中找不到，就保留该行为零，有效地将其视为未知单词。
def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            pass
    return embedding_matrix

In [40]:
#building the embedding matrix
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)

In [41]:
from keras.losses import binary_crossentropy
def custom_loss(y_true, y_pred):
    return binary_crossentropy(K.reshape(y_true[:,0],(-1,1)), y_pred) * y_true[:,1]

In [42]:
def build_model(embedding_matrix, num_aux_targets, loss_weight):
    words = Input(shape=(MAX_LEN,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)

    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    result = Dense(1, activation='sigmoid')(hidden)
    aux_result = Dense(num_aux_targets, activation='sigmoid')(hidden)

    model = Model(inputs=words, outputs=[result, aux_result])
    model.compile(loss=[custom_loss,'binary_crossentropy'], loss_weights=[loss_weight, 1.0], optimizer='adam',metrics=['accuracy'])

    return model


In [43]:
checkpoint_predictions = []

In [44]:
#fitting model on whole training data
from keras import backend as K

for model_idx in range(NUM_MODELS):
    print('Model ', model_idx)
    model = build_model(embedding_matrix, y_aux_train.shape[-1], loss_weight)
    for global_epoch in range(1):
        model.fit(
            x_train_augmented,
            [y_train, y_aux_train],
            batch_size=BATCH_SIZE,
            epochs=1,
            verbose=1,
            callbacks=[
                LearningRateScheduler(lambda _: 1e-3 * (0.55 ** global_epoch))

            ]
        )
        checkpoint_predictions.append(model.predict(x_val, batch_size=2048)[0].flatten())


  #每行代表一个epoch的训练结果，展示了模型在该epoch训练后的性能。对于每个epoch，
  #都显示了训练集上的性能指标。例如，第一行显示第一个epoch的损失和准确率，然后随着epoch的增加，损失减少，准确率提高。

Model  0
Model  1


In [45]:
def group_accuracies(prediction, y):
    """
    Compute the percentage of correctly classified instances within each group and round them to four decimal places.
    arguments:
        prediction [pandas.DataFrame]: dataframe with 2 columns (index and pred)
        y [pandas.DataFrame]: dataframe containing the metadata with actual labels
    returns:
        accuracies_dict [dict]: dictionary with group names as keys and their percentage of correctly classified instances as values
    """
    y.loc[prediction.index, 'pred'] = prediction.pred

    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']
    accuracies_dict = {}
    for category in categories:
        group = y.loc[y[category] == 1]  # Assuming 1 indicates the presence of the category
        if not group.empty:
            group_accuracy = (group['y'] == group['pred']).mean()
            accuracies_dict[category] = round(group_accuracy * 100, 4)  # Convert to percentage and round to four decimal places

    return accuracies_dict

# Note: This function now assumes that `y['y']` contains the actual labels and `prediction['pred']` contains the predicted labels.


In [51]:
val_predictions = np.average(checkpoint_predictions, axis=0)
pred = [1 if pred > 0.50 else 0 for pred in val_predictions]

prediction_df = pd.DataFrame.from_dict({
    'ID': val_y.index,
    'pred': pred
})

group_accs = group_accuracies(prediction_df, val_y)
print(group_accs)

{'male': 89.6118, 'female': 90.1375, 'LGBTQ': 80.2334, 'christian': 93.2088, 'muslim': 83.0806, 'other_religions': 87.4239, 'black': 77.3002, 'white': 78.7234}


In [None]:
train_x_pred =

In [47]:
checkpoint_predictions_test=[]
checkpoint_predictions_test.append(model.predict(x_test, batch_size=2048)[0].flatten())



In [48]:
predictions = np.average(checkpoint_predictions_test, axis=0)
pred = [1 if pred > 0.5 else 0 for pred in predictions]

submission = pd.DataFrame.from_dict({
    'ID': test_x.index,
    'pred': pred
})
submission.to_csv('submission.csv', index=False)

In [52]:
# 计算 pred_df 中 'pred' 列中 0 和 1 出现的次数
pred_counts = submission['pred'].value_counts()

# 打印出结果
print(pred_counts)

0    118998
1     14784
Name: pred, dtype: int64
