#Import necessary library and other files

In [1]:
# Basic imports for data handling
import numpy as np
import pandas as pd
import os
import time
import gc
import random
# Imports for progress bar
from tqdm import tqdm_notebook as tqdm
# Keras imports for the model
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
# PyTorch imports for neural network functionality
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
# Gensim for loading word vectors
from gensim.models import KeyedVectors
# Google Colab import for drive mounting
from google.colab import drive
import matplotlib.pyplot as plt


In [2]:
from tqdm import tqdm
tqdm.pandas()


In [3]:
#Mount to the dir of dataset
drive = drive.mount('/content/drive')
data_dir = 'drive/MyDrive/kaggle_data'

Mounted at /content/drive


In [4]:
# to gpu
import torch
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
device = torch.device('cuda')

CUDA available: True


In [5]:
NUM_MODELS = 2
BATCH_SIZE = 256
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 4
MAX_LEN = 220


In [6]:
import zipfile

# ÂÆö‰πâÂéãÁº©Êñá‰ª∂ÁöÑË∑ØÂæÑ
zip_file_paths = [
    '/content/drive/MyDrive/kaggle_data/crawl-300d-2M.vec.zip',
    '/content/drive/MyDrive/kaggle_data/glove.840B.300d.txt.zip'
]

# Ëß£ÂéãÁº©Êñá‰ª∂
for zip_file_path in zip_file_paths:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall('/content/drive/MyDrive/kaggle_data/')


In [7]:
#È¢ÑËÆ≠ÁªÉÁöÑËØçÂêëÈáèÊñá‰ª∂

EMBEDDING_FILES = [
    '/content/drive/MyDrive/kaggle_data/crawl-300d-2M.vec',
    '/content/drive/MyDrive/kaggle_data/glove.840B.300d.txt'
]

CRAWL_EMBEDDING_PATH = '/content/drive/MyDrive/kaggle_data/crawl-300d-2M.vec'
GLOVE_EMBEDDING_PATH = '/content/drive/MyDrive/kaggle_data/glove.840B.300d.txt'

In [8]:
# ‰ΩøÁî®Êñá‰ª∂Ë∑ØÂæÑËØªÂèñ CSV Êñá‰ª∂
train_x = pd.read_csv(os.path.join(data_dir, 'train_x.csv'))
train_y = pd.read_csv(os.path.join(data_dir, 'train_y.csv'))
val_x = pd.read_csv(os.path.join(data_dir, 'val_x.csv'))
val_y = pd.read_csv(os.path.join(data_dir, 'val_y.csv'))
test_x = pd.read_csv(os.path.join(data_dir, 'test_x.csv'))

In [11]:
symbols_to_isolate = '.,?!-;*"‚Ä¶:‚Äî()%#$&_/@Ôºº„Éªœâ+=‚Äù‚Äú[]^‚Äì>\\¬∞<~‚Ä¢‚â†‚Ñ¢Àà ä…í‚àû¬ß{}¬∑œÑŒ±‚ù§‚ò∫…°|¬¢‚ÜíÃ∂`‚ù•‚îÅ‚î£‚î´‚îóÔºØ‚ñ∫‚òÖ¬©‚Äï…™‚úî¬Æ\x96\x92‚óè¬£‚ô•‚û§¬¥¬π‚òï‚âà√∑‚ô°‚óê‚ïë‚ñ¨‚Ä≤…îÀê‚Ç¨€©€û‚Ä†Œº‚úí‚û•‚ïê‚òÜÀå‚óÑ¬Ω ªœÄŒ¥Œ∑ŒªœÉŒµœÅŒΩ É‚ú¨Ôº≥ÔºµÔº∞Ôº•Ôº≤Ôº©Ôº¥‚òª¬±‚ôç¬µ¬∫¬æ‚úì‚óæÿüÔºé‚¨Ö‚ÑÖ¬ª–í–∞–≤‚ù£‚ãÖ¬ø¬¨‚ô´Ôº£Ôº≠Œ≤‚ñà‚ñì‚ñí‚ñë‚áí‚≠ê‚Ä∫¬°‚ÇÇ‚ÇÉ‚ùß‚ñ∞‚ñî‚óû‚ñÄ‚ñÇ‚ñÉ‚ñÑ‚ñÖ‚ñÜ‚ñá‚ÜôŒ≥ÃÑ‚Ä≥‚òπ‚û°¬´œÜ‚Öì‚Äû‚úãÔºö¬•Ã≤ÃÖÃÅ‚àô‚Äõ‚óá‚úè‚ñ∑‚ùì‚ùó¬∂ÀöÀôÔºâ—Å–∏ ø‚ú®„ÄÇ…ë\x80‚óïÔºÅÔºÖ¬Ø‚àíÔ¨ÇÔ¨Å‚ÇÅ¬≤ å¬º‚Å¥‚ÅÑ‚ÇÑ‚å†‚ô≠‚úò‚ï™‚ñ∂‚ò≠‚ú≠‚ô™‚òî‚ò†‚ôÇ‚òÉ‚òé‚úà‚úå‚ú∞‚ùÜ‚òô‚óã‚Ä£‚öìÂπ¥‚àé‚Ñí‚ñ™‚ñô‚òè‚ÖõÔΩÉÔΩÅÔΩì«Ä‚ÑÆ¬∏ÔΩó‚Äö‚àº‚Äñ‚Ñ≥‚ùÑ‚Üê‚òº‚ãÜ í‚äÇ„ÄÅ‚Öî¬®Õ°‡πè‚öæ‚öΩŒ¶√óŒ∏Ôø¶ÔºüÔºà‚ÑÉ‚è©‚òÆ‚ö†Êúà‚úä‚ùå‚≠ï‚ñ∏‚ñ†‚áå‚òê‚òë‚ö°‚òÑ«´‚ï≠‚à©‚ïÆÔºå‰æãÔºû ï…êÃ£Œî‚ÇÄ‚úû‚îà‚ï±‚ï≤‚ñè‚ñï‚îÉ‚ï∞‚ñä‚ñã‚ïØ‚î≥‚îä‚â•‚òí‚Üë‚òù…π‚úÖ‚òõ‚ô©‚òûÔº°Ôº™Ôº¢‚óî‚ó°‚Üì‚ôÄ‚¨ÜÃ±‚Ñè\x91‚†ÄÀ§‚ïö‚Ü∫‚á§‚àè‚úæ‚ó¶‚ô¨¬≥„ÅÆÔΩúÔºè‚àµ‚à¥‚àöŒ©¬§‚òú‚ñ≤‚Ü≥‚ñ´‚Äø‚¨á‚úßÔΩèÔΩñÔΩçÔºçÔºíÔºêÔºòÔºá‚Ä∞‚â§‚àïÀÜ‚öú‚òÅ'
symbols_to_delete = '\nüçï\rüêµüòë\xa0\ue014\t\uf818\uf04a\xadüò¢üê∂Ô∏è\uf0e0üòúüòéüëä\u200b\u200eüòÅÿπÿØŸàŸäŸáÿµŸÇÿ£ŸÜÿßÿÆŸÑŸâÿ®ŸÖÿ∫ÿ±üòçüíñüíµ–ïüëéüòÄüòÇ\u202a\u202cüî•üòÑüèªüí•·¥ç è Ä·¥á…¥·¥Ö·¥è·¥Ä·¥ã ú·¥ú ü·¥õ·¥Ñ·¥ò ô“ì·¥ä·¥°…¢üòãüëè◊©◊ú◊ï◊ù◊ë◊ôüò±‚Äº\x81„Ç®„É≥„Ç∏ÊïÖÈöú\u2009üöå·¥µÕûüåüüòäüò≥üòßüôÄüòêüòï\u200füëçüòÆüòÉüòò◊ê◊¢◊õ◊óüí©üíØ‚õΩüöÑüèº‡Æúüòñ·¥†üö≤‚Äêüòüüòàüí™üôèüéØüåπüòáüíîüò°\x7füëå·ºê·Ω∂ŒÆŒπ·Ω≤Œ∫·ºÄŒØ·øÉ·º¥ŒæüôÑÔº®üò†\ufeff\u2028üòâüò§‚õ∫üôÇ\u3000ÿ™ÿ≠ŸÉÿ≥ÿ©üëÆüíôŸÅÿ≤ÿ∑üòèüçæüéâüòû\u2008üèæüòÖüò≠üëªüò•üòîüòìüèΩüéÜüçªüçΩüé∂üå∫ü§îüò™\x08‚Äëüê∞üêáüê±üôÜüò®üôÉüíïùòäùò¶ùò≥ùò¢ùòµùò∞ùò§ùò∫ùò¥ùò™ùòßùòÆùò£üíóüíöÂú∞ÁçÑË∞∑—É–ª–∫–Ω–ü–æ–ê–ùüêæüêïüòÜ◊îüîóüöΩÊ≠åËàû‰ºéüôàüò¥üèøü§óüá∫üá∏–ºœÖ—Ç—ï‚§µüèÜüéÉüò©\u200aüå†üêüüí´üí∞üíé—ç–ø—Ä–¥\x95üñêüôÖ‚õ≤üç∞ü§êüëÜüôå\u2002üíõüôÅüëÄüôäüôâ\u2004À¢·µí ≥ ∏·¥º·¥∑·¥∫ ∑·µó ∞·µâ·µò\x13üö¨ü§ì\ue602üòµŒ¨ŒøœåœÇŒ≠·Ω∏◊™◊û◊ì◊£◊†◊®◊ö◊¶◊òüòíÕùüÜïüëÖüë•üëÑüîÑüî§üëâüë§üë∂üë≤üîõüéì\uf0b7\uf04c\x9f\x10ÊàêÈÉΩüò£‚è∫üòåü§ëüåèüòØ–µ—Öüò≤·º∏·æ∂·ΩÅüíûüöìüîîüìöüèÄüëê\u202düí§üçá\ue613Â∞èÂúüË±Üüè°‚ùî‚Åâ\u202füë†„Äã‡§ï‡§∞‡•ç‡§Æ‡§æüáπüáºüå∏Ëî°Ëã±Êñáüåûüé≤„É¨„ÇØ„Çµ„ÇπüòõÂ§ñÂõΩ‰∫∫ÂÖ≥Á≥ª–°–±üíãüíÄüéÑüíúü§¢ŸêŸé—å—ã–≥—è‰∏çÊòØ\x9c\x9düóë\u2005üíÉüì£üëø‡ºº„Å§‡ºΩüò∞·∏∑–ó–∑‚ñ±—ÜÔøºü§£ÂçñÊ∏©Âì•ÂçéËÆÆ‰ºö‰∏ãÈôç‰Ω†Â§±ÂéªÊâÄÊúâÁöÑÈí±Âä†ÊãøÂ§ßÂùèÁ®éÈ™óÂ≠êüêù„ÉÑüéÖ\x85üç∫ÿ¢ÿ•ÿ¥ÿ°üéµüåéÕü·ºîÊ≤πÂà´ÂÖãü§°ü§•üò¨ü§ß–π\u2003üöÄü§¥ ≤—à—á–ò–û–†–§–î–Ø–ú—é–∂üòùüñë·Ωê·ΩªœçÁâπÊÆä‰ΩúÊà¶Áæ§—âüí®ÂúÜÊòéÂõ≠◊ß‚Ñêüèàüò∫üåç‚èè·ªáüçîüêÆüçÅüçÜüçëüåÆüåØü§¶\u200dùìíùì≤ùìøùìµÏïàÏòÅÌïòÏÑ∏Ïöî–ñ—ô–ö—õüçÄüò´ü§§·ø¶ÊàëÂá∫ÁîüÂú®‰∫ÜÂèØ‰ª•ËØ¥ÊôÆÈÄöËØùÊ±âËØ≠Â•ΩÊûÅüéºüï∫üç∏ü•ÇüóΩüéáüéäüÜòü§†üë©üñíüö™Â§©‰∏ÄÂÆ∂‚ö≤\u2006‚ö≠‚öÜ‚¨≠‚¨Ø‚èñÊñ∞‚úÄ‚ïåüá´üá∑üá©üá™üáÆüá¨üáßüò∑üá®üá¶–•–®üåê\x1fÊùÄÈ∏°ÁªôÁå¥Áúã Åùó™ùóµùó≤ùóªùòÜùóºùòÇùóøùóÆùóπùó∂ùòáùóØùòÅùó∞ùòÄùòÖùóΩùòÑùó±üì∫œñ\u2000“Ø’Ω·¥¶·é•“ªÕ∫\u2007’∞\u2001…©ÔΩôÔΩÖ‡µ¶ÔΩå∆ΩÔΩàùêìùê°ùêûùê´ùêÆùêùùêöùêÉùêúùê©ùê≠ùê¢ùê®ùêß∆Ñ·¥®◊ü·ëØ‡ªêŒ§·èß‡Ø¶–Ü·¥ë‹Åùê¨ùê∞ùê≤ùêõùê¶ùêØùêëùêôùê£ùêáùêÇùêòùüé‘ú–¢·óû‡±¶„Äî·é´ùê≥ùêîùê±ùüîùüìùêÖüêãÔ¨Éüíòüíì—ëùò•ùòØùò∂üíêüåãüåÑüåÖùô¨ùôñùô®ùô§ùô£ùô°ùôÆùôòùô†ùôöùôôùôúùôßùô•ùô©ùô™ùôóùôûùôùùôõüë∫üê∑‚ÑãùêÄùê•ùê™üö∂ùô¢·ºπü§òÕ¶üí∏ÿ¨Ìå®Ìã∞Ôº∑ùôá·µªüëÇüëÉ…úüé´\uf0a7–ë–£—ñüö¢üöÇ‡™ó‡´Å‡™ú‡™∞‡™æ‡™§‡´Ä·øÜüèÉùì¨ùìªùì¥ùìÆùìΩùìº‚òòÔ¥æÃØÔ¥ø‚ÇΩ\ue807ùëªùíÜùíçùíïùíâùíìùíñùíÇùíèùíÖùíîùíéùíóùíäüëΩüòô\u200c–õ‚Äíüéæüëπ‚éåüèí‚õ∏ÂÖ¨ÂØìÂÖªÂÆ†Áâ©ÂêóüèÑüêÄüöëü§∑ÊìçÁæéùíëùíöùíêùë¥ü§ôüêíÊ¨¢ËøéÊù•Âà∞ÈòøÊãâÊñØ◊°◊§ùô´üêàùíåùôäùô≠ùôÜùôãùôçùòºùôÖÔ∑ªü¶ÑÂ∑®Êî∂Ëµ¢ÂæóÁôΩÈ¨ºÊÑ§ÊÄíË¶Å‰π∞È¢ù·∫Ωüöóüê≥ùüèùêüùüñùüëùüïùíÑùüóùê†ùôÑùôÉüëáÈîüÊñ§Êã∑ùó¢ùü≥ùü±ùü¨‚¶Å„Éû„É´„Éè„Éã„ÉÅ„É≠Ê†™ÂºèÁ§æ‚õ∑ÌïúÍµ≠Ïñ¥„Ñ∏„ÖìÎãàÕú ñùòøùôî‚Çµùí©‚ÑØùíæùìÅùí∂ùìâùìáùìäùìÉùìàùìÖ‚Ñ¥ùíªùíΩùìÄùìåùí∏ùìéùôèŒ∂ùôüùòÉùó∫ùüÆùü≠ùüØùü≤üëãü¶äÂ§ö‰º¶üêΩüéªüéπ‚õìüèπüç∑ü¶Ü‰∏∫Âíå‰∏≠ÂèãË∞äÁ•ùË¥∫‰∏éÂÖ∂ÊÉ≥Ë±°ÂØπÊ≥ïÂ¶ÇÁõ¥Êé•ÈóÆÁî®Ëá™Â∑±ÁåúÊú¨‰º†ÊïôÂ£´Ê≤°ÁßØÂîØËÆ§ËØÜÂü∫Áù£ÂæíÊõæÁªèËÆ©Áõ∏‰ø°ËÄ∂Á®£Â§çÊ¥ªÊ≠ªÊÄ™‰ªñ‰ΩÜÂΩì‰ª¨ËÅä‰∫õÊîøÊ≤ªÈ¢òÊó∂ÂÄôÊàòËÉúÂõ†Âú£ÊääÂÖ®Â†ÇÁªìÂ©öÂ≠©ÊÅêÊÉß‰∏îÊ†óË∞ìËøôÊ†∑Ëøò‚ôæüé∏ü§ïü§í‚õëüéÅÊâπÂà§Ê£ÄËÆ®üèùü¶Åüôãüò∂Ï•êÏä§ÌÉ±Ìä∏Î§ºÎèÑÏÑùÏú†Í∞ÄÍ≤©Ïù∏ÏÉÅÏù¥Í≤ΩÏ†úÌô©ÏùÑÎ†µÍ≤åÎßåÎì§ÏßÄÏïäÎ°ùÏûòÍ¥ÄÎ¶¨Ìï¥ÏïºÌï©Îã§Ï∫êÎÇòÏóêÏÑúÎåÄÎßàÏ¥àÏôÄÌôîÏïΩÍ∏àÏùòÌíàÎü∞ÏÑ±Î∂ÑÍ∞àÎïåÎäîÎ∞òÎìúÏãúÌóàÎêúÏÇ¨Ïö©üî´üëÅÂá∏·Ω∞üí≤üóØùôà·ºåùíáùíàùíòùíÉùë¨ùë∂ùïæùñôùñóùñÜùñéùñåùñçùñïùñäùñîùñëùñâùñìùñêùñúùñûùñöùñáùïøùñòùñÑùñõùñíùñãùñÇùï¥ùñüùñàùï∏üëëüöøüí°Áü•ÂΩºÁôæ\uf005ùôÄùíõùë≤ùë≥ùëæùíãùüíüò¶ùôíùòæùòΩüèêùò©ùò®·Ωº·πëùë±ùëπùë´ùëµùë™üá∞üáµüëæ·ìá·íß·î≠·êÉ·êß·ê¶·ë≥·ê®·ìÉ·ìÇ·ë≤·ê∏·ë≠·ëé·ìÄ·ê£üêÑüéàüî®üêéü§ûüê∏üíüüé∞üåùüõ≥ÁÇπÂáªÊü•Áâàüç≠ùë•ùë¶ùëßÔºÆÔºßüë£\uf020„Å£üèâ—Ñüí≠üé•Œûüê¥üë®ü§≥ü¶ç\x0büç©ùëØùííüòóùüêüèÇüë≥üçóüïâüê≤⁄Ü€åùëÆùóïùó¥üçíÍú•‚≤£‚≤èüêë‚è∞ÈâÑ„É™‰∫ã‰ª∂—óüíä„Äå„Äç\uf203\uf09a\uf222\ue608\uf202\uf099\uf469\ue607\uf410\ue600ÁáªË£Ω„Ç∑ËôöÂÅΩÂ±ÅÁêÜÂ±à–ìùë©ùë∞ùíÄùë∫üå§ùó≥ùóúùóôùó¶ùóßüçä·Ω∫·ºà·º°œá·øñŒõ‚§èüá≥ùíôœà’Å’¥’•’º’°’µ’´’∂÷Ä÷Ç’§’±ÂÜ¨Ëá≥·ΩÄùíÅüîπü§öüçéùë∑üêÇüíÖùò¨ùò±ùò∏ùò∑ùòêùò≠ùòìùòñùòπùò≤ùò´⁄©Œíœéüí¢ŒúŒüŒùŒëŒïüá±‚ô≤ùùà‚Ü¥üíí‚äò»ªüö¥üñïüñ§ü•òüìçüëà‚ûïüö´üé®üåëüêªùêéùêçùêäùë≠ü§ñüééüòºüï∑ÔΩáÔΩíÔΩéÔΩîÔΩâÔΩÑÔΩïÔΩÜÔΩÇÔΩãùü∞üá¥üá≠üáªüá≤ùóûùó≠ùóòùó§üëºüìâüçüüç¶üåàüî≠„Ääüêäüêç\uf10a·Éö⁄°üê¶\U0001f92f\U0001f92aüê°üí≥·º±üôáùó∏ùóüùó†ùó∑ü•ú„Åï„Çà„ÅÜ„Å™„Çâüîº'
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""‚Äú‚Äù‚Äô' + '‚àûŒ∏√∑Œ±‚Ä¢√†‚àíŒ≤‚àÖ¬≥œÄ‚Äò‚Çπ¬¥¬∞¬£‚Ç¨\√ó‚Ñ¢‚àö¬≤‚Äî‚Äì&'
small_caps_mapping = {
    "·¥Ä": "a", " ô": "b", "·¥Ñ": "c", "·¥Ö": "d", "·¥á": "e", "“ì": "f", "…¢": "g", " ú": "h", "…™": "i",
    "·¥ä": "j", "·¥ã": "k", " ü": "l", "·¥ç": "m", "…¥": "n", "·¥è": "o", "·¥ò": "p", "«´": "q", " Ä": "r",
    "s": "s", "·¥õ": "t", "·¥ú": "u", "·¥†": "v", "·¥°": "w", "x": "x", " è": "y", "·¥¢": "z"}
contraction_mapping = {
    "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
    "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've":
    "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
    "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
    "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
    "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's":"this is","that'd": "that would",
    "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have",
    "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
    "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will",
    "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have",
    "trump's": "trump is", "obama's": "obama is", "canada's": "canada is", "today's": "today is"}
specail_signs = { "‚Ä¶": "...", "‚ÇÇ": "2"}
specials = ["‚Äô", "‚Äò", "¬¥", "`"]
from nltk.tokenize.treebank import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()


isolate_dict = {ord(c):f' {c} ' for c in symbols_to_isolate}
remove_dict = {ord(c):f'' for c in symbols_to_delete}


def handle_punctuation(x):
    x = x.translate(remove_dict)
    x = x.translate(isolate_dict)
    return x

def handle_contractions(x):
    x = tokenizer.tokenize(x)
    return x

def fix_quote(x):
    x = [x_[1:] if x_.startswith("'") else x_ for x_ in x]
    x = ' '.join(x)
    return x

def preprocess(x):
    x = handle_punctuation(x)
    x = handle_contractions(x)
    x = fix_quote(x)
    return x

In [12]:
train_x['string'] = train_x['string'].progress_apply(lambda x:preprocess(x))
val_x['string'] = val_x['string'].progress_apply(lambda x:preprocess(x))

test_x['string'] = test_x['string'].progress_apply(lambda x:preprocess(x))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 269038/269038 [01:34<00:00, 2848.10it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 45180/45180 [00:12<00:00, 3557.36it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 133782/133782 [00:39<00:00, 3354.77it/s]


In [13]:
train_x.head(), train_x.shape

(   index                                             string
 0      0                even up here . . . . . . . BLACKS !
 1      1  Blame men . There s always an excuse to blame ...
 2      2  You have no business making any comments on th...
 3      3  `` Let s get the black folks and the white fol...
 4      4  I guess the issue is people not willing to put...,
 (269038, 2))

#EDA

#Prepare the model

In [14]:
def seed_everything(seed=10086):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [15]:
NUM_MODELS = 2
BATCH_SIZE = 512
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 1
MAX_LEN = 200
MAX_FEATURES = 120000

IDENTITY_COLUMNS = ['male',	'female',	'LGBTQ',	'christian',	'muslim',	'other_religions',	'black',	'white']
AUX_COLUMNS = ['severe_toxicity',	'obscene',	'threat',	'insult',	'identity_attack',	'sexual_explicit','y']
TEXT_COLUMN = 'string'
TARGET_COLUMN = 'y'


In [None]:
# Overall
weights = np.ones((len(train_x),)) / 4
# Subgroup
weights += (train_y[IDENTITY_COLUMNS].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) / 4
# Background Positive, Subgroup Negative
weights += (( (train_y['y'].values>=0.5).astype(bool).astype(np.int) +
   (train_y[IDENTITY_COLUMNS].fillna(0).values<0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
# Background Negative, Subgroup Positive
weights += (( (train_y['y'].values<0.5).astype(bool).astype(np.int) +
   (train_y[IDENTITY_COLUMNS].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
loss_weight = 1.0 / weights.mean()

In [17]:
#preprocessing the data for later usage
x_train = train_x[TEXT_COLUMN].astype(str)

#y_aux_train = train_y[AUX_COLUMNS].values
#y_train = np.vstack([(train_y['y'].values>=0.5).astype(np.int),weights]).T

x_val = val_x[TEXT_COLUMN].astype(str)

x_test = test_x[TEXT_COLUMN].astype(str)

In [18]:
#tokenizing the corpus, limiting the tokenizer to 120000 words
#for column in IDENTITY_COLUMNS + [TARGET_COLUMN]:
#train_y[column] = np.where(train_y[column] >= 0.5, True, False)

tokenizer = text.Tokenizer(num_words=MAX_FEATURES, filters='',lower=False)
tokenizer.fit_on_texts(list(x_train) + list(x_val) + list(x_test))


In [19]:
#Sequence Bucketing
x_train = tokenizer.texts_to_sequences(x_train)
x_val = tokenizer.texts_to_sequences(x_val)
x_test = tokenizer.texts_to_sequences(x_test)

In [20]:
#making sure that every sentence is of equal length by adding padding
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_val = sequence.pad_sequences(x_val, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

In [21]:
train_y = pd.read_csv(os.path.join(data_dir, 'train_y.csv'))
train_y = train_y.drop('from_source_domain', axis=1)

In [22]:
# Â∞ÜÂ∫èÂàóÊãÜÂàÜ‰∏∫200ÂàóÔºåÂπ∂ÂàõÂª∫DataFrame
df_x = pd.DataFrame(x_train, columns=['feature_' + str(i) for i in range(200)])

# Ê£ÄÊü• DataFrame ÁªìÊûÑ
print(df_x.shape)

(269038, 200)


In [23]:
df_x.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_190,feature_191,feature_192,feature_193,feature_194,feature_195,feature_196,feature_197,feature_198,feature_199
0,0,0,0,0,0,0,0,0,0,0,...,143,1,1,1,1,1,1,1,26470,38
1,0,0,0,0,0,0,0,0,0,0,...,1538,4,661,135,14,2,4610,6,75,1
2,0,0,0,0,0,0,0,0,0,0,...,72,334,335,10,8,2641,4,51,2302,1
3,0,0,0,0,0,0,0,0,0,0,...,439,5,2,104,439,50,429,225,7248,11
4,0,0,0,0,0,0,0,0,0,0,...,27,17,895,4,293,26,2033,5,2505,1


In [24]:
def get_tail_label(df):
    """
    Give tail label colums of the given target dataframe

    args
    df: pandas.DataFrame, target label df whose tail label has to identified

    return
    tail_label: list, a list containing column name of all the tail label
    """
    columns = df.columns
    n = len(columns)
    irpl = np.zeros(n)
    for column in range(n):
        irpl[column] = df[columns[column]].value_counts()[1]
    irpl = max(irpl)/irpl
    mir = np.average(irpl)
    tail_label = []
    for i in range(n):
        if irpl[i] > mir:
            tail_label.append(columns[i])
    return tail_label

tail_labels =['black','white', 'LGBTQ','muslim']
def get_index(df):
  """
  give the index of all tail_label rows
  args
  df: pandas.DataFrame, target label df from which index for tail label has to identified

  return
  index: list, a list containing index number of all the tail label
  """
  tail_labels = ['black','white', 'LGBTQ','muslim']
  index = set()
  for tail_label in tail_labels:
    sub_index = set(df[df[tail_label]==1].index)
    index = index.union(sub_index)
  return list(index)

def get_minority_instace(X, y):
    """
    Give minority dataframe containing all the tail labels

    args
    X: pandas.DataFrame, the feature vector dataframe
    y: pandas.DataFrame, the target vector dataframe

    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    """
    index = get_index(y)
    X_sub = X[X.index.isin(index)].reset_index(drop = True)
    y_sub = y[y.index.isin(index)].reset_index(drop = True)
    return X_sub, y_sub


In [25]:
from sklearn.neighbors import NearestNeighbors

def nearest_neighbour(X):
    """
    Give index of 5 nearest neighbor of all the instance

    args
    X: np.array, array whose nearest neighbor has to find

    return
    indices: list of list, index of 5 NN of each element in X
    """
    nbs=NearestNeighbors(n_neighbors=5,metric='euclidean',algorithm='kd_tree').fit(X)
    euclidean,indices= nbs.kneighbors(X)
    return indices

def MLSMOTE(X,y, n_sample):
    """
    Give the augmented data using MLSMOTE algorithm

    args
    X: pandas.DataFrame, input vector DataFrame
    y: pandas.DataFrame, feature vector dataframe
    n_sample: int, number of newly generated sample

    return
    new_X: pandas.DataFrame, augmented feature vector data
    target: pandas.DataFrame, augmented target vector data
    """
    indices2 = nearest_neighbour(X)
    n = len(indices2)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0,n-1)
        neighbour = random.choice(indices2[reference,1:])
        all_point = indices2[reference]
        nn_df = y[y.index.isin(all_point)]
        ser = nn_df.sum(axis = 0, skipna = True)
        target[i] = np.array([1 if val>2 else 0 for val in ser])
        ratio = random.random()
        gap = X.loc[reference,:] - X.loc[neighbour,:]
        new_X[i] = np.array(X.loc[reference,:] + ratio * gap)
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    new_X = pd.concat([X, new_X], axis=0)
    target = pd.concat([y, target], axis=0)
    return new_X, target


In [26]:
X_sub, y_sub = get_minority_instace(df_x,train_y)#Getting minority instance of that datframe

In [27]:
y_sub.shape

(42930, 16)

In [28]:
X_res,y_res =MLSMOTE(X_sub, y_sub, 50000) #Applying MLSMOTE to augment the dataframe

In [30]:
# ÂêàÂπ∂ÁâπÂæÅÊï∞ÊçÆÂ∏ß
df_x_augmented = pd.concat([df_x, X_res], ignore_index=True)

# ÂêàÂπ∂Ê†áÁ≠æÊï∞ÊçÆÂ∏ß
train_y_augmented = pd.concat([train_y, y_res], ignore_index=True)

df_x_augmented.shape, train_y_augmented.shape

((361968, 200), (361968, 16))

In [31]:
x_train_augmented = df_x_augmented.to_numpy()

In [33]:
x_train_augmented.shape

(361968, 200)

In [None]:

from imblearn.over_sampling import SMOTE

# ÂàùÂßãÂåñSMOTEÂÆû‰æã
smote = SMOTE()

# Â∫îÁî®SMOTE‰∫é‰∏ªË¶ÅÁõÆÊ†áÁâπÂæÅÂàó
X_train_smote, y_train_smote = smote.fit_resample(df_x, train_y['y'])

# ËøáÈááÊ†∑ÂêéÁöÑÊ†∑Êú¨Êï∞Èáè
n_samples = len(y_train_smote)

# ÂàõÂª∫‰∏Ä‰∏™Á©∫ÁöÑDataFrameÊù•Â≠òÂÇ®ËøáÈááÊ†∑ÂêéÁöÑËæÖÂä©ÁâπÂæÅ
new_aux_features = pd.DataFrame(index=range(n_samples), columns=AUX_COLUMNS+IDENTITY_COLUMNS)

# ÂØπÊØè‰∏™ËæÖÂä©ÁâπÂæÅÂàóËøõË°åËøáÈááÊ†∑Â§ÑÁêÜ
for column in (AUX_COLUMNS+IDENTITY_COLUMNS):
    # ËøáÈááÊ†∑ÊØè‰∏™ËæÖÂä©ÁâπÂæÅÂàó
    # Ê≥®ÊÑèÔºöÊàë‰ª¨‰ΩøÁî®Áõ∏ÂêåÁöÑSMOTEÂÆû‰æãÔºåËøôÂ∞ÜÁ°Æ‰øùÊØè‰∏™ÁâπÂæÅÂàó‰∏é‰∏ªË¶ÅÁõÆÊ†áÂàóÁöÑËøáÈááÊ†∑ÂêåÊ≠•
    _, new_aux_feature = smote.fit_resample(np.arange(len(train_y)).reshape(-1, 1), train_y[column])

    # Â∞ÜËøáÈááÊ†∑ÁªìÊûúÂ≠òÂÇ®Âà∞Êñ∞ÁöÑDataFrame‰∏≠
    new_aux_features[column] = new_aux_feature

# Áé∞Âú®Êàë‰ª¨Êúâ‰∫Ü‰∏Ä‰∏™ÂåÖÂê´ÊâÄÊúâËøáÈááÊ†∑ËæÖÂä©ÁâπÂæÅÁöÑDataFrame
# Â∞ÜÂéüÂßãÁöÑ'y'ÂàóÊõøÊç¢‰∏∫ËøáÈááÊ†∑ÂêéÁöÑ'y'Âàó
#new_aux_features['y'] = y_train_smote

# Êõ¥Êñ∞train_y DataFrame
#train_y = new_aux_features


In [None]:
new_aux_features['y'] = y_train_smote
train_y = new_aux_features

In [None]:
train_y.head()

Unnamed: 0,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit,y,male,female,LGBTQ,christian,muslim,other_religions,black,white
0,0,0,0,0,1,0,1,0,0.0,0,0,0,0,1,0
1,0,0,0,0,1,0,1,1,1.0,0,0,0,0,0,0
2,0,0,0,1,0,0,1,0,0.0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0.0,0,0,0,0,1,1
4,0,0,0,1,0,0,1,0,0.0,0,0,0,0,0,0


In [None]:
# Overall
weights = np.ones((len(x_train_augmented),)) / 4
# Subgroup
weights += (train_y_augmented[IDENTITY_COLUMNS].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) / 4
# Background Positive, Subgroup Negative
weights += (( (train_y_augmented['y'].values>=0.5).astype(bool).astype(np.int) +
   (train_y_augmented[IDENTITY_COLUMNS].fillna(0).values<0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
# Background Negative, Subgroup Positive
weights += (( (train_y_augmented['y'].values<0.5).astype(bool).astype(np.int) +
   (train_y_augmented[IDENTITY_COLUMNS].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
loss_weight = 1.0 / weights.mean()

In [37]:

y_aux_train = train_y_augmented[AUX_COLUMNS].values
y_train = np.vstack([(train_y_augmented['y'].values>=0.5).astype(np.int),weights]).T


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_train = np.vstack([(train_y_augmented['y'].values>=0.5).astype(np.int),weights]).T


In [None]:
#ÂèØËßÜÂåñtrain_yÂêÑ‰∏™ÁâπÂæÅÁöÑÂàÜÂ∏ÉÊÉÖÂÜµ

label_sums = y_res.sum()
total_samples = len(y_res)
label_proportions = label_sums / total_samples

# ÂàõÂª∫‰∏Ä‰∏™Êñ∞ÁöÑ DataFrameÔºåÂåÖÂê´Êï∞ÈáèÂíåÊØî‰æã
label_distribution = pd.DataFrame({
    'Counts': label_sums,
    'Proportions': label_proportions
})
# ÂèØËßÜÂåñÊï∞Èáè
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

label_distribution['Counts'].plot(kind='bar', ax=ax[0], color='skyblue')
ax[0].set_title('Counts of Each Category')
ax[0].set_ylabel('Number of Occurrences')
ax[0].set_xlabel('Category')

# ÂèØËßÜÂåñÊØî‰æã
label_distribution['Proportions'].plot(kind='bar', ax=ax[1], color='lightgreen')
ax[1].set_title('Proportions of Each Category')
ax[1].set_ylabel('Proportion of Total Samples')
ax[1].set_xlabel('Category')

plt.tight_layout()
plt.show()


In [39]:
#functions to build our embedding matrix

#Êé•Êî∂‰∏Ä‰∏™ÂçïËØçÂèäÂÖ∂ÂØπÂ∫îÁöÑÂµåÂÖ•ÂêëÈáèÔºåËøîÂõû‰∏Ä‰∏™ÂåÖÂê´ÂçïËØçÂíåÂÆÉÁöÑ NumPy Êï∞ÁªÑÂΩ¢ÂºèÁöÑÂêëÈáèÁöÑÂÖÉÁªÑ„ÄÇ
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

#ÊâìÂºÄ‰∏Ä‰∏™È¢ÑËÆ≠ÁªÉÁöÑÂµåÂÖ•ÂêëÈáèÊñá‰ª∂ÔºàÂ¶ÇGloVeÊàñWord2VecÔºâÔºåÈÄêË°åËØªÂèñÂπ∂‰ΩøÁî® get_coefs ÂáΩÊï∞Êù•Âª∫Á´ã‰∏Ä‰∏™Â≠óÂÖ∏ÔºåÂ≠óÂÖ∏‰∏≠ÊØè‰∏™ÂçïËØçÂØπÂ∫î‰∏Ä‰∏™ÂµåÂÖ•ÂêëÈáè„ÄÇ
def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)

#ÂÆÉÂàùÂßãÂåñ‰∏Ä‰∏™ÂÖ®Èõ∂ÁöÑÂµåÂÖ•Áü©ÈòµÔºåÁü©ÈòµÁöÑË°åÊï∞ÊØîÂçïËØçÊÄªÊï∞Â§ö‰∏Ä‰∏™ÔºàÂåÖÊã¨Â°´ÂÖÖÁ¥¢ÂºïÔºâÔºåÂàóÊï∞ÊòØÂµåÂÖ•ÂêëÈáèÁöÑÂ§ßÂ∞èÔºàËøôÈáåÊòØ300Ôºâ„ÄÇ
#ÁÑ∂ÂêéÂáΩÊï∞ÈÅçÂéÜÂçïËØçÁ¥¢ÂºïÔºåÂ∞ùËØïÂ∞ÜÈ¢ÑËÆ≠ÁªÉÁöÑÂêëÈáèÂàÜÈÖçÂà∞ÂµåÂÖ•Áü©Èòµ‰∏≠Áõ∏Â∫îÁöÑË°å„ÄÇÂ¶ÇÊûúÂçïËØçÁ¥¢Âºï‰∏≠ÁöÑÂçïËØçÂú®ÂµåÂÖ•Á¥¢Âºï‰∏≠Êâæ‰∏çÂà∞ÔºåÂ∞±‰øùÁïôËØ•Ë°å‰∏∫Èõ∂ÔºåÊúâÊïàÂú∞Â∞ÜÂÖ∂ËßÜ‰∏∫Êú™Áü•ÂçïËØç„ÄÇ
def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            pass
    return embedding_matrix

In [40]:
#building the embedding matrix
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)

In [41]:
from keras.losses import binary_crossentropy
def custom_loss(y_true, y_pred):
    return binary_crossentropy(K.reshape(y_true[:,0],(-1,1)), y_pred) * y_true[:,1]

In [42]:
def build_model(embedding_matrix, num_aux_targets, loss_weight):
    words = Input(shape=(MAX_LEN,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)

    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    result = Dense(1, activation='sigmoid')(hidden)
    aux_result = Dense(num_aux_targets, activation='sigmoid')(hidden)

    model = Model(inputs=words, outputs=[result, aux_result])
    model.compile(loss=[custom_loss,'binary_crossentropy'], loss_weights=[loss_weight, 1.0], optimizer='adam',metrics=['accuracy'])

    return model


In [43]:
checkpoint_predictions = []

In [44]:
#fitting model on whole training data
from keras import backend as K

for model_idx in range(NUM_MODELS):
    print('Model ', model_idx)
    model = build_model(embedding_matrix, y_aux_train.shape[-1], loss_weight)
    for global_epoch in range(1):
        model.fit(
            x_train_augmented,
            [y_train, y_aux_train],
            batch_size=BATCH_SIZE,
            epochs=1,
            verbose=1,
            callbacks=[
                LearningRateScheduler(lambda _: 1e-3 * (0.55 ** global_epoch))

            ]
        )
        checkpoint_predictions.append(model.predict(x_val, batch_size=2048)[0].flatten())


  #ÊØèË°å‰ª£Ë°®‰∏Ä‰∏™epochÁöÑËÆ≠ÁªÉÁªìÊûúÔºåÂ±ïÁ§∫‰∫ÜÊ®°ÂûãÂú®ËØ•epochËÆ≠ÁªÉÂêéÁöÑÊÄßËÉΩ„ÄÇÂØπ‰∫éÊØè‰∏™epochÔºå
  #ÈÉΩÊòæÁ§∫‰∫ÜËÆ≠ÁªÉÈõÜ‰∏äÁöÑÊÄßËÉΩÊåáÊ†á„ÄÇ‰æãÂ¶ÇÔºåÁ¨¨‰∏ÄË°åÊòæÁ§∫Á¨¨‰∏Ä‰∏™epochÁöÑÊçüÂ§±ÂíåÂáÜÁ°ÆÁéáÔºåÁÑ∂ÂêéÈöèÁùÄepochÁöÑÂ¢ûÂä†ÔºåÊçüÂ§±ÂáèÂ∞ëÔºåÂáÜÁ°ÆÁéáÊèêÈ´ò„ÄÇ

Model  0
Model  1


In [45]:
def group_accuracies(prediction, y):
    """
    Compute the percentage of correctly classified instances within each group and round them to four decimal places.
    arguments:
        prediction [pandas.DataFrame]: dataframe with 2 columns (index and pred)
        y [pandas.DataFrame]: dataframe containing the metadata with actual labels
    returns:
        accuracies_dict [dict]: dictionary with group names as keys and their percentage of correctly classified instances as values
    """
    y.loc[prediction.index, 'pred'] = prediction.pred

    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']
    accuracies_dict = {}
    for category in categories:
        group = y.loc[y[category] == 1]  # Assuming 1 indicates the presence of the category
        if not group.empty:
            group_accuracy = (group['y'] == group['pred']).mean()
            accuracies_dict[category] = round(group_accuracy * 100, 4)  # Convert to percentage and round to four decimal places

    return accuracies_dict

# Note: This function now assumes that `y['y']` contains the actual labels and `prediction['pred']` contains the predicted labels.


In [51]:
val_predictions = np.average(checkpoint_predictions, axis=0)
pred = [1 if pred > 0.50 else 0 for pred in val_predictions]

prediction_df = pd.DataFrame.from_dict({
    'ID': val_y.index,
    'pred': pred
})

group_accs = group_accuracies(prediction_df, val_y)
print(group_accs)

{'male': 89.6118, 'female': 90.1375, 'LGBTQ': 80.2334, 'christian': 93.2088, 'muslim': 83.0806, 'other_religions': 87.4239, 'black': 77.3002, 'white': 78.7234}


In [None]:
train_x_pred =

In [47]:
checkpoint_predictions_test=[]
checkpoint_predictions_test.append(model.predict(x_test, batch_size=2048)[0].flatten())



In [48]:
predictions = np.average(checkpoint_predictions_test, axis=0)
pred = [1 if pred > 0.5 else 0 for pred in predictions]

submission = pd.DataFrame.from_dict({
    'ID': test_x.index,
    'pred': pred
})
submission.to_csv('submission.csv', index=False)

In [52]:
# ËÆ°ÁÆó pred_df ‰∏≠ 'pred' Âàó‰∏≠ 0 Âíå 1 Âá∫Áé∞ÁöÑÊ¨°Êï∞
pred_counts = submission['pred'].value_counts()

# ÊâìÂç∞Âá∫ÁªìÊûú
print(pred_counts)

0    118998
1     14784
Name: pred, dtype: int64
