In [1]:
# load packages
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import stopwords as sw
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import spacy
import gensim.downloader as api
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


## 1. load data


In [3]:
# load datasets: since only focused on report, so train.csv and val.csv were used
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [4]:
id = '1zTa79IzS1uSW0wVILnVHsVx97xNbCb88'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('train.csv')  
# https://drive.google.com/file/d/1zTa79IzS1uSW0wVILnVHsVx97xNbCb88/view?usp=sharing #train
# https://drive.google.com/file/d/1913G0dNuYpWh0v-Fh2RUoq1piMfLTIOa/view?usp=sharing #vali
# https://drive.google.com/file/d/1ysKwXTo2J2w4Vd9hALSJNhMSIEIQv2X6/view?usp=sharing #test
# https://drive.google.com/file/d/1-KpmplTKfKfoAo8v6b-urrwF6iHQomzp/view?usp=sharing. #sample
id = '1913G0dNuYpWh0v-Fh2RUoq1piMfLTIOa'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('val.csv')  
id = '1ysKwXTo2J2w4Vd9hALSJNhMSIEIQv2X6'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('test_without_labels.csv') 
id = '1-KpmplTKfKfoAo8v6b-urrwF6iHQomzp'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('sample.csv')

#Reference: COMP5046, Lab05,https://colab.research.google.com/drive/1qgMqtdKGy9geQ4IUx3pd8PovUQLFswW8#scrollTo=vh1rYL3NIavi

In [5]:
#load data
trainset=pd.read_csv('train.csv')
valset=pd.read_csv('val.csv')
testset=pd.read_csv('test_without_labels.csv')
trainset.head(5)

Unnamed: 0,sents,labels
0,wow,O
1,WTF,T
2,wpe wpe,O O
3,hahaha,O
4,wtf,T


In [6]:
testset

Unnamed: 0,sents
0,FUCKER
1,hahha
2,ggggg
3,macropyre
4,Boom
...,...
495,REPORT SPECTRE RAGE QUIT [SEPA] AND HIM FRIEND...
496,sf feeder auto lose
497,GG [SEPA] COMMEND SUPPORTS
498,how [SEPA] WTF [SEPA] THIS IS NOT EVEN REALITY


In [7]:

train_label=list(trainset.labels)
val_label=list(valset.labels)

In [8]:
train_sentences=list(trainset.sents)
val_sentences=list(valset.sents)
test_sentences = list(testset.sents)

##1.1Data processing

In [9]:
# split all labels by space
train_label_split=[]
val_label_split=[]
for i in train_label:
    train_label_split.append(i.split(" "))
for j in val_label:
    val_label_split.append(j.split(" "))

# Split all data parts by space
tokenized_train=[]
tokenized_val=[]
tokenized_test=[]
for i in train_sentences:
    token=i.lower().split(" ")
    tokenized_train.append(token)
for j in val_sentences:
    token=j.lower().split(" ")
    tokenized_val.append(token)
for k in test_sentences:
    token=k.lower().split(" ")
    tokenized_test.append(token)
#token all the train,validation and test dataset and lower it's capital

#reference: comp5046,lab02,https://colab.research.google.com/drive/1-l7gzLZ71ERuJ_ktG1nKTU6G8UuEvseN?usp=sharing

In [10]:
concate_tokens=tokenized_train+tokenized_val+tokenized_test

In [11]:
len(tokenized_train)

26078

In [12]:
len(tokenized_val)

8705

In [13]:
len(tokenized_test)

500

##2. Input embedding

2.1 word embedding

In [14]:
#make a sentence dictionary
# Generate word_to_ix
word_to_ix = {}
for sentence in concate_tokens: #all tokens
    for word in sentence:
        word = word.lower()
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

word_list = list(word_to_ix.keys()) # get all keys of tokens

# Generate tag_to_ix
START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG:0, STOP_TAG:1}
for tags in train_label_split + val_label_split: #train and valid lables
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

#reference: comp5046,lab09, https://colab.research.google.com/drive/1efZZFttmHKXHbQtNzAVHzjf8FUUD9cWR?usp=sharing

In [15]:
tag_to_ix

{'<START>': 0,
 '<STOP>': 1,
 'C': 8,
 'D': 7,
 'O': 2,
 'P': 4,
 'S': 6,
 'SEPA': 5,
 'T': 3}

In [16]:
#for output later
ix_to_tag = dict(zip([i for i in tag_to_ix.values()],[i for i in tag_to_ix.keys()]))

In [17]:
ix_to_tag

{0: '<START>',
 1: '<STOP>',
 2: 'O',
 3: 'T',
 4: 'P',
 5: 'SEPA',
 6: 'S',
 7: 'D',
 8: 'C'}

In [18]:
# Convert dataset to idxs
def to_index(data, to_ix):
    input_index_list = []
    for sent in data:
          input_index_list.append([to_ix[w] for w in sent])
    return input_index_list

train_input_index =  to_index(tokenized_train,word_to_ix)
train_output_index = to_index(train_label_split,tag_to_ix)
val_input_index = to_index(tokenized_val,word_to_ix)
val_output_index = to_index(val_label_split,tag_to_ix)
test_input_index = to_index(tokenized_test,word_to_ix)
#test have no label

#reference: comp5046,lab09, https://colab.research.google.com/drive/1efZZFttmHKXHbQtNzAVHzjf8FUUD9cWR?usp=sharing

2.2 POS tag & parse

In [None]:
#reference: comp5046,lab07, https://colab.research.google.com/drive/1vfhLu945Wi00WPhWAMBVQe4RbUIdqH05?usp=sharing#scrollTo=rFBFgEEkIaQ-

In [19]:
# Get all pos tags
def pos_tag(doc):
    pos=[]
    for i in doc:
        tags=[]
        for w,t in nltk.pos_tag(i):
            tags.append(t)
        pos.append(tags)
    return pos

pos_tags=pos_tag(concate_tokens)

In [20]:
# use Word2Vec:SkigGram to train pos tags
word_to_pos=Word2Vec(sentences=pos_tags,size=20,window=3,min_count=1, workers=4, sg=1)

In [21]:
# Get all parse dependency
#load the spacy api with the pre-trained statistical models for English. English multi-task CNN trained on OntoNotes
nlp=spacy.load("en_core_web_sm")
def parse_depend(word):
    parse_sent=[]
    for i in word:
        parse=nlp(' '.join(i))
        x=[]
        for j in parse:
            x.append(j.dep_)            
        parse_sent.append(x[:len(i)])
    return parse_sent

parse_sents=parse_depend(concate_tokens)


In [22]:
# use Word2Vec:SkipGram to train all parses 
word_to_parse=Word2Vec(sentences=parse_sents,size=20,window=3,min_count=1,workers=4,sg=1)

In [23]:
#Create dictionaries to store all pos tags and parse
word_2_pos={}
for i in range(0,len(concate_tokens)):
    for x in range(0,len(concate_tokens[i])):
        word_2_pos[concate_tokens[i][x]]=word_to_pos[pos_tags[i][x]]
word_2_parse={} 
for i in range(0,len(concate_tokens)):
    for x in range(0,len(concate_tokens[i])):
        word_2_parse[concate_tokens[i][x]]=word_to_parse[parse_sents[i][x]]

#reference: comp5046,lab07, https://colab.research.google.com/drive/1vfhLu945Wi00WPhWAMBVQe4RbUIdqH05?usp=sharing#scrollTo=rFBFgEEkIaQ-

  """
  if __name__ == '__main__':


2.3 domian & enbedding matrix 

In [104]:
corpus_dota ="""AA 
Initialism for Ancient Apparition.
AC 
Initialism for Assault Cuirass.
Aggro 
Abbreviation for aggression. The programmed aggression of AI controlled towers and creeps. Refers to getting the attention of a particular hostile unit, e.g. "I have aggro" means a hostile unit is focusing on attacking you.
AM 
Initialism for Anti-Mage.
AoE 
Initialism for Area of Effect. It refers to a spell, attack, or effect that affects an area around a point (for example, Dragon Slave is an "AoE spell"). Also used to refer to the range or size of one such spell/attack/effect, for example Reverse Polarity affects enemies in a 410 radius, therefore it has an AoE of 410, or "410 AoE".
AW 
Initialism for Arc Warden.
B 
Initialism for "Back" or "Get Back". Used to call for a retreat. A player saying "b" is usually suggesting that everyone back up or run away.
BAT 
Abbreviation for Base Attack Time, which determines how long an unbuffed unit with 0 agility and 0% bonus attack speed has to wait to attack again. Lower BAT units are able to attack more frequently. Most heroes have a BAT of 1.70. It is considered better to have a lower BAT and worse to have a higher BAT.
Backdoor 
The act of attacking enemy structures while a player's own lane creeps have not reached the structure yet. Usually ineffective due to backdoor protection.
Ball 
May refer to Io or snowballing or close positioning of teammates when pushing highground ('don't ball for ravage')
Bara, Barathum 
Refers to Spirit Breaker's name as he was commonly referenced in Dota 1.
Barracks
Structures found in each lane that when destroyed permanently grant super creeps to the team that destroyed it.
Basi 
refers to Ring of Basilius
BB 
Initialism for Bristleback or Buyback.
BD 
Initialism for backdooring.
BF 
Initialism for Battle Fury, also often referred to as "bfury", or Butterfly.
BH 
Initialism for either Bounty Hunter, or Enigma's ultimate, Black Hole.
Bird 
May refer to Phoenix, Skywrath Mage (and his spells), Visage's Familiars, or the flying courier.
BKB 
Initialism for Black King Bar.
Blocking 
Refers to one of the following: the act of "blocking" creeps with your hero's body as they run down the lane, impeding their movement so that the creeps meet closer toward your tower; the act of deliberately and repeatedly moving directly in front of an enemy hero (in the same manner as creep blocking) to impede their movement, often to help allies attack them; the act of "blocking" a neutral camp with a unit or a ward -- because if there is a unit or ward in the spawn box at X:00 on the clock, the camp will not spawn new creeps.
BM 
Initialism for Beastmaster, Brewmaster, Broodmother, Blade Mail or "bad manners".
Bot 
Refers to either the bottom lane of the map, or an AI-controlled hero.
BoT, BoTs
Initialism for Boots of Travel.
Book 
Refers to Necronomicon, short for "necro book". Can also be referred to as book1, book2, or book3 based on its level.
Bottle Crow 
The tactic of giving one's Bottle to the courier, then sending it on a round trip back to base to refill the bottle and give it back to the owner. This keeps the player's HP/MP healthy, and they do not have to miss any exp/gold from lane refilling it themselves. Note that any bottle that is not full slows the courier by 30%, and the act of bottle crowing may hamper player's allies as it gets harder to deliver their own items while the courier is making trips. (no longer possible, patched out)
BRB 
Initialism for "be right back".
Break 
Refers to either the effect of disabling the effects of a hero's passive abilities or the other passive ability of Tranquil Boots.
BS 
Initialism for Bloodseeker or Bloodstone. May also be used as an acronym for "bullshit".
Buff 
A beneficial spell or effect placed on a unit. Refers to the opposite of a debuff.
Burst or Burst Damage 
Refers to a high amount of damage dealt over a short period of time. Usually involves nukes.
Carry 
A type of hero which can overpower the enemy team in the late game. These heroes tend to scale very well with gold and experience, and thus require large sums of it in order to be successful.
Caster 
Either refers to a hero whose primary function is to cast spells prolifically, or a live person who commentates on an ongoing game.
CC 
Initialism for Crowd Control. See disable.
CD 
Initialism for cooldown or Captain's Draft.
CDR 
Initialism for cooldown reduction.
Chicken or Chick 
Refers to Animal Courier.
Chieftain 
See Tauren Chieftain.
CK 
Initialism for Chaos Knight. May also refer to "creep kill" (see CS or last hit).
CM 
Initialism for Crystal Maiden or Captain's Mode.
Comeback 
A situation when a team is performing poorly for most of the game, but has managed to fight back, turn the tide of battle and ultimately win the game.
Crow 
Refers to Flying Courier, which upgrades a walking courier. A player asking to "please crow it" is asking for someone to upgrade the courier.
CS 
Initialism for Creep Score, the amount of last hits a player has, or the amount of last hits and denies a player has. In DotA 1, typing -cs would show the player's last hits and denies count. May also refer to "creep steal", the act of stealing last hits from an allied player.
CW 
Initialism for Centaur Warrunner or Clockwerk.
DD 
Initialism for Rune of Double Damage.
Debuff 
A detrimental spell of effect placed on a unit. Refers to the opposite of a buff.
Deny 
Killing your allied unit, hero or building, in order to prevent an enemy from gaining the gold and experience it gives. You can manually attack your own creeps if their health is below 50%, your own towers below if they are below 10%, and heroes affected by certain DoTs. A hero can also be said to be denied if they die to neutral creeps instead of to an enemy unit.
Deward 
The act of removing enemy wards, usually mostly pertaining to Observer Wards.
DGU 
Initialism for "Don't give up".
Dieback 
When a player dies after using their Buyback, resulting in a much longer respawn time and them unable to use it once more as it will most definitely be on cooldown.
Disable 
A catch-all term referring to any spell, ability, or effect that otherwise prevents an enemy hero from moving, casting, and/or attacking, leaving them helpless for a short period of time or in general impeding their ability to act. See the main article for more information.
Dive 
The act of running at a hero that is within their tower range in order to attempt to secure a kill. This can sometimes be a risky play in the early stages of the game when tower damage output is more significant. Dives are usually attempted by more than one hero, but can also be done by a single hero. Also referred to as a "tower dive".
DK 
Initialism for Dragon Knight.
DoT 
Damage over Time, an applied effect that repeatedly inflicts damage at a regular interval for a specific duration. A classic example would be Venomancer's Poison Nova.
DP 
Initialism for Death Prophet.
DPS 
Initialism for damage per second, a measure of the damage dealt by a hero or unit over one second. Usually refers to heroes with consistent damage output over a period rather than heroes with high burst damage periodically.
Dunk 
Refers to either Axe's ultimate, Culling Blade, or Earthshaker's ultimate, Echo Slam.
Durable 
A hero that can take a lot of damage and abuse before dying. See the main article for more information.
Dust 
Abbreviation for the item, Dust of Appearance.
Easy lane
See Safe Lane.
Egg 
Refers to Phoenix' ultimate, Supernova.
Epi 
Refers to Sand King's ultimate, Epicenter.
ES 
Initialism for Earthshaker, Earth Spirit, or Ember Spirit.
ET 
Initialism for Elder Titan.
Exp or XP 
Experience, the resource used by heroes to gain levels, level skills, and otherwise increase in power over the course of a game.
ez 
"Easy", typically said to mock the enemy team's lesser power.
F - J
Fade time 
The amount of time it takes for a unit to become completely invisible following the activating of an invisibility effect.
Farm or Farming 
The process of steadily earning gold and experience by killing lane creeps and/or neutral creeps. This tactic is often slow and tedious, but is usually necessary for Carries to reach their full potential.
FB 
Initialism for First Blood.
Feed 
The act of "feeding" gold and experience to the enemy team by dying repeatedly. Can be intentional or unintentional. Those who feed are called "feeders".
Flash Farming 
A style of farming when you farm your lane creeps until the enemy tower and then proceed to clear neutral creep camps in quick and efficient rotations. This is often the fastest method of farming, but it also allows the enemy to farm your creeps freely by their tower and can be risky, as being near the enemy tower is often an easy gank, or the enemy might know exactly where you are clearing the camp closest to the lane tanking creeps and losing health, etc.
Flash Farming Skill/Ability
An AoE ability that facilitates flash farming by allowing a hero to quickly clear lane creeps or groups of neutrals. The classic flash farming ability is Shadow Fiend's Shadowraze which allows him to mow down wave after wave of creeps with ease. It is common to hit lane creeps below threshold and then clear them all in one shot with the flash farming ability, and then go off to check rune, kill neutrals, etc.
FF 
According to DotA history, "ff" appeared on DotA-League.com (at this time it was for DotA 1), and it means "forfeit". You had to type "-ff" in the game so it can detected and recorded by the dota-league plugin and save the data to your dota-league profile. If all 5 members of one team type it the game would end. Now replaced by calling "gg" if the surrender option is enabled in a lobby. More commonly used as abbreviation to "finish fast", asking enemy team to end the game that's already unwinnable for other side.
Fortification/Fortify 
The Glyph of Fortification is the button in the bottom right hand of the screen that renders all allied buildings invulnerable for 5 seconds on a 5 minute cooldown. If a player says "don't fort", they are telling the team to save the glyph for later. Towers are "fortified" for the 5 seconds that the glyph is active.
FoW 
Initialism for Fog of War. Refers to the portion of the map that is dark and unseen. If you cannot see an area, then it is said to be "fogged". Most of the map is "fogged" by default, such as Roshan's lair, and the enemy base.
Furion 
Nature's Prophet's name in WC3 DotA. Still referenced in the game's files for the hero.
Gank 
Abbreviation for Gang Kill, but over time the term has come to refer to any time a hero or group of heroes attempts to pick off an enemy hero (or enemy heroes) by surprise, but usually with superior forces (such as 2v1).
Gem 
The Gem of True Sight. Most commonly means to either buy a gem, or to be careful of the enemy team as they possess a gem (especially for allied invisible heroes).
gg 
"Good game". Said by either team when they are winning or losing and the match is close to ending, or when a team or a player gives up or claims victory. For example, both teams might say "gg" to congratulate each other after a hard fought match, or a team might call out "gg" after their ally abandons in an assumption that they can no longer win.
ggwp 
"Good game, well played". Used in a similar fashion as "gg", but also congratulates the enemy for skillful play.
gj 
"Good job", typically said to compliment an ally.
glhf 
"Good luck, have fun". Usually said at the start of a game to encourage other players, or just to be courteous.
Golem 
Refers to Mud Golems, Ancient Golems, or Warlock's ultimate summon through Chaotic Offering.
Gs 
Initialism for Grimstroke.
Guinsoo 
A reference to Scythe of Vyse's full name in WC3 DotA, which in turn references one of the developers of the original DotA maps.
Hard Carry 
A type of carry hero which scales incredibly well with items and requires a substantial amount of farm to be effective. Examples of this are Anti-Mage and Spectre. Hard carries are usually weaker than other semi-carry or non-carry heroes at the early-mid stages of the game, when they do not yet have their core items.
Hard Lane 
The lane of either faction where the creep wave meets up furthest from the tower, making those who lane in it more susceptible to ganks (top lane for Radiant, bottom lane for Dire). See Lane.
HH 
Initialism for the item Heaven's Halberd.
Hook 
Refers to either Pudge's signature ability, Meat Hook, or Clockwerk's ultimate, Hookshot.
HP 
Health/hit points, referring to the health a unit has.
inc 
Abbreviation for "incoming", usually referring to enemy players heading to a lane to gank.
Initiation 
The act of starting (initiating) a teamfight. A good initiation can catch the enemy team off-guard, possibly losing quickly to the attacking team. A hero adept at performing this is called an initiator.
Janggo 
Drums of Endurance's former name in WC3 DotA.
Juking 
Running and weaving around trees, fog and other obstacles in such a manner to avoid and possibly escape an enemy.
Jungling 
The process of killing the neutral creeps in the woods (aka "jungle") between the lanes. Killing ancient creeps also counts as jungling, but is more often referred to as ancienting or "clearing ancients". A hero adept at jungling (be it by having summons to absorb the creeps' attacks or having good durability/self-sustain) is termed a jungler.
K - O
Kick 
More commonly used to refer to Earth Spirit's Boulder Smash. May also refer to removing a player from a lobby or game or Tusk's Walrus Kick.
Kite or Kiting 
See pulling. A technique where a hero gets the attention of a hostile unit to draw them away or force them to follow. Can also be used to refer to a period of repeated hit-and-run attacks where the target is kept out of range.
KotL 
Initialism for Keeper of the Light.
KS 
Either means "kill steal" or, usually jokingly, "kill secure". Firstly, it refers to the act of one hero stealing the kill of another hero that could have easily taken it themselves. Secondly, it refers to when another hero "secures" the kill if the carry or another hero could not do so themselves. The latter meaning is also sometimes interchangeably used with the former meaning in a satirical or comedic way.
LC 
Initialism for Legion Commander.
Leoric 
Wraith King's name in WC3 Dota.
Long lane 
See Hard Lane.
Lothar's or Lothar's Edge 
Shadow Blade's name in WC3 DotA.
LS 
Initialism for Lifestealer.
LSA 
Initialism for Lina's ability, Light Strike Array.
Manfight 
Refers to when two heroes, usually melee, continuously attack one another without kiting (see above) until one hero either dies or retreats. Often implies a 1v1 battle without allies of either hero nearby. Heroes such as Ursa and Troll Warlord excel at manfights.
Meatball 
Refers to Invoker's Chaos Meteor.
Micro 
Micromanagement. Refers to (effective) control and usage of multiple units, items, and abilities in quick succession. Bad micromanagement can result in the many units becoming a hindrance or a liability rather than an asset or ability combos being wasted and/or used in the wrong situations, while a good one can easily overpower an enemy in seconds. Meepo, Invoker, and Chen are good examples of heroes who require good "micro".
Mid/Middle 
The middle of the three lanes in the map.
Miss, Missing, or MIA 
Mentions that a particular hero has gone absent (missing) from their lane, and is probably setting up for a gank. Warning missing enemy heroes is crucial to warn your allies of a possible ambush in any of the lanes. "MIA" is an acronym for "Missing In Action".
MK 
Initialism for Monkey King.
MKB 
Initialism for Monkey King Bar.
MP 
Mana points, referring to the amount of mana a unit has.
MS 
Initialism for Movement Speed, or sometimes "missing".
Mute 
Refers either to the effect that prevents item usage, or blocking any means of communication from a player in the game to avoid harassment.
N'aix Bomb 
Refers to when Lifestealer uses his ultimate, Infest, on an allied hero, preferably one with high mobility or with a Blink Dagger (and perhaps a disable), so that Lifestealer bursts out of his ally upon arrival, dealing additional damage and having two heroes down their target.
Necro 
May refer to Necrophos or the Necronomicon. Rarely used to refer to Visage, the Bound Form of Necro'lic.
Nerubian/Nerub 
Refers to Nyx Assassin and Weaver's former titles in WC3 DotA (the Nerubian Assassin and Nerubian Weaver, respectively).
nj 
"Nice job", analogous to gj.
NP 
Initialism for Nature's Prophet or "no problem".
Nuke 
A spell whose purpose is to deal a large amount of damage immediately or in a very short span of time. Heroes adept at nuking are referred to as nukers.
OC 
Initialism for Octarine Core.
OD 
Initialism for Outworld Destroyer.
Offlane 
See Hard Lane.
Offlaner 
A hero sent down to the offlane/hard lane, usually on their own. These heroes usually have good escape abilities or are naturally durable to withstand being alone and facing usually two enemy heroes (and being more vulnerable to ganks).
Omni 
Refers to Omniknight or Juggernaut's ultimate, Omnislash.
OoM 
Initialism for "out of mana".
OoV 
Initialism for Orb of Venom.
Orb effect 
The name used for unique attack modifiers in WC3 DotA.
Orb walk 
Using your orb effect (aka autocast) ability, i.e. Frost Arrows, Searing Arrows, to harass the enemy while moving forward to gain distance on the target and also preventing enemy creep aggro.
P - T
P 
Abbreviation for "push" or "pause". Players saying "p" means they are either suggesting the team to push an enemy lane or asking for a pause.
PA 
Initialism for Phantom Assassin.
Panda/Pandaren 
Brewmaster's former title in WC3 DotA.
Pet 
A creature that a hero can summon or convert to their side. For example, the Eidolons are Enigma's pets. Could also refer to the cosmetic pets that follow your hero in-game.
Pit Lord 
Underlord's former title in WC3 DotA.
PL 
Initialism for Phantom Lancer.
PP 
Initialism for "pause please".
PotM 
Initialism for Mirana, the Princess of the Moon.
Proc 
Short for "Programmed Random Occurence". It refers to the triggering of effects, whether the occurence is random (such as Wraith King's critical strikes on attack) or regular (such as Bristleback's quill spray every 250 damage he takes). When these effects trigger or "process", they are said to "proc". The rate of occurence for random effects is termed the "Proc rate".
Pull or Pulling
A technique where a hero gets the attention of a hostile unit to draw them away or force them to follow. This more usually refers to "creep pulling", which involves pulling lane creeps away from their lane by aggroing nearby creep camps into the chosen lane, which attracts the lane creeps and forces them to fight the aggroed creep camp for some time, which in turn makes the enemy creeps push forward, closer to the player's tower, letting them (or their carry) gain gold and experience underneath the safety of their tower.
QoP 
Initialism for Queen of Pain.
Rat 
A strategy in which heroes avoid 5-man teamfights, and focus on pushing other lanes instead.
Rax 
Abbreviation for barracks.
Recrow 
To reuse the courier after it has finished performing one of its deliveries/tasks. Since the courier can only perform one task for one player at a time, this is mentioned to remind/alert other players that the courier is now free to use.
Reuse 
See recrow, usually mentioned by Southeast Asian players.
Ricer 
Hardcore farmer whose main goal is to be extremely strengthened by the time they come out of their farming.
Ring or RoB/RoH/RoR/RoT 
Refers to Ring of Basilius, Ring of Health, Ring of Regeneration, and Ring of Tarrasque respectively. "Ring" usually refers to one of these, based on context.
RNG 
Random Number Generation, referring to the proc chance of abilities and items such as Bash or Evasion.
Ro3 
See Roshan, usually mentioned by Chinese players.
Roamer 
A hero that jumps between lanes, especially in the early game, in order to gank enemy heroes or defend allies.
Rock 
May refer to Warlock's ultimate: Chaotic Offering, Earth Spirit's Stone Remnants, the Ancient Rock and Granite Golems, or the Mud Golem's ability: Hurl Boulder.
Rosh or RS 
Roshan is a difficult-to-kill neutral creep that drops the Aegis of the Immortal when killed (or Refresher Shard on his third death and afterward). Spawns at the beginning of the game, respawns 8-11 minutes after he is killed.
RP 
Magnus' ultimate, Reverse Polarity.
Safe lane 
The lane of either faction, where the creep wave meets closest to the tower, making farming easier and less risky for those in the lane (bottom lane for Radiant, top lane for Dire). See Lane.
SB 
Initialism for Spirit Breaker or Shadow Blade.
Scepter 
Mostly refers to Aghanim's Scepter, although can also refer to Eul's Scepter of Divinity or Ghost Scepter.
SD 
Initialism for Shadow Demon or "self denial" (see suicide).
SF 
Initialism for Shadow Fiend.
Sheepstick 
Refers to the Scythe of Vyse, since back in WC3 DotA, it actually turned the target into a sheep (unlike in Dota 2, where the hex model is a pig).
Short lane
See Safe Lane.
Silence 
The effect of preventing a unit from casting spells, but can still allows item usage and passive abilities are unaffected. See the main article for more information.
SK 
Initialism for Sand King. May also refer to Wraith King in reference to his former title, Skeleton King, or Terrorblade, in reference to his former title Soul Keeper.
Skillshot 
An ability that requires proper aim and timing to hit an enemy. Some examples would be Sacred Arrow and Powershot.
Smoke 
Refers to either Smoke of Deceit, or Riki's ability Smoke Screen.
Snowballing 
The situation where a hero or team just gets stronger as the games goes on (usually through getting kills) to the point where it is very difficult to stop them, much like a snowball rolling down a hill getting bigger.
SnY 
Initialism for Sange and Yasha.
SOD 
Initialism for Smoke of Deceit.
Solo 
Being the only hero on the lane. A player may call for "solo" to prevent other players from interrupting their lane and splitting experience and gold.
Soul Keeper 
Terrorblade's former title in WC3 DotA.
Spirit Hero 
Shorthand for Ember Spirit, Storm Spirit, and Void Spirit. These three heroes share many attributes, such as typically being played mid, and having versatile kits featuring heavy burst damage and extremely reliable long-range engagement/escape tools. Earth Spirit is generally not included in this definition, as he is more often played as a support.
Squishy 
A hero that can only take relatively little damage or abuse before dying.
SR 
Initialism for status resistance.
SS
See missing. Mentions that a particular hero has gone missing and is probably setting up for a gank, e.g. "Earthshaker ss" means that Earthshaker isn't visible to the team. Also refers to the ultimate of a hero ("special/super skill"), usually mentioned by Southeast Asian players. May also refer to Storm Spirit, Shadow Shaman or Sun Strike.
Stacking
May also be referred to as "creep stacking", it is a technique where a creep camp is aggroed at a certain time before neutral spawning, pulling them away from their camp's area and allowing a new set of creeps to spawn. This can be done multiple times, usually up to three at maximum, then letting their team's carry farm the stacked creeps for a massive gold and experience gain.
Static Farming
A sub-type of farming where you aim to only kill enemy lane creeps when they have low health and deny the allied lane creeps whenever possible, with the goal of maintaining an equilibrium of the lane creeps in a safe position. This is often a slower method of farming, but allows you to farm safely with little risk of dying to a gank.
Stygian
References the Desolator's former full name in WC3 DotA.
Suicide
The act of killing oneself to deny the enemy of gold and experience. May also directly refer to Techies' ability Blast Off! (formerly Suicide Squad, Attack!).
Suicide lane
See Hard Lane.
Summoner
A hero whose primary function is summoning or controlling creeps.
Support
A hero whose primary function is to help their own team through heals, buffs, and detection, or sabotaging the enemy team through disables, slows, or debuffs.
TA 
Initialism for Templar Assassin.
Tank 
See Durable.
Tauren Chieftain (or simply Tauren) 
Elder Titan's former title in WC3 DotA.
TB 
Initialism for Terrorblade.
Teamwipe 
The act of an entire team dying at once.
THD 
Twin Head Dragon, the title of Jakiro.
Throw 
The act of giving a supreme advantage to the supposedly losing team, letting them win the match instead. This can be intentional or unintentional.
Throne 
See Ancient. Remnant of WC3 DOTA in which the Scourge's (Dire) Ancient is using the Frozen Throne model.
Top 
Refers to the top lane of the map. For the Radiant, this is the lane to the North. For the Dire, this is the lane to the West.
TP 
Initialism for Teleport or Town Portal Scroll.
True Sight 
Anything invisible caught in the radius of a True Sight source gets revealed. Towers, a Gem of True Sight, Sentry Wards, Dust of Appearance, and some abilities of certain heroes (such as Slardar's Amplify Damage and Zeus' Lightning Bolt) all provide True Sight. See the main article for more information.
Turnaround 
The act of a team facing the enemy in a situation where they might be expected to run, such as when a team initiates on the other. Instead of running away, they try to turn the battle around and fight back, possibly even winning the battle themselves.
U - Z
UAM 
Initialism for unique attack modifier.
Ult or Ulti
Abbreviation for Ultimate.
Veno 
Abbreviation for Venomancer. Rarely used to refer to Orb of Venom.
VS 
Initialism for Vengeful Spirit or Void Spirit. Less commonly used to mean "versus".
Wards
Items that can be placed almost anywhere on the map and provide vision around the location for a few minutes. May also refer to the unit type of the same name.
WD 
Initialism for Witch Doctor.
Well 
The fountain from either team. Derived from its name in WC3 DotA, "Well of Eternity".
Wipe 
See teamwipe.
WK 
Initialism for Wraith King.
wp 
"Well Played". Usually said by either teams when they are winning or losing and the match is either about to end or when a player did something skillful. Usually paired up with "gg".
WR 
Initialism for Windranger.
WW 
Initialism for Winter Wyvern. It also may refer to "Wind Walk" abilities, which is derived from the common DotA name of the invisibility spells of Bounty Hunter, Clinkz, Shadow Blade and Storm, and the invisibility spells of Nyx Assassin and Invoker (which already had unique names in DotA). All 6 of them are based on the Warcraft 3 spell named "Wind Walk".
Zoning
The act of harassing an enemy hero away from the creep zone in order to prevent them from gaining experience.
"""

# reference : Dota 2 wiki,https://dota2.fandom.com/wiki/Glossary

In [105]:
dota_corpus = corpus_dota.split(" ")

import re

dota_text =[]
for i in dota_corpus:
  dota = re.sub(r"[^a-z0-9]+", "", i.lower())
  dota_text.append(dota)

dota_cor = []
for i in dota_text:
 a = i.split(",")
 dota_cor.append(a)

#reference: comp5046,lab02,https://colab.research.google.com/drive/1-l7gzLZ71ERuJ_ktG1nKTU6G8UuEvseN?usp=sharing
#reference: comp5046,lab05, https://colab.research.google.com/drive/1zO2FYT9J61Lj_b5S_tMH2Uo_FhjK0u18?usp=sharing

In [106]:
#get the matrix by dota corpus
word_emb_model_domain=Word2Vec(sentences=dota_cor,size=100, 
                 window=3,
                 min_count=1, #count in word2vec of word appear at minimum once
                 workers=4,sg=1)
#reference: comp5046,lab02,https://colab.research.google.com/drive/1-l7gzLZ71ERuJ_ktG1nKTU6G8UuEvseN?usp=sharing

In [107]:
EMBEDDING_DIM=141

embedding_matrix=[] 
for word in word_list:
    try:
        word_embed=list(word_emb_model_domain.wv[word])
        word_embed.extend(list(word_2_pos[word]))
        word_embed.extend(list(word_2_parse[word]))
        word_embed.extend([len(word)])
        embedding_matrix.append(word_embed)
        
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)


embedding_matrix=np.array(embedding_matrix)
embedding_matrix.shape

#reference: comp5046,lab02,https://colab.research.google.com/drive/1-l7gzLZ71ERuJ_ktG1nKTU6G8UuEvseN?usp=sharing

(11243, 141)

###3.Model


##3.1 baseline model

In [108]:
torch.manual_seed(1)
#help the model to more readable 
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
#reference: comp5046,lab09, https://colab.research.google.com/drive/1efZZFttmHKXHbQtNzAVHzjf8FUUD9cWR?usp=sharing

In [109]:
class BiLSTM_CRF_base(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF_base, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space. 
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

#reference: comp5046,lab09, https://colab.research.google.com/drive/1efZZFttmHKXHbQtNzAVHzjf8FUUD9cWR?usp=sharing

##3.1.1 Training for baseline model



In [110]:
import numpy as np
from sklearn.metrics import f1_score,accuracy_score
import numpy as np
def cal_acc_base(model, input_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        score, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
        predicted += pred
    f1score=f1_score(ground_truth,predicted,average="micro")
    return predicted,ground_truth, f1score
#reference: comp5046,lab09, https://colab.research.google.com/drive/1efZZFttmHKXHbQtNzAVHzjf8FUUD9cWR?usp=sharing

In [111]:
# concate all input and output index
train_data=train_input_index+val_input_index
train_label=train_output_index+val_output_index

In [112]:
# Set up baseline model 
device = torch.device("cpu" if torch.cuda.is_available() else "cpu") #put all in cpu
HIDDEN_DIM = 70

base_model = BiLSTM_CRF_base(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)#base model
optimizer = optim.SGD(base_model.parameters(), lr=0.01, weight_decay=1e-4)
#reference: comp5046,lab09, https://colab.research.google.com/drive/1efZZFttmHKXHbQtNzAVHzjf8FUUD9cWR?usp=sharing

In [113]:
# Train the baseline model

import datetime

print("="*115)
print("                                   Baseline model training on Validation                          ")
print("="*115)
for epoch in range(2):  
    time1 = datetime.datetime.now()
    train_loss = 0

    base_model.train()#base model
    for i, idxs in enumerate(train_data):
        tags_index = train_label[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        base_model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = base_model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    base_model.eval()
    # Call the cal_acc functions you implemented as required
    _, _, train_acc = cal_acc_base(base_model,train_input_index,train_output_index)
    _, _, val_acc = cal_acc_base(base_model,val_input_index,val_output_index)

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = base_model.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train f1-score: %.4f, val loss: %.2f, val f1-score: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    #reference: comp5046,lab09, https://colab.research.google.com/drive/1efZZFttmHKXHbQtNzAVHzjf8FUUD9cWR?usp=sharing

                                   Baseline model training on Validation                          
Epoch:1, Training loss: 35806.00, train f1-score: 0.9939, val loss: 709.27, val f1-score: 0.9962, time: 577.34s
Epoch:2, Training loss: 2272.02, train f1-score: 0.9989, val loss: 150.67, val f1-score: 0.9995, time: 585.80s


##3.2 model designed

In [114]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

torch.manual_seed(1)
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim , num_layers, use_crf, attention_method):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        #hyper-parameter
        self.num_layers = num_layers
        self.attention_method = attention_method
        self.use_crf = use_crf

        self.general_attention_weight = nn.parameter.Parameter(torch.Tensor(1,self.hidden_dim,self.hidden_dim),requires_grad = True)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True) #bi-direction

        self.dropout = nn.Dropout(0.2)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size) if not attention_method else nn.Linear(hidden_dim*2,self.tagset_size)
        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG],:] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)

        #attention setting
        if self.attention_method:
          lstm_out = torch.squeeze(lstm_out,1)
          left_self = lstm_out.view(1,lstm_out.size(0),lstm_out.size(1))
          right_self = left_self.view(left_self.size(0),left_self.size(2),left_self.size(1))

          if "scale" in self.attention_method.lower():
            weight_att = nn.functional.softmax(torch.bmm(left_self,right_self)/np.sqrt(self.hidden_dim),dim =-1)
          elif"general" in self.attention_method.lower():
            step_one = torch.bmm(left_self,self.general_attention_weight)
            step_two = torch.bmm(step_one,right_self)
            weight_att = nn.functional.softmax(step_two,dim=-1)
          elif"dot product"in self.attention_method.lower():
            weight_att = nn.functional.softmax(torch.bmm(left_self,right_self),dim=-1)
          
          output = torch.bmm(weight_att,left_self)
          concat_output = torch.cat((output,left_self),dim = -1)
          lstm_out = concat_output.view(len(sentence), self.hidden_dim*2)
        else:
          lstm_out = lstm_out.view(len(sentence), self.hidden_dim)


        lstm_out = self.dropout(lstm_out)
        lstm_feats = self.hidden2tag(lstm_out)

        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        if self.use_crf==True: #using crf
           lstm_feats=self._get_lstm_features(sentence)
           score,tag_=self._viterbi_decode(lstm_feats)
           return score,tag_
        else:           #no crf
           lstm_feats = self._get_lstm_features(sentence)
           return lstm_feats,torch.argmax(lstm_feats,-1)
#reference: comp5046,lab09, https://colab.research.google.com/drive/1efZZFttmHKXHbQtNzAVHzjf8FUUD9cWR?usp=sharing

## 3.2.1 Training model with hyper-parameters

In [115]:
import numpy as np
from sklearn.metrics import f1_score,accuracy_score

def cal_acc(model, input_index, output_index,use_crf = True):
    ground_truth = []
    predicted = []
    for x,y in zip(input_index,output_index):
        input_tensor=torch.tensor(x).to(device)
        _,output=model(input_tensor)
        ground_truth.extend(y)
        
        if use_crf==True:
            predicted.extend(output)
        else:
            predicted.extend(list(output.cpu().numpy()))
    f1= torch.tensor(f1_score(ground_truth,predicted,average="micro")).to(device)
    return ground_truth, predicted, f1
    #reference: comp5046,lab09, https://colab.research.google.com/drive/1efZZFttmHKXHbQtNzAVHzjf8FUUD9cWR?usp=sharing

In [116]:
# set up a list of configs
#hyper-parameter
configs = [{ "num_layers":1,"use_crf":False,"attention_method":"general"
    },
    { "num_layers":1, "use_crf":False ,"attention_method":"scale"
    },
    { "num_layers":1, "use_crf":False ,"attention_method":"dot product"     
    },
    { "num_layers":2, "use_crf":False,"attention_method":"general"  
    },
    { "num_layers":2, "use_crf":False,"attention_method":"scale"   
    },
    { "num_layers":2, "use_crf":False,"attention_method":"dot product"     
    },
    { "num_layers":1,"use_crf":True ,"attention_method":"general"       
    },
    { "num_layers":1,"use_crf":True ,"attention_method":"scale"
    },
    { "num_layers":1, "use_crf":True ,"attention_method":"dot product"
    },
    { "num_layers":2, "use_crf":True ,"attention_method":"general"
    },
    { "num_layers":2, "use_crf":True ,"attention_method":"scale"
    },
    { "num_layers":2, "use_crf":True ,"attention_method":"dot product"
    }]

# reference: comp5046,lab11, https://colab.research.google.com/drive/1c4NnDzNmzVJmXolwWdRO1tRhKX6aROWf?usp=sharing

In [117]:
import datetime
from re import A
device=torch.device("cpu" if torch.cuda.is_available() else "cpu")#put all in cpu
 
for config in configs:
    HIDDEN_DIM=70
    vocab_size = len(word_to_ix) # the size of vocabulary
    use_crf=True
    model = BiLSTM_CRF(vocab_size,tag_to_ix, EMBEDDING_DIM,HIDDEN_DIM,config["num_layers"], config["use_crf"], config["attention_method"]).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.015,weight_decay=1e-4)
    loss_func = nn.CrossEntropyLoss()
  

    print("="*111)
    print("          .    number of layers = {}    &    use_crf = {}    &    attention_method = {}    . ".format(config["num_layers"], config["use_crf"], config["attention_method"]))
    print("="*111)


    for epoch in range(2):
      time1 = datetime.datetime.now()
      train_loss = 0
      model.train()
      for i, idxs in enumerate(train_data):
        tags_index = train_label[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)

        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        
        if use_crf:
          loss=model.neg_log_likelihood(sentence_in,targets)
        else:
          lstm_feats,tags=model(sentence_in)
          loss=loss_func(lstm_feats,targets)


        loss.backward()
        optimizer.step()

        train_loss+=loss.item()
      
      
      max_val_acc=0
      model.eval() 
      # Call the cal_acc functions you implemented as required
      _, _, train_acc = cal_acc(model,train_input_index,train_output_index,use_crf=use_crf)
      _, _, val_acc = cal_acc(model,val_input_index,val_output_index,use_crf=use_crf)

      if(val_acc > max_val_acc):
        best_model=model
        max_val_acc=val_acc  
      
      if  use_crf:
        val_loss=0
        for i, idxs in enumerate(val_input_index):
          tags_index = val_output_index[i]
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)
          loss = model.neg_log_likelihood(sentence_in, targets)
          val_loss+=loss.item()
        time2 = datetime.datetime.now()
        print("Epoch:%d, Training loss: %.2f, train f1 score: %.4f, val loss: %.2f, val f1 score: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
        
      else:
        time2 = datetime.datetime.now()
        print("Epoch:%d, Training loss: %.2f, train f1 score: %.4f, val f1 score: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_acc, (time2-time1).total_seconds()))

#reference: comp5046,lab09, https://colab.research.google.com/drive/1efZZFttmHKXHbQtNzAVHzjf8FUUD9cWR?usp=sharing

          .    number of layers = 1    &    use_crf = False    &    attention_method = general    . 
Epoch:1, Training loss: 28911.86, train f1 score: 0.9959, val loss: 470.55, val f1 score: 0.9975, time: 558.59s
Epoch:2, Training loss: 1961.12, train f1 score: 0.9994, val loss: 70.02, val f1 score: 0.9997, time: 543.27s
          .    number of layers = 1    &    use_crf = False    &    attention_method = scale    . 
Epoch:1, Training loss: 28095.83, train f1 score: 0.9957, val loss: 399.62, val f1 score: 0.9976, time: 544.38s
Epoch:2, Training loss: 1734.59, train f1 score: 0.9995, val loss: 62.63, val f1 score: 0.9997, time: 458.83s
          .    number of layers = 1    &    use_crf = False    &    attention_method = dot product    . 
Epoch:1, Training loss: 28439.49, train f1 score: 0.9958, val loss: 399.63, val f1 score: 0.9978, time: 542.22s
Epoch:2, Training loss: 1700.59, train f1 score: 0.9994, val loss: 65.40, val f1 score: 0.9996, time: 456.36s
          .    number of laye

## 4. Evaluation Setup

In [118]:
# model design
y_true,y_pred,_=cal_acc(best_model,val_input_index,val_output_index,use_crf)


def decode_output(output_list):
    ix_to_tag = {v:k for k,v in tag_to_ix.items()}
    return [ix_to_tag[output] for output in output_list]

y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

# generate classification report
from sklearn.metrics import classification_report
print(classification_report(y_true_decode,y_pred_decode,digits=4))
#reference: comp5046,lab09, https://colab.research.google.com/drive/1efZZFttmHKXHbQtNzAVHzjf8FUUD9cWR?usp=sharing

              precision    recall  f1-score   support

           C     0.9994    1.0000    0.9997      1641
           D     0.9975    0.9899    0.9937       398
           O     1.0000    1.0000    1.0000     18985
           P     0.9995    1.0000    0.9997      3936
           S     1.0000    0.9997    0.9998      3322
        SEPA     1.0000    1.0000    1.0000      3603
           T     0.9986    0.9993    0.9990      1469

    accuracy                         0.9998     33354
   macro avg     0.9993    0.9984    0.9989     33354
weighted avg     0.9998    0.9998    0.9998     33354



In [119]:
# baseline model
y_true,y_pred,_=cal_acc(base_model,val_input_index,val_output_index,use_crf)


def decode_output(output_list):
    ix_to_tag = {v:k for k,v in tag_to_ix.items()}
    return [ix_to_tag[output] for output in output_list]

y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

# generate classification report
from sklearn.metrics import classification_report
print(classification_report(y_true_decode,y_pred_decode,digits=4))
#reference: comp5046,lab09, https://colab.research.google.com/drive/1efZZFttmHKXHbQtNzAVHzjf8FUUD9cWR?usp=sharing

              precision    recall  f1-score   support

           C     0.9994    0.9976    0.9985      1641
           D     0.9822    0.9724    0.9773       398
           O     0.9998    1.0000    0.9999     18985
           P     1.0000    1.0000    1.0000      3936
           S     0.9970    1.0000    0.9985      3322
        SEPA     1.0000    1.0000    1.0000      3603
           T     0.9979    0.9932    0.9956      1469

    accuracy                         0.9993     33354
   macro avg     0.9966    0.9947    0.9957     33354
weighted avg     0.9992    0.9993    0.9992     33354



1. Performance Comparison

In [121]:
import pandas as pd
performance = {
    'Model': ['Baseline', 'Model_designed'], #column
    'T-F1': ['0.9992', '0.9998'],
    'T-F1(T)': ['0.9956', '0.9999'],
    'T-F1(S)': ['0.9985', '0.9998'],
    'T-F1(C)': ['0.9985', '0.9997'],
    'T-F1(D)': ['0.9773', '0.9937'],
    'T-F1(P)': ['1.0000', '0.9997'],
    'T-F1(O)': ['0.9999', '1.0000'] }
df = pd.DataFrame(data=performance)
print("="*80)
print(" "*35, "Metrics")
print('-'*80)
print(df)
print("="*80)

                                    Metrics
--------------------------------------------------------------------------------
            Model    T-F1 T-F1(T) T-F1(S) T-F1(C) T-F1(D) T-F1(P) T-F1(O)
0        Baseline  0.9992  0.9956  0.9985  0.9985  0.9773  1.0000  0.9999
1  Model_designed  0.9998  0.9999  0.9998  0.9997  0.9937  0.9997  1.0000


2. Ablation Study - different input embedding model

In [123]:
performance = {
    'Baseline_model': ['POS + Parses + word embedding'], #column
    ' |   0.9957': ['|   0.9989'] }
df = pd.DataFrame(data=performance)
print("="*60)
print(" "*12, "Model"," "*17,"F1 mean score")
print('-'*60)
print(df)
print("="*60)

             Model                   F1 mean score
------------------------------------------------------------
                  Baseline_model  |   0.9957
0  POS + Parses + word embedding  |   0.9989


3. Ablation Study - different attention strategy

In [124]:
import pandas as pd
performance = {
    'No attention method': ['scale', 'general','dot product'], #column
    '      0.9957': ['      0.99975', '      0.99965','      0.999525'], }
df = pd.DataFrame(data=performance)
print("="*50)
print(" "*5, "Attention method"," "*5,"F1 mean score")
print('-'*50)
print(df)
print("="*50)

      Attention method       F1 mean score
--------------------------------------------------
  No attention method          0.9957
0               scale         0.99975
1             general         0.99965
2         dot product        0.999525


4. Ablation Study - different Stacked layer or # of encoder/decoder strategy


In [125]:
import pandas as pd
performance = {
    'Baseline model': ['scale', 'scale', 'general','general','dot product','dot product'], #column
    '      layer = 1': ['     layer = 1', '     layer = 2','     layer = 1','     layer = 2','     layer = 1','     layer = 2'],
    '    0.9957': ['   0.9997', '   0.9996','   0.9997','   0.9998','   0.9996','   0.99945']}
df = pd.DataFrame(data=performance)
print("="*50)
print(" "*5, "Model"," "*10,"strategy"," "*5,"F1 score")
print('-'*50)
print(df)
print("="*50)

      Model            strategy       F1 score
--------------------------------------------------
  Baseline model       layer = 1      0.9957
0          scale       layer = 1      0.9997
1          scale       layer = 2      0.9996
2        general       layer = 1      0.9997
3        general       layer = 2      0.9998
4    dot product       layer = 1      0.9996
5    dot product       layer = 2     0.99945


5. Ablation Study - with/without CRF

In [126]:
import pandas as pd
performance = {
    'baseline without crf': ['model design without crf', 'model design with crf'], #column
    '      0.9957': ['      0.9996', '      0.9996833'], }
df = pd.DataFrame(data=performance)
print("="*50)
print(" "*5, "strategy"," "*15,"F1 mean score")
print('-'*50)
print(df)
print("="*50)

      strategy                 F1 mean score
--------------------------------------------------
       baseline without crf           0.9957
0  model design without crf           0.9996
1     model design with crf        0.9996833


## 5. Testing

In [127]:
# save and load the best model
torch.save(best_model,"best_model.pt") 
model = torch.load("best_model.pt")
#reference: comp5046,lab05, https://colab.research.google.com/drive/1zO2FYT9J61Lj_b5S_tMH2Uo_FhjK0u18?usp=sharing

In [128]:
def predict(model,input_index):
    predicted=[]
    for x in input_index:
        input_tensor=torch.tensor(x).to(device)
        _,output=model(input_tensor)   
        for idx in output:           
            predicted.append(ix_to_tag[idx])
    return predicted  


#reference: comp5046,lab09, https://colab.research.google.com/drive/1efZZFttmHKXHbQtNzAVHzjf8FUUD9cWR?usp=sharing

In [129]:
prediction=predict(best_model,test_input_index)

In [130]:
id = range(len(prediction))
test_prediction={'ID':id,'Predicted':prediction}
df=pd.DataFrame(test_prediction)
df.to_csv('result.csv',index=False)