In [1]:
from fastai.text import * 
from fastai import *

In [2]:
# Folder holding the WikiText-103 token files read below. The path is relative, so it is
# resolved against the notebook's working directory.
# NOTE: `path` is rebound to a different dataset folder in a later cell.
path = Path('wikitext-103')

In [3]:
def istitle(line):
    """Tell whether `line` is a top-level article heading such as ' = Some Title = '.

    The text between the two ' = ' delimiters may not contain '=', so nested
    section headings (' = = Section = = ') are rejected. A single trailing
    newline is tolerated because `$` also matches just before it.
    """
    return re.search(r'^ = [^=]* = $', line) is not None

In [4]:
def process_unk(s):
    """Swap the literal '<unk>' marker for the `UNK` token; return any other string unchanged.

    `UNK` is not defined in this notebook; it is expected to come from the
    star imports at the top.
    """
    if s == '<unk>':
        return UNK
    return s

In [5]:
def read_file(filename):
    """Split a WikiText token file into articles.

    The file is read as UTF-8 and cut every time a ' \\n' line is followed by a
    top-level title line (see `istitle`). Each article keeps its own lines
    verbatim, newlines included.

    Returns a 1-D numpy array with one string per article. An empty file yields
    an array holding a single empty string.
    """
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    n_lines = len(lines)
    articles, buffer = [], []
    for idx, text in enumerate(lines):
        buffer.append(text)
        # The current line closes an article when the next line is the ' \n'
        # separator and the one after it is a top-level title.
        closes_article = (idx < n_lines - 2
                          and lines[idx + 1] == ' \n'
                          and istitle(lines[idx + 2]))
        if closes_article:
            articles.append(''.join(buffer))
            buffer = []
    # Whatever is left over (possibly nothing) forms the final article.
    articles.append(''.join(buffer))
    return np.array(articles)

In [6]:
# Read the three WikiText-103 splits; each one becomes an array with one string per article.
train, valid, test = [
    read_file(path/fname)
    for fname in ('wiki.train.tokens', 'wiki.valid.tokens', 'wiki.test.tokens')
]

In [7]:
# Number of articles found in each split.
len(train), len(valid), len(test)

(28476, 60, 60)

In [8]:
# The array is one-dimensional: one entry per article.
train.shape

(28476,)

In [9]:
# Peek at the first training article (raw text, title line included).
train[0]

' \n = Valkyria Chronicles III = \n \n Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series new

In [10]:
# Stack all splits into a single frame with the validation articles FIRST: the databunch cell
# further down picks the validation set by position (rows 0-59), which only works with this order.
# Because `test` is concatenated as well, its articles end up in the training portion of that split.
all_texts = np.concatenate([valid, train,test])
df = pd.DataFrame({'texts':all_texts})
df.head()

Unnamed: 0,texts
0,\n = Homarus gammarus = \n \n Homarus gammaru...
1,\n = Frank Headlam = \n \n Air Vice Marshal F...
2,\n = M @-@ 82 ( Michigan highway ) = \n \n M ...
3,\n = Shikamaru Nara = \n \n Shikamaru Nara ( ...
4,"\n = Meridian , Mississippi = \n \n Meridian ..."


In [11]:
# The per-split arrays are not used again after `df` has been built; drop the names.
del train, valid, test

In [13]:
# Build the language-model databunch from the `texts` column of `df`.
# split_by_idx(range(0,60)) sends rows 0-59 to validation; those are the 60 `valid` articles,
# which were concatenated first when `df` was built.
# NOTE(review): 60 is hard-coded (it equals len(valid) as displayed earlier) and `valid` has
# already been deleted at this point, so this number must be kept in sync by hand.
data = (TextList.from_df(df, path, cols='texts')
                .split_by_idx(range(0,60))
                .label_for_lm()
                .databunch())
# Persist the processed databunch; presumably it is written under `path`, since the next cell
# loads it back from there — confirm the exact location.
data.save()

In [15]:
# Reload the saved databunch with a batch size of 80 and look at a few tokenized samples.
data = TextLMDataBunch.load(path, bs=80)
data.show_batch()

idx,text
0,xxbos \n = xxmaj alfred xxup e. xxmaj montgomery = \n \n xxmaj vice xxmaj admiral xxmaj alfred xxmaj eugene xxmaj montgomery ( 12 xxmaj june 1891 – 15 xxmaj december 1961 ) was an officer in the xxmaj united xxmaj states xxmaj navy who served in xxmaj world xxmaj war i and xxmaj world xxmaj war xxup ii . a graduate of the xxmaj naval xxmaj academy
1,", mean temperatures range from 24 ° c ( 75 ° f ) in xxmaj march to 18 ° c ( 64 ° f ) in xxmaj may . xxmaj in the central parts of the country , mean temperatures in xxmaj march are between 18 and 22 ° c ( 64 and 72 ° f ) , dropping to 10 and 14 ° c ( 50 and 57 °"
2,"pair of scissors , xxmaj homer and xxmaj marge are the prime suspects in the murder , even though they witnessed a man with braces leaving the murder scene , with xxmaj mrs. xxmaj bellamy 's necklace . xxmaj the people of xxmaj springfield are very suspicious of xxmaj homer and xxmaj marge , and xxmaj chief xxmaj wiggum does not believe their story . xxmaj finally , during an"
3,"moving away from home to undertake life - cycle service , which was necessary so that they could build up skills and capital that would enable them to marry and create a separate household . xxmaj lower down in society boys might be apprenticed to a trade , or become agricultural servants . xxmaj girls might go into domestic or agricultural service . xxmaj for those higher up in society"
4,"society 's first chairman , was put into service on 6 xxmaj may 1991 . xxmaj during this period further expansions of both xxmaj xxunk xxmaj wharf and xxmaj xxunk stations were undertaken and xxmaj xxunk station , which had been rebuilt as part of the xxmaj xxunk xxmaj xxunk extension in the 1970s , was expanded further to provide an additional attraction at the eastern end of the line"


In [16]:
# Pretrain a QRNN language model on WikiText-103 (emb_sz=400, nh=1150, nl=3) for a single
# one-cycle epoch with a learning rate of 5e-3.
# NOTE(review): drop_mult=0. presumably turns dropout off entirely and clip=0.12 presumably
# enables gradient clipping — confirm both against the installed fastai version.
learn = language_model_learner(data, drop_mult=0., emb_sz=400, nh=1150, nl=3, qrnn=True, clip=0.12)
learn.fit_one_cycle(1,5e-3, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy
1,3.565936,3.500393,0.381790


In [17]:
# Save the pretrained weights under the name 'qrnn_maj'; the fine-tuning section below loads a
# `qrnn_maj.pth` checkpoint.
learn.save('qrnn_maj')

In [18]:
# Switch to the dataset used for fine-tuning.
# NOTE(review): this rebinds `path` (previously the WikiText-103 folder) to a machine-specific
# absolute location, so the notebook cannot run elsewhere without editing this line.
path = Path('/home/ubuntu/data/autopsy/milestone/suicide_homicide_accident/no_preprocessing')

In [19]:
# Inspect the raw fine-tuning CSV.
# NOTE: `df` is rebound here; the WikiText frame built earlier is no longer reachable by this name.
df = pd.read_csv(path/'data.csv')
df.head()

Unnamed: 0,label,text,is_valid
0,Accident,\n.MICRO.\nNONE\n.LOCATIONExtremities\n.INJURY...,False
1,Accident,\n.MICRO.\nHeart - no significant histopatholo...,False
2,Suicide,\n.MICRO.\nNONE\n.LOCATIONNeck\n.INJURYHanging...,False
3,Homicide,\n.MICRO.\nHEART: The sections of heart demons...,False
4,Homicide,\n.MICRO.\nHeart: No diagnostic abnormality. ...,False


In [20]:
# Build the fine-tuning language-model databunch straight from the CSV, relying on from_csv's
# defaults for column selection and for the train/validation split.
# NOTE(review): the CSV has an `is_valid` column, but nothing here passes it explicitly —
# confirm how from_csv actually chooses the validation set in the installed fastai version.
data_lm = TextLMDataBunch.from_csv(path, 'data.csv')

In [21]:
# Sanity-check how the fine-tuning texts were tokenized.
data_lm.show_batch()

idx,text
0,"xxbos \n xxup .micro . \n xxup none \n .locationhead \n .injuryblunt injury . \n xxup .injurydesc . \n xxmaj external injuries include abrasions and contusions of the face and sides of the head . \n\n xxmaj there is visible deformation of the head and face with extensive palpable fractures of the mandible and maxillofacial , basilar , and calvarial skull . \n\n .locationtrunk \n .injuryblunt injury . \n xxup"
1,"number of glomeruli without significant sclerosis or inflammation . xxmaj the tubules have moderate autolytic change without inflammation , tubule drop out or fibrosis . xxmaj no significant polarizable material is present . \n\n xxmaj the heart has mild interstitial fibrous tissue , predominantly in a section of right ventricle . xxmaj the cardiac myocytes are markedly autolyzed . xxmaj the myocardial vessels are patent without significant medial hypertrophy or"
2,"xxmaj on the left side of the back is a 0.5 cm defect that is 5 cm left of midline and 39.5 cm from the top of the head . xxmaj the edges of the wound are not abraded . \n\n xxmaj bullet : xxmaj none , exited the body . \n\n xxmaj trajectory : xxmaj primarily front to back . \n\n xxmaj clothing : a 3 x 2.5 cm"
3,â½ x 3 / 8 inch individually . \n\n xxmaj on the right anterior upper arm are three scattered discrete brown - yellow contusions measuring individually a maximum of xxup â¾ x xxup â½ inches . xxmaj on the right posterior upper arm is a purple - yellow 2 x 1 inch contusion . xxmaj on the posterior right forearm is a brown contusion with two semicircular arches and central
4,". \n\n xxmaj on the medial right thigh is a 1 cm , round , dark purple contusion . \n\n .locationhead \n .injuryblunt injury . \n xxup .injurydesc . \n xxmaj located on the left temporal - parietal scalp is a 3.5 x 1.5 cm red dried abrasion . \n\n .locationback \n .injuryblunt injury . \n xxup .injurydesc . \n xxmaj on the right middle back is a 5 x"


In [34]:
# Alternative kept for reference: initialise from the weights at URLs.WT103_1 instead of the
# QRNN checkpoint pretrained earlier in this notebook.
#learn = language_model_learner(data_lm, pretrained_model=URLs.WT103_1, qrnn=True)
# Fresh QRNN learner on the fine-tuning data; the pretrained weights are loaded in the next cell.
# NOTE(review): emb_sz/nh/nl are left at their defaults here, while pretraining set them
# explicitly (400/1150/3) — confirm the defaults match, otherwise the checkpoint will not fit.
learn = language_model_learner(data_lm, qrnn=True)


In [35]:
# Initialise the learner from the WikiText-103 QRNN checkpoint: first argument is the weights
# file, second is the pickled vocabulary (itos) that goes with it.
# NOTE(review): both locations are machine-specific absolute paths.
learn.load_pretrained(
    '/home/ubuntu/milestone/qrnn-pretrained/wikitext-103/models/qrnn_maj.pth',
    '/home/ubuntu/milestone/qrnn-pretrained/wikitext-103/tmp/itos.pkl',
)
# Freeze before the first fine-tuning stage (presumably only the last layer group stays trainable).
learn.freeze()


In [28]:
# Stage 1: two one-cycle epochs at a learning rate of 1e-2 on the frozen learner.
# NOTE(review): the execution counts around here are out of order (In [34]/[35] come before
# In [28]); re-run the notebook top to bottom to confirm the reported numbers.
learn.fit_one_cycle(2, 1e-2, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy
1,3.095180,2.534037,0.464198
2,2.834910,2.376500,0.489287


In [29]:
# Stage 2: freeze_to(-2) presumably leaves the last two layer groups trainable. Train one epoch
# with the learning rate given as a slice from 1e-2/2.6**4 up to 1e-2.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy
1,2.430303,2.111371,0.532607


In [30]:
# Stage 3: freeze_to(-3) presumably leaves the last three layer groups trainable. Train one
# epoch with the upper learning rate halved: a slice from 5e-3/2.6**4 up to 5e-3.
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy
1,2.272071,2.018871,0.546170


In [31]:
# Final stage: unfreeze the whole model and train four one-cycle epochs at a learning rate of 1e-2.
learn.unfreeze()
learn.fit_one_cycle(4, 1e-2, moms=(0.8,0.7))


epoch,train_loss,valid_loss,accuracy
1,2.057879,1.869817,0.569868
2,1.904340,1.731161,0.594239
3,1.782588,1.637367,0.611235
4,1.671316,1.609835,0.616658
