In [45]:
import os
import pandas as pd
import torch
import transformers

In [46]:
torch.cuda.is_available()

True

In [47]:
questions = pd.read_csv(os.path.join("data", "Questions.csv"), nrows=10000)

In [75]:
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,full_text
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,SQLStatement.execute() - multiple queries in o...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,Good branching and merging tutorials for Torto...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,ASP.NET Site Maps\n<p>Has anyone got experienc...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,Function for creating color wheels\n<p>This is...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,Adding scripting functionality to .NET applica...


In [74]:
questions['full_text'] = questions['Title'] + "\n" + questions['Body']

In [76]:
tags = pd.read_csv(os.path.join("data", "Tags.csv"))

In [77]:
tags.head()

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [78]:
tags['Tag'].nunique()

37034

In [79]:
tags['Tag'].value_counts()

javascript           124155
java                 115212
c#                   101186
php                   98808
android               90659
                      ...  
testcasesource            1
google-floodlight         1
iecapt                    1
netfs                     1
docker-windows            1
Name: Tag, Length: 37034, dtype: int64

#### Only take the 100 most common tags

In [80]:
tags_top_100 = tags['Tag'].value_counts().head(100).index

In [81]:
tags_filtered = tags[tags['Tag'].isin(tags_top_100)]

In [82]:
len(tags_filtered)

1755529

### Join tags to questions dataset

In [83]:
# create dataframe with a column for each possible tag and a 1 or 0 depending on whether the ID has that tag
tags_onehot = tags_filtered.assign(values=1).pivot(index='Id', columns='Tag', values="values").fillna(0)

In [84]:
tags_onehot.head()

Tag,.htaccess,.net,ajax,algorithm,android,angularjs,apache,api,arrays,asp.net,...,visual-studio,visual-studio-2010,wcf,web-services,windows,winforms,wordpress,wpf,xcode,xml
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
260,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
330,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
470,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
questions_with_tags = questions.merge(tags_onehot, how='left', on='Id')

In [86]:
questions_with_tags[tags_top_100] = questions_with_tags[tags_top_100].fillna(0)

In [87]:
questions_with_tags.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,full_text,.htaccess,.net,...,visual-studio,visual-studio-2010,wcf,web-services,windows,winforms,wordpress,wpf,xcode,xml
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,SQLStatement.execute() - multiple queries in o...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,Good branching and merging tutorials for Torto...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,ASP.NET Site Maps\n<p>Has anyone got experienc...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,Function for creating color wheels\n<p>This is...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,Adding scripting functionality to .NET applica...,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
number_of_tags = questions_with_tags[tags_top_100].sum(axis=1)

In [89]:
number_of_tags.value_counts()

1.0    4333
2.0    2667
0.0    2260
3.0     644
4.0      87
5.0       9
dtype: int64

#### Observation: Most questions have a tag

# Baseline

if tag is mentioned literally in text, predict yes, otherwise no

In [90]:
questions_baseline = questions_with_tags.copy()
questions_baseline[tags_top_100] = 0

In [94]:
for tag in tags_top_100:
    questions_baseline[tag] = questions_baseline['full_text'].str.contains(tag, regex=False)

In [97]:
number_of_tags = questions_baseline[tags_top_100].sum(axis=1)

In [98]:
number_of_tags.value_counts()

3     3089
2     2726
4     2073
5     1114
6      537
7      224
8      112
1       48
9       37
10      26
11       8
12       2
14       2
13       1
0        1
dtype: int64

In [102]:
for text in questions_baseline.loc[questions_baseline[tags_top_100].sum(axis=1) == 14, 'full_text'].values:
    print(text)
    print("======================")

is SFig language syntax efficient and clear (and better than Spring-Framework's XML DSL)?
<p><strong>ADDENDUM EDIT:</strong></p>

<blockquote>
  <p>Have not accepted an answer to this as
  there has not been any feedback from
  experienced Spring Framework
  developers.</p>
</blockquote>

<p>I've been working on a replacement DSL to use for Spring-Framework applicationContext.xml files (where bean initialization and dependency relationships are described for loading up into the Spring bean factory).</p>

<p>My motivation is that I just flat out don't like Spring's use of XML for this purpose nor do I really like any of the alternatives that have been devised so far. For various reasons that I won't go into, I want to stay with a declarative language and not some imperative scripting language such as Groovy.</p>

<p>So I grabbed the ANTLR parser tool and have been devising a new bean factory DSL that I've dubbed SFig. Here's a link that talks more about that:</p>

<p><a href="http://dob

In [103]:
questions_baseline.loc[questions_baseline[tags_top_100].sum(axis=1) == 14, 'full_text']

5551    is SFig language syntax efficient and clear (a...
8762    Jquery append using multiline\n<p>I have been ...
Name: full_text, dtype: object

In [104]:
for tag in tags_top_100:
    if questions_baseline.loc[5551, tag] == 1:
        print(tag)

java
php
sql
c
r
regex
xml
string
spring
apache
list
rest
file
class
