# Detect Program Language

- using Spacy

# 1)- Importing key modules

In [0]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [0]:
# For data processing and maths
import numpy as np
import pandas as pd
import time
import math
import os
#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [0]:
# For text we shall use Spacy

import spacy 
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

In [4]:
! python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [5]:
! pip install version_information



In [6]:
# first install: pip install version_information
%reload_ext version_information
%version_information pandas,spacy,numpy,seaborn, matplotlib

Software,Version
Python,3.6.8 64bit [GCC 8.3.0]
IPython,5.5.0
OS,Linux 4.14.137+ x86_64 with Ubuntu 18.04 bionic
pandas,0.25.3
spacy,2.1.9
numpy,1.17.4
seaborn,0.9.0
matplotlib,3.1.1
Sun Nov 24 00:10:09 2019 UTC,Sun Nov 24 00:10:09 2019 UTC


# 2)- Loading Data

In [0]:
df = pd.read_csv("Questions.csv", nrows=1_000_00, encoding="ISO-8859-1")

In [8]:
df.shape

(100000, 7)

In [9]:
df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [0]:
df= df.loc[: , ['Id','Title']]

In [11]:
df.head()

Unnamed: 0,Id,Title
0,80,SQLStatement.execute() - multiple queries in o...
1,90,Good branching and merging tutorials for Torto...
2,120,ASP.NET Site Maps
3,180,Function for creating color wheels
4,260,Adding scripting functionality to .NET applica...


In [12]:
df.shape

(100000, 2)

# 3)- Exploring data

In [13]:
# checking full title text

df.Title[4]

'Adding scripting functionality to .NET applications'

In [14]:
df.Title[14]

'Use SVN Revision to label build in CCNET'

In [0]:
# another way to check 
titles = [_ for _ in df['Title']]

In [16]:
titles[5:10]

['Should I use nested classes in this case?',
 'Homegrown consumption of web services',
 'Deploying SQL Server Databases from Test to Live',
 'Automatically update version number',
 'Visual Studio Setup Project - Per User Registry Settings']

In [17]:
# use random library: ALternative df.head()
import random
random.choices(titles, k=5)

['Why limit WCF ServiceContracts to 10-20 OperationContracts?',
 'Stack overflow error when using JQuery/ASP.NET for simple ajax chat',
 'SSRS 2008 Dynamic Columns in a Matrix Report?',
 'MonoRail redirect to # anchor',
 'How to spin an android icon on its center point?']

In [18]:
# same can be done. But, it wont be random

df.Title[:5]

0    SQLStatement.execute() - multiple queries in o...
1    Good branching and merging tutorials for Torto...
2                                    ASP.NET Site Maps
3                   Function for creating color wheels
4    Adding scripting functionality to .NET applica...
Name: Title, dtype: object

### working with spacy module

For properties in Spacy

- https://spacy.io/usage/linguistic-features

In [0]:
import spacy 

nlp = spacy.load("en_core_web_sm") # preload model

In [20]:
nlp("Cat eats mouse")

Cat eats mouse

In [21]:
type(nlp("Cat eats mouse"))

spacy.tokens.doc.Doc

Type of object is  document not a string

In [22]:
# apply list comprehension
[t for t in nlp("Cat eats mouse.")]

[Cat, eats, mouse, .]

In [0]:
doc = nlp("Cat eats mouse.")

In [24]:
doc

Cat eats mouse.

In [25]:
# query 1st element
t = doc[0]
t

Cat

In [26]:
type(t)

spacy.tokens.token.Token

So, we can see that document consists of tokens

In [27]:
# properties of tokens t. tab shows em

t.lower_ # to check lower case

'cat'

In [28]:
doc[1].lemma_ # to check lemmatization of eats 

'eat'

### visualization with SPacy

In [31]:
from spacy import displacy

displacy.render(doc, style= 'dep', jupyter= True)

In [0]:
#displacy.serve(doc, style="dep")

In [32]:
for t in doc:
    print(t, t.dep_)

Cat nsubj
eats ROOT
mouse dobj
. punct


In [33]:
# to check what are abbreviations are

spacy.explain("dobj")

'direct object'

In [34]:
# adding part of speech as well

for t in doc:
    print(t,t.pos_, t.dep_)

Cat PROPN nsubj
eats VERB ROOT
mouse NOUN dobj
. PUNCT punct


In [35]:
spacy.explain("nsubj")

'nominal subject'

In [36]:
spacy.explain("PROPN")

'proper noun'

# 4)- Problem Statement

Detecting programming language

In [37]:
# We shall check "go" language: https://golang.org/
def has_golang(text):
    return "go" in text

# USe generator trick
g = (title for title in titles if has_golang(title))
[next(g) for i in range(5)] # for only 5 instances

['My website got hacked... What should I do?',
 "DVCS Choices - What's good for Windows?",
 'Is a "Confirm Email" input good practice when user changes email address?',
 'Any good advice on using emacs for C++ project?',
 'What is a good way to denormalize a mysql database?']

As for example,  got hacked in 1st sentence has "go" string but, it does not stop there. What if we use [go and space]

In [38]:
# We shall check "go" language: https://golang.org/
def has_golang(text):
    return "go " in text

# USe generator trick
g = (title for title in titles if has_golang(title))
[next(g) for i in range(5)] # for only 5 instances

['Is there a good, free WYSIWYG editor for creating HTML using a Django template?',
 'How to get Django AutoFields to start at a higher number',
 'Where does Console.WriteLine go in ASP.NET?',
 'Should try...catch go inside or outside a loop?',
 'Way to go from recursion to iteration']

Look at case of "Django" where word ends with go but , it does not refer to go language. So we use [space go space]

In [39]:
# We shall check "go" language: https://golang.org/
def has_golang(text):
    return " go " in text

# USe generator trick
g = (title for title in titles if has_golang(title))
[next(g) for i in range(5)] # for only 5 instances

['Where does Console.WriteLine go in ASP.NET?',
 'Should try...catch go inside or outside a loop?',
 'Way to go from recursion to iteration',
 'When are API methods marked "deprecated" actually going to go away?',
 'How to go to main stack']

Now we have correct strings with word "go"<br>
Do we have "go" as programming language? And how could we make sense of it? How to check if it is noun or verb or what is context of this word?

**Basic string matching is not the answer of this problem**

In [40]:
# using spacy to see beyond string matching (https://spacy.io/usage/linguistic-features)

# adding Part of Speech i.e pos
for t in nlp("Where does Console.WriteLine go in ASP.NET?"):
    print(t, t.pos_, t.dep_)

Where ADV advmod
does VERB ROOT
Console PROPN nsubj
. PUNCT punct
WriteLine PROPN nsubj
go VERB ROOT
in ADP prep
ASP.NET PROPN pobj
? PUNCT punct


In [0]:
doc2=nlp("Where does Console.WriteLine go in ASP.NET?")

In [42]:
displacy.render(doc2, style= 'dep', jupyter= True)

This is actually an example where "go" is not used as programming language. We can see how spacy has used it as "verb" and "root" case.

### Filtering our data as per problem statement

We shall only take data that contains "go". And we shall use lower case so that all 'go's are in text are taken.

In [0]:
df = (pd.read_csv("Questions.csv", nrows=1_000_00, 
                  encoding="ISO-8859-1", usecols=['Title', 'Id']))

In [47]:
df.shape

(100000, 2)

In [48]:
df.head()

Unnamed: 0,Id,Title
0,80,SQLStatement.execute() - multiple queries in o...
1,90,Good branching and merging tutorials for Torto...
2,120,ASP.NET Site Maps
3,180,Function for creating color wheels
4,260,Adding scripting functionality to .NET applica...


In [0]:
# for only "go" filer
titles = [_ for _ in df.loc[lambda d: d['Title'].str.lower().str.contains("go")]['Title']]

In [0]:
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# 5)- Re-define function 

In [51]:
def has_golang(doc):
    for t in doc:
        if t.lower_ in ["go", "golang"]: # from domain knowledge, we know that prog. language go could be used in these two forms.
            if t.pos_ != "VERB":
                return True 
    return False

g = (doc for doc in nlp.pipe(titles) if has_golang(doc))
[next(g) for i in range(10)]

[Deploying multiple Java web apps to Glassfish in one go,
 Removing all event handlers in one go,
 How to Create a Dropdown List Hyperlink without the GO button?,
 How do I disable multiple listboxes in one go using jQuery?,
 Embedding instead of inheritance in Go,
 Shared library in Go?,
 multi package makefile example for go,
 What's the point of having pointers in Go?,
 Simulate a tcp connection in Go,
 <canvas> Go/Baduk/Weiqi Game Board]

Still couple of issues. "in one go" is not referring to any language 

In [53]:
displacy.render(nlp("Removing all event handlers in one go"), style= 'dep', jupyter= True)

In [54]:
for t in nlp("Removing all event handlers in one go"):
    print(t, t.pos_, t.dep_)

Removing VERB ROOT
all DET det
event NOUN compound
handlers NOUN dobj
in ADP prep
one NUM pobj
go NOUN aux


In [55]:
spacy.explain("aux")

'auxiliary'

In [56]:
spacy.explain("pobj")

'object of preposition'

### another attempt

We shall assume if our word go has been used as pobj then it is fine else we shall consider it wrong estimation

In [57]:
def has_golang(doc):
    for t in doc:
        if t.lower_ in ["go", "golang"]: # from domain knowledge, we know that prog. language go could be used in these two forms.
            if t.pos_ != "VERB":
              if t.dep_== "pobj":
                return True 
    return False

g = (doc for doc in nlp.pipe(titles) if has_golang(doc)) # nlp.pipe is more efficient in computing 
[next(g) for i in range(5)]

[Embedding instead of inheritance in Go,
 Shared library in Go?,
 multi package makefile example for go,
 What's the point of having pointers in Go?,
 Simulate a tcp connection in Go]

In [58]:
displacy.render(nlp("What's the point of having pointers in Go?"), style= 'dep', jupyter= True)

### And bit more improvement

In [61]:
def has_golang(doc):
    for t in doc:
        if t.lower_ in ["go", "golang"]: # from domain knowledge, we know that prog. language go could be used in these two forms.
            if t.pos_ == "NOUN": # as we have noun as of language
              if t.dep_== "pobj":
                return True 
    return False

g = (doc for doc in nlp.pipe(titles) if has_golang(doc)) # nlp.pipe is more efficient in computing 
[next(g) for i in range(2)]

[multi package makefile example for go,
 What's the simplest way to edit conflicted files in one go when using git and an editor like Vim or textmate?]

We still have in one go. So some stuff is still need further improvement

# 6)- Using Tag dataset

In [62]:
df_tags = pd.read_csv("Tags.csv")
df_tags.head()

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [63]:
df_tags.shape

(3750994, 2)

In [0]:
# only use those ids with go

go_ids = df_tags.loc[lambda d: d['Tag'] == 'go']['Id']

In [65]:
go_ids[:5]

98267     1724680
98367     1726130
98457     1727250
100482    1757090
101172    1766720
Name: Id, dtype: int64

In [66]:
# lets check our main dataframe

df.head()

Unnamed: 0,Id,Title
0,80,SQLStatement.execute() - multiple queries in o...
1,90,Good branching and merging tutorials for Torto...
2,120,ASP.NET Site Maps
3,180,Function for creating color wheels
4,260,Adding scripting functionality to .NET applica...


We shall only take those Ids from our main dataframe that have "go". We have stored them in go_ids from df_tags.

In [0]:
def has_go_token(doc):
    for t in doc:
        if t.lower_ in ['go', 'golang']:
            if t.pos_ != 'VERB':
                return True
    return False

all_go_sentences = df.loc[lambda d: d['Id'].isin(go_ids)]['Title'].tolist()
detectable = [d.text for d in nlp.pipe(all_go_sentences) if has_go_token(d)]

non_detectable = (df
                  .loc[lambda d: ~d['Id'].isin(go_ids)]
                  .loc[lambda d: d['Title'].str.lower().str.contains("go")]
                  ['Title']
                  .tolist())

non_detectable = [d.text for d in nlp.pipe(non_detectable) if has_go_token(d)]

In [68]:
# shows all cases where we have "go"
all_go_sentences

['Go language benchmarks?',
 'Go code contribution: license and patent implications?',
 'Embedding instead of inheritance in Go',
 'Shared library in Go?',
 'multi package makefile example for go',
 "What's the point of having pointers in Go?",
 'Simulate a tcp connection in Go',
 'exec.Run and argv problem',
 'Trouble reading from a socket in go',
 "basic json > struct question ( using 'Go')",
 "Google's 'go' and scope/functions",
 'How does Go compile so quickly?',
 "For a struct vertex, what's the difference between map[int]vertex and map[int]*vertex?",
 'More idiomatic way of adding channel result to queue on completion',
 'Reading utf8-encoded data from a connection, using Go',
 'Checking if a channel has a ready-to-read value, using Go',
 "What does it mean by a 'systems language'?"]

In [69]:
# those instances where we can detect "go"
detectable

['Embedding instead of inheritance in Go',
 'Shared library in Go?',
 'multi package makefile example for go',
 "What's the point of having pointers in Go?",
 'Simulate a tcp connection in Go']

In [70]:
non_detectable

['Deploying multiple Java web apps to Glassfish in one go',
 'Removing all event handlers in one go',
 'How to Create a Dropdown List Hyperlink without the GO button?',
 'How do I disable multiple listboxes in one go using jQuery?',
 '<canvas> Go/Baduk/Weiqi Game Board',
 "How to listen for iPhone keyboard action/touch (ex, 'GO', 'Search', etc)",
 'SOAPUI & Groovy Scripts, executing multiple SQL statements in one go',
 "What's the simplest way to edit conflicted files in one go when using git and an editor like Vim or textmate?",
 'Import large chunk of data into Google App Engine Data Store at one go',
 'How many records can be loaded into Salesforce using Apex Data Loader in one go?',
 'How can I run multiple inserts with NHibernate in one go?']

In [71]:
len(all_go_sentences), len(detectable), len(non_detectable)

(17, 5, 11)

# 7)- Checking Scores

In [0]:
model_name = "en_core_web_sm"
model = spacy.load(model_name, disable=["ner"])

def has_go_token(doc):
    for t in doc:
        if t.lower_ in ["go", "golang"]:
            if t.pos_ != "VERB":
                return True
    return False

In [0]:
method = "not-verb-but-pobj"

In [0]:
correct = sum(has_go_token(doc) for doc in model.pipe(detectable))
wrong = sum(has_go_token(doc) for doc in model.pipe(non_detectable))
precision = correct/(correct + wrong)
recall = correct/len(detectable)
accuracy = (correct + len(non_detectable) - wrong)/(len(detectable) + len(non_detectable))

In [75]:
precision

0.3125

In [76]:
accuracy

0.3125

In [77]:
recall

1.0

In [78]:
correct

5

In [79]:
wrong

11

### Method 2: not-verb with same model

In [80]:
model_name = "en_core_web_sm"
model = spacy.load(model_name, disable=["ner"])

def has_go_token(doc):
    for t in doc:
        if t.lower_ in ["go", "golang"]:
            if t.pos_ != "VERB":
                return True
    return False

method = "not-verb"

correct = sum(has_go_token(doc) for doc in model.pipe(detectable))
wrong = sum(has_go_token(doc) for doc in model.pipe(non_detectable))
precision = correct/(correct + wrong)
recall = correct/len(detectable)
accuracy = (correct + len(non_detectable) - wrong)/(len(detectable) + len(non_detectable))

f"{precision},{recall},{accuracy},{model_name},{method}"

'0.3125,1.0,0.3125,en_core_web_sm,not-verb'

Same can be done with different methods i.e en_core_web_md, en_core_web_lg . Also we might use other combinations of methods with them to see which one gives the best score.