## Imports and Settings

In [1]:
using Pkg

In [2]:
#Pkg.add("Glob")
#Pkg.add("TextAnalysis")
#Pkg.add("Languages")
#Pkg.add("WordNet")
#Pkg.add("WordTokenizers")

In [3]:
using PyCall
using Conda

In [4]:
using Glob
using TextAnalysis
using Languages
using DataFrames
using WordNet
using WordTokenizers

## Exploring Pipeline

### TextAnalysis

In [5]:
sample_text = "Apple is looking at buying U.K. startup for \$1 billion"
doc = StringDocument(sample_text)

A StringDocument{String}
 * Language: Languages.English()
 * Title: Untitled Document
 * Author: Unknown Author
 * Timestamp: Unknown Time
 * Snippet: Apple is looking at buying U.K. startup for $1 bil

In [6]:
text(doc)

"Apple is looking at buying U.K. startup for \$1 billion"

In [7]:
tokens(doc)

11-element Vector{String}:
 "Apple"
 "is"
 "looking"
 "at"
 "buying"
 "U.K."
 "startup"
 "for"
 "\$"
 "1"
 "billion"

In [8]:
stemmed_doc = StringDocument(sample_text)
stem!(stemmed_doc)
println(text(doc))

Apple is looking at buying U.K. startup for $1 billion


## Read BBC Data

In [9]:
files = Glob.glob("../data/bbc/bbc/**/*.txt")

2225-element Vector{String}:
 "..\\data\\bbc\\bbc\\business\\001.txt"
 "..\\data\\bbc\\bbc\\business\\002.txt"
 "..\\data\\bbc\\bbc\\business\\003.txt"
 "..\\data\\bbc\\bbc\\business\\004.txt"
 "..\\data\\bbc\\bbc\\business\\005.txt"
 "..\\data\\bbc\\bbc\\business\\006.txt"
 "..\\data\\bbc\\bbc\\business\\007.txt"
 "..\\data\\bbc\\bbc\\business\\008.txt"
 "..\\data\\bbc\\bbc\\business\\009.txt"
 "..\\data\\bbc\\bbc\\business\\010.txt"
 "..\\data\\bbc\\bbc\\business\\011.txt"
 "..\\data\\bbc\\bbc\\business\\012.txt"
 "..\\data\\bbc\\bbc\\business\\013.txt"
 ⋮
 "..\\data\\bbc\\bbc\\tech\\390.txt"
 "..\\data\\bbc\\bbc\\tech\\391.txt"
 "..\\data\\bbc\\bbc\\tech\\392.txt"
 "..\\data\\bbc\\bbc\\tech\\393.txt"
 "..\\data\\bbc\\bbc\\tech\\394.txt"
 "..\\data\\bbc\\bbc\\tech\\395.txt"
 "..\\data\\bbc\\bbc\\tech\\396.txt"
 "..\\data\\bbc\\bbc\\tech\\397.txt"
 "..\\data\\bbc\\bbc\\tech\\398.txt"
 "..\\data\\bbc\\bbc\\tech\\399.txt"
 "..\\data\\bbc\\bbc\\tech\\400.txt"
 "..\\data\\bbc\\bbc\\tech\\

In [10]:
bbc_articles = Any[]
for (i,file) in enumerate(files)
    f = open(file, "r")
    lines = readlines(f)
    body = strip(join([strip(line) for line in lines[2:end]]))
    push!(bbc_articles, body)
    close(f)
end

In [11]:
length(bbc_articles)

2225

In [12]:
bbc_articles[1]

"Quarterly profits at US media giant TimeWarner jumped 76% to \$1.13bn (£600m) for the three months to December, from \$639m year-earlier.The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher adver" ⋯ 1975 bytes ⋯ "00m. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe, which it had reported as advertising revenue. It will now book the sale of its stake in AOL Europe as a loss on the value of that stake."

In [13]:
doc = StringDocument(bbc_articles[1])
typeof(doc)

StringDocument{SubString{String}}

In [14]:
sentences = TextAnalysis.sentence_tokenize(Languages.English(), bbc_articles[1])
for sent in sentences[1:3]
    println(sent, "\n")
end

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales.

TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.

Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.Time Warner said on Friday that it now owns 8% of search-engine Google.



In [15]:
ngrams(doc, 2)

Dict{AbstractString, Int64} with 437 entries:
  "quarters. However"     => 1
  "\$ 3.36"               => 1
  "US media"              => 1
  "has to"                => 1
  "music publisher"       => 1
  "search-engine Google." => 1
  "from \$"               => 2
  "TimeWarner said"       => 1
  "film division"         => 1
  "Parsons said."         => 1
  "dip at"                => 1
  "has mixed"             => 1
  "than in"               => 1
  "were slightly"         => 1
  "back of"               => 1
  "preceding three"       => 1
  "stake ."               => 1
  "free to"               => 1
  "\$ 300"                => 1
  "unable to"             => 1
  "SEC. The"              => 1
  "TimeWarner is"         => 1
  "hopes to"              => 1
  "and will"              => 1
  "already offered"       => 1
  ⋮                       => ⋮

## NLTK

In [16]:
#Conda.add("nltk")

In [17]:
@pyimport nltk

In [18]:
nltk.download("punkt")
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amirreza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Amirreza\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


true

In [24]:
sent_tokens = tokens(StringDocument(sentences[1]))
sent_tags = nltk.pos_tag(sent_tokens)
df = DataFrame(Token = [x[1] for x in sent_tags], POS_Tag = [x[2] for x in sent_tags])
rename!(df, Symbol.(["Token", "POS Tag"]))

Unnamed: 0_level_0,Token,POS Tag
Unnamed: 0_level_1,String,String
1,Quarterly,JJ
2,profits,NNS
3,at,IN
4,US,NNP
5,media,NNS
6,giant,JJ
7,TimeWarner,NNP
8,jumped,VBD
9,76,CD
10,%,NN


In [20]:
lemmatizer = nltk.stem.WordNetLemmatizer()

PyObject <WordNetLemmatizer>

In [21]:
println(lemmatizer.lemmatize("rocks"))
println(lemmatizer.lemmatize("corpora"))

rock
corpus


In [22]:
pos_tags = nltk.pos_tag(tokens(doc))

476-element Vector{Tuple{String, String}}:
 ("Quarterly", "JJ")
 ("profits", "NNS")
 ("at", "IN")
 ("US", "NNP")
 ("media", "NNS")
 ("giant", "JJ")
 ("TimeWarner", "NNP")
 ("jumped", "VBD")
 ("76", "CD")
 ("%", "NN")
 ("to", "TO")
 ("\$", "\$")
 ("1.13", "CD")
 ⋮
 ("AOL", "NNP")
 ("Europe", "NNP")
 ("as", "IN")
 ("a", "DT")
 ("loss", "NN")
 ("on", "IN")
 ("the", "DT")
 ("value", "NN")
 ("of", "IN")
 ("that", "DT")
 ("stake", "NN")
 (".", ".")

In [23]:
println([lemmatizer.lemmatize(token) for token in tokens(doc)])

["Quarterly", "profit", "at", "US", "medium", "giant", "TimeWarner", "jumped", "76", "%", "to", "\$", "1.13", "bn", "(", "£", "600", "m", ")", "for", "the", "three", "month", "to", "December", ",", "from", "\$", "639", "m", "year-earlier.The", "firm", ",", "which", "is", "now", "one", "of", "the", "biggest", "investor", "in", "Google", ",", "benefited", "from", "sale", "of", "high-speed", "internet", "connection", "and", "higher", "advert", "sales.", "TimeWarner", "said", "fourth", "quarter", "sale", "rose", "2", "%", "to", "\$", "11.1", "bn", "from", "\$", "10.9", "bn.", "Its", "profit", "were", "buoyed", "by", "one-off", "gain", "which", "offset", "a", "profit", "dip", "at", "Warner", "Bros", ",", "and", "le", "user", "for", "AOL.Time", "Warner", "said", "on", "Friday", "that", "it", "now", "owns", "8", "%", "of", "search-engine", "Google.", "But", "it", "own", "internet", "business", ",", "AOL", ",", "had", "ha", "mixed", "fortunes.", "It", "lost", "464,000", "subscriber", "in", "th