In [8]:
import pandas as pd

In [9]:
pdf = pd.DataFrame({
        'texts': [['I', 'like', 'playing', 'basketball'],
                 ['I', 'like', 'coding'],
                 ['I', 'like', 'machine', 'learning', 'very', 'much']]
    })

In [13]:
df = spark.createDataFrame(pdf)
df.show(truncate=False)

+----------------------------------------+
|texts                                   |
+----------------------------------------+
|[I, like, playing, basketball]          |
|[I, like, coding]                       |
|[I, like, machine, learning, very, much]|
+----------------------------------------+



In [22]:
from pyspark.ml.feature import NGram
from pyspark.ml import Pipeline
ngrams = [NGram(n=n, inputCol='texts', outputCol=str(n)+'-grams') for n in [2,3,4]]

# build pipeline model
pipeline = Pipeline(stages=ngrams)

# transform data
texts_ngrams = pipeline.fit(df).transform(df)

# display result
texts_ngrams.select('2-grams').show(truncate=False)
texts_ngrams.select('3-grams').show(truncate=False)
texts_ngrams.select('4-grams').show(truncate=False)

+------------------------------------------------------------------+
|2-grams                                                           |
+------------------------------------------------------------------+
|[I like, like playing, playing basketball]                        |
|[I like, like coding]                                             |
|[I like, like machine, machine learning, learning very, very much]|
+------------------------------------------------------------------+

+----------------------------------------------------------------------------------+
|3-grams                                                                           |
+----------------------------------------------------------------------------------+
|[I like playing, like playing basketball]                                         |
|[I like coding]                                                                   |
|[I like machine, like machine learning, machine learning very, learning very much]|
+-----

In [23]:
import nltk

In [24]:
nltk.corpus.gutenberg.fileids()

[u'austen-emma.txt',
 u'austen-persuasion.txt',
 u'austen-sense.txt',
 u'bible-kjv.txt',
 u'blake-poems.txt',
 u'bryant-stories.txt',
 u'burgess-busterbrown.txt',
 u'carroll-alice.txt',
 u'chesterton-ball.txt',
 u'chesterton-brown.txt',
 u'chesterton-thursday.txt',
 u'edgeworth-parents.txt',
 u'melville-moby_dick.txt',
 u'milton-paradise.txt',
 u'shakespeare-caesar.txt',
 u'shakespeare-hamlet.txt',
 u'shakespeare-macbeth.txt',
 u'whitman-leaves.txt']

In [25]:
nltk.corpus.gutenberg.sents('shakespeare-macbeth.txt')

[[u'[', u'The', u'Tragedie', u'of', u'Macbeth', u'by', u'William', u'Shakespeare', u'1603', u']'], [u'Actus', u'Primus', u'.'], ...]

In [26]:
pdf = pd.DataFrame({
        'shakespeare-macbeth': nltk.corpus.gutenberg.sents('shakespeare-macbeth.txt')
    })
df = spark.createDataFrame(pdf)

In [32]:
df.show(truncate=False)

+---------------------------------------------------------------------------------------------+
|shakespeare-macbeth                                                                          |
+---------------------------------------------------------------------------------------------+
|[[, The, Tragedie, of, Macbeth, by, William, Shakespeare, 1603, ]]                           |
|[Actus, Primus, .]                                                                           |
|[Scoena, Prima, .]                                                                           |
|[Thunder, and, Lightning, .]                                                                 |
|[Enter, three, Witches, .]                                                                   |
|[1, .]                                                                                       |
|[When, shall, we, three, meet, againe, ?]                                                    |
|[In, Thunder, ,, Lightning, ,, or, in, 

In [35]:
from nltk.corpus import gutenberg

In [39]:
gutenberg_fileids = gutenberg.fileids()
gutenberg_fileids

[u'austen-emma.txt',
 u'austen-persuasion.txt',
 u'austen-sense.txt',
 u'bible-kjv.txt',
 u'blake-poems.txt',
 u'bryant-stories.txt',
 u'burgess-busterbrown.txt',
 u'carroll-alice.txt',
 u'chesterton-ball.txt',
 u'chesterton-brown.txt',
 u'chesterton-thursday.txt',
 u'edgeworth-parents.txt',
 u'melville-moby_dick.txt',
 u'milton-paradise.txt',
 u'shakespeare-caesar.txt',
 u'shakespeare-hamlet.txt',
 u'shakespeare-macbeth.txt',
 u'whitman-leaves.txt']

In [40]:
gutenberg.abspath(gutenberg_fileids[0])

FileSystemPathPointer(u'/Users/mingchen/nltk_data/corpora/gutenberg/austen-emma.txt')

In [43]:
gutenberg.raw(gutenberg_fileids[0])[:200]

u'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; an'

In [44]:
gutenberg.words()

[u'[', u'Emma', u'by', u'Jane', u'Austen', u'1816', ...]

In [45]:
len(gutenberg.words())

2621613

In [46]:
gutenberg.sents(gutenberg_fileids[0])

[[u'[', u'Emma', u'by', u'Jane', u'Austen', u'1816', u']'], [u'VOLUME', u'I'], ...]

In [47]:
len(gutenberg.sents(gutenberg_fileids[0]))

7752

In [53]:
from nltk.corpus import PlaintextCorpusReader
corpus_data = PlaintextCorpusReader('./data', '.*')

In [54]:
data_fileids = corpus_data.fileids()
data_fileids

['Advertising.csv',
 'Credit.csv',
 'WineData.csv',
 'churn-bigml-20.csv',
 'churn-bigml-80.csv',
 'cuse_binary.csv',
 'horseshoe_crab.csv',
 'hsb2.csv',
 'hsb2_modified.csv',
 'iris.csv',
 'mtcars.csv',
 'prostate.csv',
 'twitter.txt']

In [56]:
corpus_data.raw('twitter.txt')

u'Fresh install of XP on new computer. Sweet relief! fuck vista\t1018769417\t1.0\nWell. Now I know where to go when I want my knives. #ChiChevySXSW http://post.ly/RvDl\t10284216536\t1.0\n"Literally six weeks before I can take off ""SSC Chair"" off my email. Its like the torturous 4th mile before everything stops hurting."\t10298589026\t1.0\nMitsubishi i MiEV - Wikipedia, the free encyclopedia - http://goo.gl/xipe Cutest car ever!\t109017669432377344\t1.0\n\'Cheap Eats in SLP\' - http://t.co/4w8gRp7\t109642968603963392\t1.0\nTeenage Mutant Ninja Turtle art is never a bad thing... http://bit.ly/aDMHyW\t10995492579\t1.0\nNew demographic survey of online video viewers: http://bit.ly/cx8b7I via @KellyOlexa\t11713360136\t1.0\nhi all - i\'m going to be tweeting things lookstat at the @lookstat twitter account. please follow me there\t1208319583\t1.0\nHoly carp, no. That movie will seriously suffer for it. RT @MouseInfo: Anyone excited for The Little Mermaid in 3D?\t121330835726155776\t1.0\n"D

In [62]:
corpus_data.words(fileids='twitter.txt')

[u'Fresh', u'install', u'of', u'XP', u'on', u'new', ...]

In [63]:
len(corpus_data.words(fileids='twitter.txt'))

253

In [64]:
corpus_data.sents(fileids='twitter.txt')

[[u'Fresh', u'install', u'of', u'XP', u'on', u'new', u'computer', u'.'], [u'Sweet', u'relief', u'!'], ...]

In [65]:
len(corpus_data.sents(fileids='twitter.txt'))

14

In [66]:
from nltk.corpus import wordnet

In [84]:
wordnet.synsets('car')[0]

AttributeError: 'Synset' object has no attribute 'lower'

In [91]:
dir(wordnet.synsets('car')[0])
wordnet.synsets('car')[0]._name

u'car.n.01'

In [77]:
wordnet.synset('car.n.01').lemma_names()

[u'car', u'auto', u'automobile', u'machine', u'motorcar']

In [78]:
wordnet.synset('car.n.02').lemma_names()

[u'car', u'railcar', u'railway_car', u'railroad_car']

In [92]:
wordnet.synsets('car')

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

In [94]:
pdf = pd.DataFrame({
        'car_synsets': [synsets._name for synsets in wordnet.synsets('car')]
    })
df = spark.createDataFrame(pdf)

In [96]:
df.show()
from pyspark.sql.functions import udf

+--------------+
|   car_synsets|
+--------------+
|      car.n.01|
|      car.n.02|
|      car.n.03|
|      car.n.04|
|cable_car.n.01|
+--------------+



In [101]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from nltk.corpus import wordnet

returntype = ArrayType(StringType())
synset_lemmas_udf = udf(lambda x: wordnet.synset(x).lemma_names(), returnType=returntype)

In [109]:
wordnet.synset('car.n.01').lemma_names()

[u'car', u'auto', u'automobile', u'machine', u'motorcar']

In [114]:
df_lemmas = df.select('car_synsets', synset_lemmas_udf(df.car_synsets).alias('lemma_names'))

In [116]:
df_lemmas.show(truncate=False)

+--------------+------------------------------------------+
|car_synsets   |lemma_names                               |
+--------------+------------------------------------------+
|car.n.01      |[car, auto, automobile, machine, motorcar]|
|car.n.02      |[car, railcar, railway_car, railroad_car] |
|car.n.03      |[car, gondola]                            |
|car.n.04      |[car, elevator_car]                       |
|cable_car.n.01|[cable_car, car]                          |
+--------------+------------------------------------------+



In [117]:
car_n_01 = wordnet.synset('car.n.01')

In [119]:
car_n_01.definition()

u'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [None]:
synset_definition_udf = udf(lambda x: wordnet.synset(x).definition(), StringType())
df_2 = df.select('car_synsets',
                        synset_definition_udf(df.car_synsets).alias('definition'))

In [None]:
df_2.show(truncate=False)