## Which words in the US Constitution are still in regular use today? How many times does each of these words appear in the US Constitution?

In [1]:
import re
from pyspark.sql.types import Row

In [2]:
# Matches every character that is not an ASCII letter.
regex = re.compile('[^a-zA-Z]')

# Word-count pipeline over the constitution text, yielding (word, count) pairs.
constitution_words = (
    sc.textFile('us_constitution.txt')
    # Split each line into whitespace-separated tokens.
    .flatMap(lambda line: line.split())
    # Lower-case each token and keep only its letters.
    .map(lambda token: regex.sub('', token.lower()))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    # Tokens with no letters collapse to '' above; drop that key.
    .filter(lambda pair: pair[0])
)

# Positional Row fields give the default column names _1 (word) and _2 (count).
constitution_words_df = constitution_words.map(lambda pair: Row(*pair)).toDF()
constitution_words_df.show(5)

+-------+---+
|     _1| _2|
+-------+---+
| united| 96|
|     of|558|
|america|  5|
|     we|  2|
|     in|155|
+-------+---+
only showing top 5 rows



In [3]:
# Reference word list; each line of the file becomes one RDD element
# (presumably one word per line -- confirm against the input file).
top_words = sc.textFile('top_english_words.txt')

# Single-field Rows give a one-column DataFrame whose column is named _1.
top_words_df = top_words.map(lambda word: Row(word)).toDF()
top_words_df.show(5)

+---+
| _1|
+---+
|the|
| of|
|and|
| to|
|  a|
+---+
only showing top 5 rows



In [4]:
# Expose both DataFrames to Spark SQL as temporary views named
# 'top_words' and 'constitution'.
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0.
top_words_df.createOrReplaceTempView('top_words')
constitution_words_df.createOrReplaceTempView('constitution')

In [5]:
# Constitution words that also appear in the top-words list, with their
# occurrence count in the constitution, most frequent first.
# The join predicate lives in an explicit ON clause (standard '=' operator)
# so the join never depends on the optimizer pushing a WHERE filter down.
# 'word' is a secondary sort key so rows with equal counts come back in a
# reproducible order.
top_words_on_constitution = sqlContext.sql('''
    select c._1 word, c._2 events
    from
        constitution c
        inner join top_words t
            on t._1 = c._1
    order by events DESC, word
''')

In [6]:
# Summary line: number of distinct constitution words, then how many of them
# are also present in the top-words list (each count() triggers a Spark job).
print(
    f'Das {constitution_words.count()} palavras únicas da constituição foram encontradas {top_words_on_constitution.count()} nas 10 mil palavras mais usadas no google'
)

Das 1261 palavras únicas da constituição foram encontradas 888 nas 10 mil palavras mais usadas no google


In [7]:
# Display the ten most frequent matching words.
top_words_on_constitution.show(n=10)

+------+------+
|  word|events|
+------+------+
|   the|   847|
|    of|   558|
| shall|   332|
|   and|   295|
|    to|   226|
|    be|   200|
|    or|   173|
|    in|   155|
|states|   146|
|    by|   119|
+------+------+
only showing top 10 rows



In [8]:
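# Optional export, left disabled so that re-running the notebook does not write to disk.
# Uncomment to save the result as CSV under the 'result' path, using a single partition:
# top_words_on_constitution.repartition(1).write.csv('result')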