In [1]:
!pip install plotly



In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import NGram, Tokenizer, RegexTokenizer
import plotly.graph_objects as go
import numpy as np

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
df = spark.read.option("encoding", "UTF-8").option("decode", "UTF-8").csv('../dataset/debate-tweets.tsv', sep='\t', inferSchema=True)

In [5]:
df.show(vertical=True)

-RECORD 0--------------------
 _c0  | 522394422710136832   
 _c1  | @anacddd verdade,... 
 _c2  | -27.516566           
 _c3  | -48.646082           
 _c4  | False                
 _c5  | 522394422710136832   
 _c6  | 522394422710136832   
 _c7  | Wed Oct 15 14:31:... 
 _c8  | 2014-10-15           
 _c9  | 3.0342583E8          
 _c10 | pt                   
 _c11 | Biguaçu              
 _c12 | 77c15e08a456c529     
 _c13 | 0.0                  
 _c14 | 0.0                  
 _c15 | 0.0                  
 _c16 | 0.0                  
 _c17 | 0.0                  
 _c18 | 0.0                  
 _c19 | 0.0                  
 _c20 | 0.0                  
 _c21 | null                 
 _c22 | null                 
 _c23 | null                 
 _c24 | null                 
 _c25 | Geovana Nunes        
 _c26 | 295414968            
 _c27 | 295414968            
 _c28 | null                 
 _c29 | 316                  
 _c30 | Mon May 09 00:12:... 
 _c31 | geovanannunes        
-RECORD 1-

In [6]:
df.printSchema()

root
 |-- _c0: long (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: double (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: double (nullable = true)
 |-- _c14: double (nullable = true)
 |-- _c15: double (nullable = true)
 |-- _c16: double (nullable = true)
 |-- _c17: double (nullable = true)
 |-- _c18: double (nullable = true)
 |-- _c19: double (nullable = true)
 |-- _c20: string (nullable = true)
 |-- _c21: string (nullable = true)
 |-- _c22: long (nullable = true)
 |-- _c23: string (nullable = true)
 |-- _c24: string (nullable = true)
 |-- _c25: string (nullable = true)
 |-- _c26: string (nullable = true)
 |-- _c27: long (nullable = true)
 |-

## Questão 1:

### a)

In [7]:
df_time = df.withColumn('_c7', to_timestamp(f.split(df._c7, ' ')[3], 'HH:mm:ss'))

In [8]:
wordCountMorning = df_time.withColumn('word', f.explode(f.split(regexp_replace(f.upper(df_time._c1), '[\$,!":;. ”]', ''), ' ')))\
    .where("_c7 between '1970-01-01 06:00:00' AND '1970-01-01 11:59:59'")\
    .where("word like '#%'")\
    .groupBy('word')\
    .count()\
    .sort('count', ascending=False)

In [9]:
wordCountAfternoon = df_time.withColumn('word', f.explode(f.split(regexp_replace(f.upper(df_time._c1), '[\$,!":;. ”]', ''), ' ')))\
    .where("_c7 between '1970-01-01 12:00:00' AND '1970-01-01 17:59:59'")\
    .where("word like '#%'")\
    .groupBy('word')\
    .count()\
    .sort('count', ascending=False)

In [10]:
wordCountNight = df_time.withColumn('word', f.explode(f.split(regexp_replace(f.upper(df_time._c1), '[\$,!":;. ”]', ''), ' ')))\
    .where("_c7 between '1970-01-01 18:00:00' AND '1970-01-01 23:59:59'")\
    .where("word like '#%'")\
    .groupBy('word')\
    .count()\
    .sort('count', ascending=False)

In [11]:
wordCountDawn = df_time.withColumn('word', f.explode(f.split(regexp_replace(f.upper(df_time._c1), '[\$,!":;. ”]', ''), ' ')))\
    .where("_c7 between '1970-01-01 00:00:00' AND '1970-01-01 05:59:59'")\
    .where("word like '#%'")\
    .groupBy('word')\
    .count()\
    .sort('count', ascending=False)

In [12]:
wordCountMorning.show(1)

+--------------------+-----+
|                word|count|
+--------------------+-----+
|#EMABIGGESTFANSJU...|  167|
+--------------------+-----+
only showing top 1 row



In [13]:
wordCountAfternoon.show(1)

+--------------------+-----+
|                word|count|
+--------------------+-----+
|#EMABIGGESTFANSJU...|  627|
+--------------------+-----+
only showing top 1 row



In [14]:
wordCountNight.show(1)

+--------------------+-----+
|                word|count|
+--------------------+-----+
|#EMABIGGESTFANSJU...|  693|
+--------------------+-----+
only showing top 1 row



In [15]:
wordCountDawn.show(1)

+--------------------+-----+
|                word|count|
+--------------------+-----+
|#EMABIGGESTFANSJU...|  896|
+--------------------+-----+
only showing top 1 row



### b)

In [16]:
df_time = df.withColumn('_c8', to_timestamp(df._c8, 'yyyy-MM-dd'))

In [17]:
wordCount = df_time.withColumn('word', f.explode(f.split(regexp_replace(f.upper(df_time._c1), '[\$,!"”]', ''), ' ')))\
    .where("word like '#%'")\
    .groupBy('_c8', 'word')\
    .count()\
    .sort('count', ascending=False)

In [18]:
wordCount.show()

+-------------------+--------------------+-----+
|                _c8|                word|count|
+-------------------+--------------------+-----+
|2014-10-16 00:00:00|   #EMABIGGESTFANS1D|68414|
|2014-10-16 00:00:00|#EMABIGGESTFANSJU...|58811|
|2014-10-17 00:00:00|#EMABIGGESTFANSJU...|49599|
|2014-10-17 00:00:00|   #EMABIGGESTFANS1D|47701|
|2014-10-15 00:00:00|   #EMABIGGESTFANS1D|34366|
|2014-10-19 00:00:00|#EMABIGGESTFANSJU...|33508|
|2014-10-19 00:00:00|   #EMABIGGESTFANS1D|29497|
|2014-10-15 00:00:00|#EMABIGGESTFANSJU...|27998|
|2014-10-18 00:00:00|#EMABIGGESTFANSJU...|27399|
|2014-10-18 00:00:00|   #EMABIGGESTFANS1D|27200|
|2014-10-16 00:00:00|       #CAMILASAYSHI|10635|
|2014-10-20 00:00:00|#EMABIGGESTFANSJU...|10475|
|2014-10-20 00:00:00|   #EMABIGGESTFANS1D| 7399|
|2014-10-15 00:00:00|        #STEALMYGIRL| 6929|
|2014-10-20 00:00:00|     #DEBATENARECORD| 4430|
|2014-10-15 00:00:00|   #BIGPAYNODANCEOFF| 4062|
|2014-10-16 00:00:00|        #DEBATENOSBT| 3692|
|2014-10-16 00:00:00

In [19]:
wordCount.filter('_c8 is not null').groupBy('_c8').agg(f.first("word"), f.max("count")).show();

+-------------------+--------------------+----------+
|                _c8|         first(word)|max(count)|
+-------------------+--------------------+----------+
|2014-10-18 00:00:00|#EMABIGGESTFANSJU...|     27399|
|2014-10-15 00:00:00|   #EMABIGGESTFANS1D|     34366|
|2014-10-17 00:00:00|#EMABIGGESTFANSJU...|     49599|
|2014-10-19 00:00:00|#EMABIGGESTFANSJU...|     33508|
|2014-10-16 00:00:00|   #EMABIGGESTFANS1D|     68414|
|2014-10-20 00:00:00|#EMABIGGESTFANSJU...|     10475|
+-------------------+--------------------+----------+



### c)

In [20]:
df_time = df.withColumn('_c7', f.date_trunc('hour',f.to_timestamp(f.concat(df._c8, f.lit(" "), f.split(df._c7, ' ')[3]), 'yyyy-MM-dd HH:mm:ss')))

In [21]:
df_time.select('_c7').filter('_c7 is not null').groupBy('_c7').count().sort('_c7').show()

+-------------------+------+
|                _c7| count|
+-------------------+------+
|2014-10-15 14:00:00| 34378|
|2014-10-15 15:00:00| 79157|
|2014-10-15 16:00:00| 78353|
|2014-10-15 17:00:00| 83950|
|2014-10-15 18:00:00| 77713|
|2014-10-15 19:00:00| 65095|
|2014-10-15 20:00:00| 66813|
|2014-10-15 21:00:00| 79270|
|2014-10-15 22:00:00| 86030|
|2014-10-15 23:00:00| 97574|
|2014-10-16 00:00:00|110232|
|2014-10-16 01:00:00|163338|
|2014-10-16 02:00:00|176211|
|2014-10-16 03:00:00|124599|
|2014-10-16 04:00:00| 77743|
|2014-10-16 05:00:00| 42661|
|2014-10-16 06:00:00| 22228|
|2014-10-16 07:00:00| 10157|
|2014-10-16 08:00:00|  8327|
|2014-10-16 09:00:00| 23616|
+-------------------+------+
only showing top 20 rows



In [22]:
dates = df_time.select('_c8').filter('_c7 is not null').distinct().collect()

In [23]:
for date in dates:
    diff_sec = df_time.where("_c8 = '{}'".format(date['_c8']))\
                    .sort('_c7', ascending=False).first()._c7 -\
    df_time.where("_c8 = '{}'".format(date['_c8'])).first()._c7
    
    value = df_time.select('_c8')\
        .filter('_c7 is not null')\
        .where("_c8 = '{}'".format(date['_c8']))\
        .groupBy('_c8').count().select('count').collect()[0]['count']
    
    print(date['_c8'], 3600*(value/diff_sec.seconds))

2014-10-18 52046.391304347824
2014-10-16 73811.91304347826
2014-10-20 83015.0
2014-10-19 64701.52173913044
2014-10-17 65870.86956521739
2014-10-15 83148.11111111111


### d)

In [24]:
tokenizer = Tokenizer(inputCol="_c1", outputCol="words")
wordsDataFrame = tokenizer.transform(df)

ngram = NGram(inputCol="words", outputCol="ngrams", n = 6)
ngramDataFrame = ngram.transform(wordsDataFrame)

In [25]:
sentenceCount = ngramDataFrame.withColumn('sentences', f.explode(f.col('ngrams')))\
    .where("sentences LIKE '%dilma%'")\
    .groupBy('sentences')\
    .count()\
    .sort('count', ascending=False)

In [26]:
sentenceCount.show()

+--------------------+-----+
|           sentences|count|
+--------------------+-----+
|dilma foi nocaute...|   54|
|@dilmabr não fuja...|   31|
|#euquerodebatenag...|   31|
|dilma foi orienta...|   29|
|usados por dilma ...|   27|
|por dilma para cr...|   26|
|dilma para critic...|   25|
|a cara de deboche...|   25|
|o bem do brasil,m...|   24|
|relatórios usados...|   24|
|de hj, dilma foi ...|   23|
|debate de hj, dil...|   22|
|hj, dilma foi ori...|   22|
|22 anúncios de di...|   22|
|anúncios de dilma...|   22|
|no debate de hj, ...|   21|
|de dilma na tv, 1...|   20|
|dos 22 anúncios d...|   19|
| a cara da dilma é a|   18|
|compara apoio a d...|   17|
+--------------------+-----+
only showing top 20 rows



### e)

In [27]:
tokenizer = Tokenizer(inputCol="_c1", outputCol="words")
wordsDataFrame = tokenizer.transform(df)

ngram = NGram(inputCol="words", outputCol="ngrams", n = 6)
ngramDataFrame = ngram.transform(wordsDataFrame)

In [28]:
sentenceCount = ngramDataFrame.withColumn('sentences', f.explode(f.col('ngrams')))\
    .where("sentences LIKE '%aécio%'")\
    .groupBy('sentences')\
    .count()\
    .sort('count', ascending=False)

In [29]:
sentenceCount.show()

+--------------------+-----+
|           sentences|count|
+--------------------+-----+
|no debate do sbt ...|   54|
|debate do sbt por...|   54|
|do sbt por aécio ...|   53|
|sbt por aécio nev...|   53|
|por aécio neves h...|   52|
|usados por dilma ...|   27|
|por dilma para cr...|   26|
|dilma para critic...|   25|
|criticar aécio so...|   25|
|para criticar aéc...|   25|
|de aécio chefiou ...|   18|
|aécio chefiou órg...|   18|
|irmã de aécio che...|   17|
|levy fidelix anun...|   17|
|aécio somem do si...|   17|
|anuncia apoio a a...|   16|
|apoio a aécio nev...|   16|
|fidelix anuncia a...|   16|
|aécio aécio aécio...|   15|
|dilma na tv, 19 a...|   15|
+--------------------+-----+
only showing top 20 rows



## Questão 2:

In [30]:
df = spark.read.json('../dataset/eiffel-tower-reviews.json')

In [31]:
df.show(vertical=True)

-RECORD 0---------------------------
 _id         | {5921cdae4b679c46... 
 author      | {0, 0, Since this... 
 bubbleCount | 50                   
 collectedAt | {2017-05-21T17:26... 
 createdAt   | May 20, 2017         
 query       | Eiffel_Tower         
 text        | This is the most ... 
 title       | Must do even it w... 
-RECORD 1---------------------------
 _id         | {5921cdae4b679c46... 
 author      | {10, 4, Since Aug... 
 bubbleCount | 50                   
 collectedAt | {2017-05-21T17:26... 
 createdAt   | May 20, 2017         
 query       | Eiffel_Tower         
 text        | My significant ot... 
 title       | A Classic            
-RECORD 2---------------------------
 _id         | {5921cdae4b679c46... 
 author      | {9, 4, Since Nov ... 
 bubbleCount | 50                   
 collectedAt | {2017-05-21T17:26... 
 createdAt   | May 20, 2017         
 query       | Eiffel_Tower         
 text        | We had a tour to ... 
 title       | Wet weather          
-

In [32]:
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- author: struct (nullable = true)
 |    |-- helpfulVotes: string (nullable = true)
 |    |-- level: string (nullable = true)
 |    |-- memberSince: string (nullable = true)
 |    |-- postForum: string (nullable = true)
 |    |-- ratings: string (nullable = true)
 |    |-- reviews: string (nullable = true)
 |-- bubbleCount: long (nullable = true)
 |-- collectedAt: struct (nullable = true)
 |    |-- $date: string (nullable = true)
 |-- createdAt: string (nullable = true)
 |-- query: string (nullable = true)
 |-- text: string (nullable = true)
 |-- title: string (nullable = true)



In [33]:
stopwords = ("", "-", "it's", "a", "able", "about", "above", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "after", "afterwards", "again", "against", "ah", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "are", "aren", "arent", "arise", "around", "as", "aside", "ask", "asking", "at", "auth", "available", "away", "awfully", "b", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "between", "beyond", "biol", "both", "brief", "briefly", "but", "by", "c", "ca", "came", "can", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "could", "couldnt", "d", "date", "did", "didn't", "different", "do", "does", "doesn't", "doing", "done", "don't", "down", "downwards", "due", "during", "e", "each", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "et-al", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "few", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "for", "former", "formerly", "forth", "found", "four", "from", "further", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h", "had", "happens", "hardly", "has", "hasn't", "have", "haven't", "having", "he", "hed", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi", "hid", "him", "himself", "his", "hither", "home", "how", "howbeit", "however", "hundred", "i", "id", "ie", "if", "i'll", "im", "immediate", "immediately", "importance", "important", "in", "inc", "indeed", "index", "information", "instead", "into", "invention", "inward", "is", "isn't", "it", "itd", "it'll", "its", "itself", "i've", "j", "just", "k", "keep	keeps", "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "mug", "must", "my", "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "now", "nowhere", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "omitted", "on", "once", "one", "ones", "only", "onto", "or", "ord", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "she", "shed", "she'll", "shes", "should", "shouldn't", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure	t", "take", "taken", "taking", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'll", "theyre", "they've", "think", "this", "those", "thou", "though", "thoughh", "thousand", "throug", "through", "throughout", "thru", "thus", "til", "tip", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "twice", "two", "u", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon", "ups", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "v", "value", "various", "'ve", "very", "via", "viz", "vol", "vols", "vs", "w", "want", "wants", "was", "wasnt", "way", "we", "wed", "welcome", "we'll", "went", "were", "werent", "we've", "what", "whatever", "what'll", "whats", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "whose", "why", "widely", "willing", "wish", "with", "within", "without", "wont", "words", "world", "would", "wouldnt", "www", "x", "y", "yes", "yet", "you", "youd", "you'll", "your", "youre", "yours", "yourself", "yourselves", "you've", "z", "zero") 

### a)

In [34]:
wordCount = df.withColumn('word', f.explode(f.split(regexp_replace(f.concat(f.lower(f.col('title')), f.lit(" "), f.lower(f.col('text'))), '[\$,!".]', ''), ' ')))\
    .where(f'word not in {stopwords}')\
    .groupBy('word')\
    .count()\
    .sort('count', ascending=False)

In [35]:
wordCount.show()

+----------+-----+
|      word|count|
+----------+-----+
|     tower| 6420|
|     paris| 4127|
|    eiffel| 3958|
|       top| 2998|
|      view| 2276|
|      time| 2230|
|     night| 2192|
|     visit| 2016|
|     views| 1768|
|   amazing| 1726|
|       day| 1541|
|     great| 1485|
|   tickets| 1474|
| beautiful| 1411|
|     worth| 1387|
|      long| 1263|
|      tour| 1082|
|     level| 1045|
|experience| 1039|
|     floor| 1007|
+----------+-----+
only showing top 20 rows



### b)

In [36]:
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
wordsDataFrame = regexTokenizer.transform(df.dropna())
ngram = NGram(inputCol="words", outputCol="ngrams", n = 4)
ngramDataFrame = ngram.transform(wordsDataFrame)

In [37]:
sentenceCount = ngramDataFrame.withColumn('sentences', f.explode(f.col('ngrams')))\
    .groupBy('sentences')\
    .count()\
    .sort('count', ascending=False)

In [38]:
sentenceCount.show()

+--------------------+-----+
|           sentences|count|
+--------------------+-----+
| the eiffel tower is|  407|
|        to go to the|  301|
|       go to the top|  280|
| of the eiffel tower|  265|
|      the top of the|  264|
| to the eiffel tower|  240|
|   the view from the|  158|
|see the eiffel tower|  156|
|       to the top of|  151|
|   view from the top|  150|
|      all the way to|  146|
|      the way to the|  146|
|the eiffel tower and|  144|
|       up to the top|  140|
| to the second floor|  139|
|    top of the tower|  136|
|       to get to the|  134|
|      way to the top|  131|
|        to go up the|  130|
| to the second level|  127|
+--------------------+-----+
only showing top 20 rows



### c)

In [39]:
topicCount = df.withColumn('word', f.explode(f.split(f.lower(f.col('title')), ' ')))\
    .where(f'word not in {stopwords}')\
    .groupBy('word')\
    .count()\
    .sort('count', ascending=False)

In [40]:
topicCount.show()

+----------+-----+
|      word|count|
+----------+-----+
|     paris|  634|
|    eiffel|  579|
|     tower|  556|
|   amazing|  426|
|      view|  340|
| beautiful|  322|
|     great|  317|
|     night|  288|
|     visit|  287|
|     views|  239|
|     worth|  219|
|    iconic|  181|
|       top|  164|
|experience|  136|
|      tour|  127|
|       day|  125|
|     place|  119|
|      best|  116|
|      time|  108|
|  stunning|  103|
+----------+-----+
only showing top 20 rows



### d)

In [41]:
distribuitionTemp = df.groupBy('createdAt')\
    .count()\
    .sort('createdAt', ascending=False)

In [42]:
distribuitionTemp.show()

+------------------+-----+
|         createdAt|count|
+------------------+-----+
| September 9, 2016|   15|
| September 9, 2015|   19|
| September 8, 2016|   35|
| September 8, 2015|    6|
| September 7, 2016|   17|
| September 7, 2015|   13|
| September 6, 2016|   13|
| September 6, 2015|   17|
| September 5, 2016|   14|
| September 5, 2015|    7|
| September 4, 2016|   13|
| September 4, 2015|    6|
|September 30, 2016|   11|
|September 30, 2015|   19|
| September 3, 2016|    6|
| September 3, 2015|   16|
|September 29, 2016|   14|
|September 29, 2015|   20|
|September 28, 2016|    8|
|September 28, 2015|   46|
+------------------+-----+
only showing top 20 rows



In [43]:
distrList = distribuitionTemp.collect()

In [44]:
distrArray = np.array(distrList)

In [45]:
for i in range(len(distrArray)):
    distrArray[i][0] = datetime.datetime.strptime(distrArray[i][0], '%B %d, %Y')

NameError: name 'datetime' is not defined

In [None]:
sortedDistr = distrArray.tolist()

In [None]:
sortedDistr.sort()

In [None]:
distrPlotX = []
distrPlotY = []
for value in sortedDistr:
    distrPlotX.append(value[0])
    distrPlotY.append(int(value[1]))

In [None]:
fig = go.Figure(data=go.Scatter(x=distrPlotX, y=distrPlotY))
fig.update_layout(title='Distribuição temporal',
                   xaxis_title='Data',
                   yaxis_title='Número de Revisões')
fig.show()

In [None]:
spark.stop()