In [1]:
from pyspark.sql import types
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from typing import List

In [2]:
# Create (or reuse) a Spark session on a local master using all available cores.
# NOTE(review): with a local master the executor may share the driver JVM, in
# which case spark.executor.memory would have no effect — confirm.
spark = (
	SparkSession.builder
	.master("local[*]")
	.config("spark.executor.memory", "2gb")
	.getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/02 13:36:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/02 13:36:56 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# 1. Wordcount

In [3]:
# Load the corpus as a DataFrame with a single string column named `value`.
lines_df = spark.read.format('text').load('data/wordcount.txt')
lines_df.printSchema()
# Preview with the defaults spelled out: first 20 rows, long cells truncated.
lines_df.show(n=20, truncate=True)

root
 |-- value: string (nullable = true)

+--------------------+
|               value|
+--------------------+
|word count from W...|
|the word count is...|
|is required to st...|
|proceedings journ...|
|the translation j...|
|and reading speed...|
|six characters to...|
|In non fiction Se...|
|This section does...|
|reliable sources ...|
|Variations in the...|
|which words don't...|
|is a broad consen...|
|The consensus is ...|
|word boundaries a...|
|characters such a...|
|Different word co...|
|details and on wh...|
|of most major wor...|
|handwriting or wi...|
+--------------------+
only showing top 20 rows




[Stage 0:>                                                          (0 + 1) / 1]

                                                                                

In [4]:
# Split every line on single spaces, emit one row per token, then count the
# occurrences of each token and order them from most to least frequent.
words_df = (
	lines_df
	.withColumn('word', F.explode(F.split(F.col('value'), ' ')))
	.groupBy('word')
	.count()
	.orderBy(F.col('count').desc())
)
words_df.show()

+--------+-----+
|    word|count|
+--------+-----+
|     the|   38|
|       a|   28|
|      of|   25|
|    word|   24|
|     and|   23|
|   words|   21|
|      is|   19|
|      to|   18|
|      in|   11|
|   count|   11|
|      or|   11|
|     for|   10|
|      as|    9|
|     may|    8|
|      be|    8|
|    text|    8|
|      on|    7|
|    such|    7|
|counting|    6|
|     can|    5|
+--------+-----+
only showing top 20 rows



In [5]:
# Mark the aggregated frame for caching (MEMORY_AND_DISK) so the counts below
# do not recompute the word count from the text file each time.
words_df.cache()
# Number of distinct tokens in the corpus.
assert words_df.count() == 381

                                                                                

In [6]:
# How many distinct words occur more than 4 times?
filtered_words_df = words_df.where(F.col('count') > 4)
assert filtered_words_df.count() == 26

# 2. Tweet Mining

In [7]:
# Load the tweets from JSON; the schema is inferred by the reader.
tweets_df = spark.read.format('json').load('data/reduced-tweets.json')

In [8]:
# Inspect the inferred schema, then preview the data.
tweets_df.printSchema()
# Defaults spelled out: first 20 rows, long cells truncated.
tweets_df.show(n=20, truncate=True)

root
 |-- country: string (nullable = true)
 |-- id: string (nullable = true)
 |-- place: string (nullable = true)
 |-- text: string (nullable = true)
 |-- user: string (nullable = true)

+--------------------+------------------+-----------------+--------------------+-------------------+
|             country|                id|            place|                text|               user|
+--------------------+------------------+-----------------+--------------------+-------------------+
|               India|572692378957430785|           Orissa|@always_nidhi @Yo...|    Srkian_nishu :)|
|       United States|572575240615796737|        Manhattan|@OnlyDancers Bell...| TagineDiningGlobal|
|       United States|572575243883036672|        Claremont|1/ "Without the a...|        Daniel Beer|
|       United States|572575252020109313|           Vienna|idk why people ha...|   someone actually|
|       United States|572575274539356160|           Boston|Taste of Iceland!...|     BostonAttitude|
|   

In [9]:
# Find all the persons mentioned on tweets
def extract_mentions(words: List[str]) -> List[str]:
	"""Return the lower-cased @mentions found among the tokens of one tweet.

	A token counts as a mention when it starts with '@' and is followed by at
	least one more character, so a bare '@' is ignored.

	Args:
		words: Tokens of a single tweet. ``None`` is accepted and treated as
			"no tokens", because the ``text`` column this is applied to is
			declared nullable.

	Returns:
		The matching tokens, lower-cased, in their original order with
		duplicates kept. Empty when nothing matches or ``words`` is ``None``.
	"""
	# Guard against a missing tweet text instead of failing with a TypeError.
	if words is None:
		return []
	return [word.lower() for word in words if len(word) > 1 and word[0] == '@']

# Wrap the pure-Python extractor as a Spark UDF returning array<string>.
# Despite its `_df` suffix, `filtered_mentions_df` is a UDF, not a DataFrame.
filtered_mentions_df = F.udf(
	lambda z: extract_mentions(z),
	returnType=types.ArrayType(types.StringType()),
)
# Also register the UDF under the name "filter_mentions".
spark.udf.register("filter_mentions", filtered_mentions_df)

# Tokenise each tweet on single spaces, extract its mentions and emit one row
# per mention occurrence in a single `mention` column.
mentions_df = tweets_df.select(
	F.explode(filtered_mentions_df(F.split(F.col('text'), ' '))).alias('mention')
)
mentions_df.cache()
assert mentions_df.count() == 4462
assert mentions_df.where(F.col('mention') == "@jordinsparks").count() == 2

                                                                                

In [10]:
# Find all the hashtags mentioned on tweets
@F.udf(returnType=types.ArrayType(types.StringType()))
def extract_hashtags(words: List[str]) -> List[str]:
	"""Return the lower-cased #hashtags found among the tokens of one tweet.

	Declared as a Spark UDF returning an array of strings. A token counts as a
	hashtag when it starts with '#' and is followed by at least one more
	character, so a bare '#' is ignored.

	Args:
		words: Tokens of a single tweet. ``None`` is accepted and treated as
			"no tokens", because the ``text`` column this is applied to is
			declared nullable.

	Returns:
		The matching tokens, lower-cased, in their original order with
		duplicates kept. Empty when nothing matches or ``words`` is ``None``.
	"""
	# Guard against a missing tweet text instead of failing with a TypeError.
	if words is None:
		return []
	return [word.lower() for word in words if len(word) > 1 and word[0] == '#']

# Tokenise each tweet on single spaces and emit one row per hashtag occurrence.
# The exploded column is not aliased, so it is referred to as `col` below.
hashtags_df = tweets_df.select(
	F.explode(extract_hashtags(F.split(F.col('text'), ' ')))
)

hashtags_df.cache()
assert hashtags_df.count() == 5262
assert hashtags_df.filter(F.col('col') == '#youtube').count() == 4

In [11]:
# Find the names that appear both as a hashtag and as a mention.
# Take the distinct values and drop the leading '#' / '@' (substring from
# position 2) so both sides can be compared on the bare name.
# NOTE(review): the length of 1000 is presumably just an upper bound longer
# than any token — confirm.
hashtags_stripped_df = (
	hashtags_df
	.distinct()
	.select(F.substring('col', 2, 1000).alias("text"))
)
mentions_stripped_df = (
	mentions_df
	.distinct()
	.select(F.substring('mention', 2, 1000).alias("text"))
)

# Inner join on the bare name; drop the mention-side copy so one `text` column remains.
same_hashtags_mentions_df = (
	hashtags_stripped_df
	.join(
		mentions_stripped_df,
		on=hashtags_stripped_df.text == mentions_stripped_df.text,
		how="inner",
	)
	.drop(mentions_stripped_df.text)
)

assert same_hashtags_mentions_df.count() == 39

# Useful Resources

https://sparkbyexamples.com/
https://spark.apache.org/examples.html
https://databricks.com/spark/getting-started-with-apache-spark
https://www.oreilly.com/library/view/spark-the-definitive/9781491912201/