In [1]:
import datetime
import tarfile
import json
import bz2
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf, desc, min, max, to_timestamp, to_date, date_format, col, expr, hour, year, month, dayofweek, count
from pyspark.sql import functions as F #module that includes a variety of functions like to extract features

In [3]:
sc

In [4]:
#viewing dataset as pd df
columns = ["index", "id", "date", "flag", "user", "text"]
pdDf = pd.read_csv("ProjectTweets.csv", header=None, names=columns)

pdDf

Unnamed: 0,index,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,2,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,3,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,4,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,1599995,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,1599996,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,1599997,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,1599998,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


### Loading csv data file to spark

In [5]:
# Issue with pattern in the DateTimeFormatter when converting to timestamp due to spark version. Setting timeParserPolicy to LEGACY as the error suggested
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY") 

#defining schema
schema = StructType([
    StructField("index", StringType(), True),
    StructField("id", StringType(), True),
    StructField("date", StringType(), True),
    StructField("flag", StringType(), True),
    StructField("user", StringType(), True),
    StructField("text", StringType(), True),
])

tweetsDf= spark.read.csv('hdfs://localhost:9000/user1/ProjectTweets.csv', schema=schema, header=False)

# converting date column to timestamp, pattern taken from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
tweetsDf= tweetsDf.withColumn("date", to_timestamp("date", "EEE MMM dd HH:mm:ss zzz yyyy"))

tweetsDf.printSchema();tweetsDf.show()

root
 |-- index: string (nullable = true)
 |-- id: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)



[Stage 0:>                                                          (0 + 1) / 1]

+-----+----------+-------------------+--------+---------------+--------------------+
|index|        id|               date|    flag|           user|                text|
+-----+----------+-------------------+--------+---------------+--------------------+
|    0|1467810369|2009-04-07 06:19:45|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|    1|1467810672|2009-04-07 06:19:49|NO_QUERY|  scotthamilton|is upset that he ...|
|    2|1467810917|2009-04-07 06:19:53|NO_QUERY|       mattycus|@Kenichan I dived...|
|    3|1467811184|2009-04-07 06:19:57|NO_QUERY|        ElleCTF|my whole body fee...|
|    4|1467811193|2009-04-07 06:19:57|NO_QUERY|         Karoli|@nationwideclass ...|
|    5|1467811372|2009-04-07 06:20:00|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|    6|1467811592|2009-04-07 06:20:03|NO_QUERY|        mybirch|         Need a hug |
|    7|1467811594|2009-04-07 06:20:03|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|    8|1467811795|2009-04-07 06:20:05|NO_QUERY|2Hood4Hollywood|@T

                                                                                

checking flag column

In [6]:
#extracting "flag" column as RDD
flag_rdd = tweetsDf.select("flag").rdd

# getting distinct values
unique_flags = flag_rdd.distinct()

#collecting the distinct values
unique_flags.collect()

                                                                                

[Row(flag='NO_QUERY')]

In [7]:
# counting the number of duplicated IDs
tweetsDf.groupBy("id").count().filter(col("count") > 1).count()

                                                                                

1685

In [8]:
#counting the number of duplicated rows, leaving index aside
tweetsDf.groupBy(["id", "date", "flag", "user", "text"]).count().filter(col("count") > 1).count()

                                                                                

1685

In [9]:
#number of rows
tweetsDf.count()

                                                                                

1600000

In [10]:
# dropping rows with duplicate IDs (keeping the first occurrence)
tweetsDf= tweetsDf.dropDuplicates(["id"])

In [11]:
#number of rows
tweetsDf.count()

                                                                                

1598315

Checking users with greatest count of tweets

In [12]:
# registerring the df as a temporary view
tweetsDf.createOrReplaceTempView("tweets")

# SQL query
query = """SELECT user, COUNT(*) as count
    FROM tweets
    GROUP BY user
    ORDER BY count DESC"""

# running the SQL query and showing result
spark.sql(query).show()

[Stage 26:>                                                         (0 + 1) / 1]

+---------------+-----+
|           user|count|
+---------------+-----+
|       lost_dog|  549|
|        webwoke|  345|
|       tweetpet|  310|
|SallytheShizzle|  281|
|    VioletsCRUK|  279|
|    mcraddictal|  276|
|       tsarnick|  248|
|    what_bugs_u|  246|
|    Karen230683|  238|
|      DarkPiano|  236|
|   SongoftheOss|  227|
|      Jayme1988|  225|
|         keza34|  219|
| ramdomthoughts|  216|
|      shanajaca|  213|
|         wowlew|  212|
|     nuttychris|  211|
|   TraceyHewins|  211|
|   thisgoeshere|  207|
|     Spidersamm|  205|
+---------------+-----+
only showing top 20 rows



                                                                                

### DATES

In [13]:
#Ordering rows by date
tweetsDf= tweetsDf.orderBy("date")
tweetsDf.show()



+-----+----------+-------------------+--------+---------------+--------------------+
|index|        id|               date|    flag|           user|                text|
+-----+----------+-------------------+--------+---------------+--------------------+
|    0|1467810369|2009-04-07 06:19:45|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|    1|1467810672|2009-04-07 06:19:49|NO_QUERY|  scotthamilton|is upset that he ...|
|    2|1467810917|2009-04-07 06:19:53|NO_QUERY|       mattycus|@Kenichan I dived...|
|    4|1467811193|2009-04-07 06:19:57|NO_QUERY|         Karoli|@nationwideclass ...|
|    3|1467811184|2009-04-07 06:19:57|NO_QUERY|        ElleCTF|my whole body fee...|
|    5|1467811372|2009-04-07 06:20:00|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|    6|1467811592|2009-04-07 06:20:03|NO_QUERY|        mybirch|         Need a hug |
|    7|1467811594|2009-04-07 06:20:03|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|    8|1467811795|2009-04-07 06:20:05|NO_QUERY|2Hood4Hollywood|@T

                                                                                

In [14]:
# checking if the df is ordered by id
tweetsDf.agg(F.min("id").alias("min_id"), F.max("id").alias("max_id")) \ # aggregation on the df and calculation of the min and max values of parameter "id" 
                    .select(F.col("min_id") < F.col("max_id")) \ #checking if min_id < max_id, returning a df with a single Boolean value
                    .collect()[0][0] # collect executes the transformations and select access row 0 column 0 from resulting df

# True confirms the df is ordered by id

                                                                                

True

In [19]:
# is the df ordered by index? Same code as above
tweetsDf.agg(F.min("index").alias("min_index"), F.max("index").alias("max_index")) \
                    .select(F.col("min_index") < F.col("max_index")) \
                    .collect()[0][0]

                                                                                

True

In [22]:
# from pyspark.sql.window import Window

# # Create a window specification to define the order by 'date'
# windowSpec = Window.orderBy("date")

# Add a column that represents the previous 'date' value
tweetsDf = tweetsDf.withColumn("prev_date", F.lag("date").over(tweetsDf))

# Filter out rows where there is a missing date
missing_dates_df = tweetsDf.filter((F.col("date") - F.col("prev_date") > F.expr("INTERVAL 1 DAY")) | F.col("prev_date").isNull())

# Show the rows before the missing date
missing_dates_df.show()


TypeError: window should be WindowSpec

In [None]:
tweetsDf.agg(F.min("date")).collect()[0][0]

In [None]:
tweetsDf.agg(F.max("date")).collect()[0][0]

In [20]:
sql_query = """
SELECT *,
       UNIX_TIMESTAMP(date) - LAG(UNIX_TIMESTAMP(date), 1, 0) OVER (ORDER BY date) AS gap_seconds
FROM tweets
ORDER BY gap_seconds DESC
"""

spark.sql(sql_query).show()

2023-07-27 03:26:17,852 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-07-27 03:26:17,856 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-07-27 03:26:26,609 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-07-27 03:26:30,579 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 86:>                                                         (0 + 1) / 1]

+-------+----------+-------------------+--------+---------------+--------------------+-----------+
|  index|        id|               date|    flag|           user|                text|gap_seconds|
+-------+----------+-------------------+--------+---------------+--------------------+-----------+
|      0|1467810369|2009-04-07 06:19:45|NO_QUERY|_TheSpecialOne_|@switchfoot http:...| 1239081585|
|  41607|1675708196|2009-05-02 04:08:46|NO_QUERY|        jayseto|jayseto@KevinSpac...|     920791|
|   8575|1548274671|2009-04-18 04:30:31|NO_QUERY|   xoLovebug224|Working on my son...|     906261|
| 460800|2169448960|2009-06-14 22:31:20|NO_QUERY|        ddrfire|@squarespace Dang...|     580723|
|  77476|1750882626|2009-05-10 02:08:48|NO_QUERY|     m3l0v3sr0n|2 more hours of w...|     469053|
| 135509|1879906505|2009-05-22 07:29:41|NO_QUERY|    antcastillo|@terencefitz yeah...|     314081|
| 104024|1822312820|2009-05-17 03:17:07|NO_QUERY|ReLLySiLLyViLLy|My feet hurt soo ...|     205379|
|  94940|1

                                                                                

In [None]:
gaps_df = spark.sql("""
    SELECT DATE_ADD(d1, 1) AS missing_date
    FROM (
        SELECT CAST(date AS DATE) AS d1, 
               LEAD(CAST(date AS DATE), 1) OVER (ORDER BY date) AS d2
        FROM tweets
    ) temp
    WHERE DATE_ADD(d1, 1) < d2
""")

# Count the number of gaps in days
gaps_df.count()

In [21]:
# Use SQL with a CTE to find the rows with missing timestamps in between
result_df = spark.sql("""
    WITH Temp AS (
        SELECT *,
               DATE_ADD(CAST(date AS DATE), 1) AS next_date,
               LEAD(CAST(date AS DATE), 1) OVER (ORDER BY date) AS lead_date
        FROM tweets
    )
    SELECT *
    FROM Temp
    WHERE next_date < lead_date
""")

# Show the resulting DataFrame with rows that have missing timestamps in between
result_df.show(truncate=False)

2023-07-27 03:27:16,272 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-07-27 03:27:16,276 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-07-27 03:27:24,623 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-07-27 03:27:28,127 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 92:>                                                         (0 + 1) / 1]

+-------+----------+-------------------+--------+-------------+-------------------------------------------------------------------------------------------------------------------------------+----------+----------+
|index  |id        |date               |flag    |user         |text                                                                                                                           |next_date |lead_date |
+-------+----------+-------------------+--------+-------------+-------------------------------------------------------------------------------------------------------------------------------+----------+----------+
|812095 |1470241255|2009-04-07 16:46:10|NO_QUERY|tallcathy    |That's why I love ya jimmy                                                                                                     |2009-04-08|2009-04-18|
|858416 |1574131184|2009-04-21 12:22:15|NO_QUERY|LaDy_aLySsa  |@KattPackAllDay THIS SHIT RIGHT HEREEEEEEEEE                                     

                                                                                