## Data Cleaning

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Time Series Sentiment Analysis")
    .getOrCreate()
)

In [3]:
# Read ProjectTweets.csv from the '/user1' folder (the path is resolved against
# Spark's default filesystem — presumably HDFS here; confirm).
# header=False: every line is treated as data and columns get positional names
# (_c0, _c1, ...). inferSchema=True: Spark derives each column's type from the data.
df = (
    spark.read
    .option("header", False)
    .option("inferSchema", True)
    .csv('/user1/ProjectTweets.csv')
)

                                                                                

In [4]:
# Print the DataFrame schema (column names, data types, nullability) as a tree,
# to check what Spark produced for the freshly loaded CSV.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: long (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



In [5]:
from pyspark.sql import Row

# Names for the six positional CSV columns (_c0 .. _c5), in file order.
# NOTE(review): the values under "user_id" look like per-tweet ids rather than
# user ids — confirm against the dataset description before relying on the name.
new_column_names = ["index", "user_id", "timestamp", "query", "username", "tweet_text"]

# The CSV was read with header=False, so its first row is an ordinary tweet, not a
# header. There is therefore nothing to extract, filter out, and union back in:
# that round trip cost an extra Spark job, silently dropped any row whose index
# was NULL (NULL != x is never true), and rebuilt the first row with re-inferred
# types. Renaming the columns positionally is all that is needed.
# toDF raises if the number of names does not match the number of columns.
df = df.toDF(*new_column_names)

# Drop the "query" column
df = df.drop("query")

# Show the DataFrame to verify
df.show()

                                                                                

+-----+----------+--------------------+---------------+--------------------+
|index|   user_id|           timestamp|       username|          tweet_text|
+-----+----------+--------------------+---------------+--------------------+
|    0|1467810369|Mon Apr 06 22:19:...|_TheSpecialOne_|@switchfoot http:...|
|    1|1467810672|Mon Apr 06 22:19:...|  scotthamilton|is upset that he ...|
|    2|1467810917|Mon Apr 06 22:19:...|       mattycus|@Kenichan I dived...|
|    3|1467811184|Mon Apr 06 22:19:...|        ElleCTF|my whole body fee...|
|    4|1467811193|Mon Apr 06 22:19:...|         Karoli|@nationwideclass ...|
|    5|1467811372|Mon Apr 06 22:20:...|       joy_wolf|@Kwesidei not the...|
|    6|1467811592|Mon Apr 06 22:20:...|        mybirch|         Need a hug |
|    7|1467811594|Mon Apr 06 22:20:...|           coZZ|@LOLTrish hey  lo...|
|    8|1467811795|Mon Apr 06 22:20:...|2Hood4Hollywood|@Tatiana_K nope t...|
|    9|1467812025|Mon Apr 06 22:20:...|        mimismo|@twittera que me ...|

In [6]:
from pyspark.sql.functions import when, count, col

# Tally NULL cells per column in a single pass.
# when(...) without otherwise() evaluates to NULL where the condition is false, and
# count() ignores NULLs, so each expression counts only the rows where the column is NULL.
# Note: this detects NULLs only; empty strings are not treated as missing.
null_counters = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]

# collect() returns a one-element list holding a Row of per-column NULL counts.
missing_data_count = df.select(null_counters).collect()
missing_data_count

                                                                                

[Row(index=0, user_id=0, timestamp=0, username=0, tweet_text=0)]

In [7]:
from pyspark.sql.functions import regexp_replace, col

# Initial text clean-up, expressed as ordered (regex, replacement) passes.
# Order matters: URLs are stripped first, then mentions/hashtags as whole tokens,
# then leftover symbols, and whitespace is tidied last.
cleanup_passes = [
    ("(http://[^\\s]+|https://[^\\s]+)", ""),  # URLs
    ("(@[\\w]+)", ""),                          # @mentions
    ("(#[\\w]+)", ""),                          # #hashtags (the whole tag is removed)
    ("[&*%$#@!?]+", ""),                        # remaining special characters
    ("\\s+", " "),                              # collapse whitespace runs to one space
    ("^\\s+|\\s+$", ""),                        # trim leading/trailing whitespace
]

# Fold the passes into a single column expression, starting from the raw tweet text.
cleaned_expr = col("tweet_text")
for pattern, replacement in cleanup_passes:
    cleaned_expr = regexp_replace(cleaned_expr, pattern, replacement)

df = df.withColumn("cleaned_text", cleaned_expr)

# Count rows whose text changed in any way. This includes rows where only
# whitespace was collapsed or trimmed, not just rows that lost URLs, mentions,
# hashtags, or special characters.
affected_count = df.where(col("tweet_text") != col("cleaned_text")).count()

print(f"Number of rows affected by the cleanup: {affected_count}")



Number of rows affected by the cleanup: 1600000


                                                                                

In [8]:
# Preview the DataFrame to compare tweet_text with the derived cleaned_text column.
df.show()

+-----+----------+--------------------+---------------+--------------------+--------------------+
|index|   user_id|           timestamp|       username|          tweet_text|        cleaned_text|
+-----+----------+--------------------+---------------+--------------------+--------------------+
|    0|1467810369|Mon Apr 06 22:19:...|_TheSpecialOne_|@switchfoot http:...|- Awww, that's a ...|
|    1|1467810672|Mon Apr 06 22:19:...|  scotthamilton|is upset that he ...|is upset that he ...|
|    2|1467810917|Mon Apr 06 22:19:...|       mattycus|@Kenichan I dived...|I dived many time...|
|    3|1467811184|Mon Apr 06 22:19:...|        ElleCTF|my whole body fee...|my whole body fee...|
|    4|1467811193|Mon Apr 06 22:19:...|         Karoli|@nationwideclass ...|no, it's not beha...|
|    5|1467811372|Mon Apr 06 22:20:...|       joy_wolf|@Kwesidei not the...|  not the whole crew|
|    6|1467811592|Mon Apr 06 22:20:...|        mybirch|         Need a hug |          Need a hug|
|    7|1467811594|Mo