# Import libraries and create the Spark session

In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
#import modules
from pyspark.sql import *
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [3]:
# Build the SparkSession used by every later cell, or reuse one that already
# exists in this kernel (getOrCreate).
application_name = 'Social Media Analytics'
spark = (
    SparkSession.builder
    .master("local")
    .appName(application_name)
    # NOTE(review): this key/value looks like a documentation placeholder — confirm it is needed.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [4]:
# Load the tweets CSV into a Spark DataFrame: the first row supplies the
# column names and the column types are inferred from the data.
twitter = spark.read.options(header=True, inferSchema=True).csv('/content/twitter.csv')

# Preview the first three rows without truncating the tweet text.
twitter.show(n=3, truncate=False)

row_total = twitter.count()
print("count of data " , row_total)

+---+-----+--------------------------------------------------------------------------------------------------------------------------+
|id |label|tweet                                                                                                                     |
+---+-----+--------------------------------------------------------------------------------------------------------------------------+
|1  |0    | @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run                    |
|2  |0    |@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked|
|3  |0    |  bihday your majesty                                                                                                     |
+---+-----+--------------------------------------------------------------------------------------------------------------------------+
only showing top 3 rows

count of data  31962


# **Prepare the data for analysis**

In [5]:
# Keep only the two columns used from here on.
twitter = twitter.select('label', 'tweet')

# Drop every row that contains a null in any remaining column.
twitter_new = twitter.na.drop()

# The count matches the raw row count above, i.e. this dataset has no null values.
non_null_total = twitter_new.count()
print("count of data after dropping null" ,non_null_total)
twitter_new.show(n=3, truncate=False)

count of data after dropping null 31962
+-----+--------------------------------------------------------------------------------------------------------------------------+
|label|tweet                                                                                                                     |
+-----+--------------------------------------------------------------------------------------------------------------------------+
|0    | @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run                    |
|0    |@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked|
|0    |  bihday your majesty                                                                                                     |
+-----+--------------------------------------------------------------------------------------------------------------------------+
only showing top 3 rows



In [6]:
# Compare the row count before and after removing exact duplicate rows.
rows_before = twitter_new.count()
print("count of data after dropping null" ,rows_before)

# distinct() keeps one copy of each (label, tweet) row.
distinctDF = twitter_new.distinct()
distinct_total = distinctDF.count()
print("Distinct count: " + str(distinct_total))

# dropDuplicates() with no column subset compares whole rows as well, so it
# reports the same count as distinct() above.
new_tweet = twitter_new.dropDuplicates()
deduped_total = new_tweet.count()
print("count of data after removing duplicated rows: " + str(deduped_total))

distinctDF.show(n=3, truncate=False)

count of data after dropping null 31962
Distinct count: 29528
count of data after removing duplicated rows: 29528
+-----+---------------------------------------------------------------------------------------------------------------+
|label|tweet                                                                                                          |
+-----+---------------------------------------------------------------------------------------------------------------+
|0    |last day at work tomorrow for me- self employed from monday #scared   #redditchbizhour                         |
|0    |2 hours to go until our music video is released! #music   #edinburgh #scotland #band @user #musicvideo #a @user|
|0    | @user omg bitch ðððð i knew you was gone do lil kim .. actually look like a music video           |
+-----+---------------------------------------------------------------------------------------------------------------+
only showing top 3 rows



In [7]:
# Sanity check: count how often each tweet text still occurs after
# de-duplication and list the most frequent ones first. A count above 1 would
# mean the same text appears with more than one label.
tweet_counts = new_tweet.groupBy("tweet").count()
tweet_counts.sort(col("count").desc()).show()

+--------------------+-----+
|               tweet|count|
+--------------------+-----+
|#tgif   #ff to my...|    1|
|bus riding to not...|    1|
|i wish this was m...|    1|
|sad or dukhi shai...|    1|
|this week new me ...|    1|
| @user she is my ...|    1|
| @user   my #mom ...|    1|
| @user as forecas...|    1|
|lack of access sp...|    1|
|just finished s4 ...|    1|
|here you have it ...|    1|
|stage set for tom...|    1|
|good morning #ins...|    1|
|@user #god should...|    1|
|  #fathers day fr...|    1|
|if someone gossip...|    1|
|@user president j...|    1|
|leaving the home ...|    1|
|new logo and webs...|    1|
|thank you bjp for...|    1|
+--------------------+-----+
only showing top 20 rows



In [8]:
# Persist the de-duplicated tweets as comma-delimited CSV with a header row.
# mode("overwrite") replaces output left behind by a previous run; the default
# save mode raises an error when the target path already exists, which would
# make this cell fail on every re-run of the notebook.
new_tweet.write.mode("overwrite").options(header='True', delimiter=',').csv("preprocessing_twitter_")
