In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,lit

# Obtain the SparkSession used by every later cell; getOrCreate() is called
# on the default builder, with no app name or config set here.
spark = SparkSession.builder.getOrCreate()

import os
import glob

In [2]:
# Account name used below to build the per-user site-packages path.
username = 'mhk9c'
# Install a pip package in the current Jupyter kernel
# (sys.executable is the interpreter running this kernel, so pip targets it).
import sys
!{sys.executable} -m pip install demoji
# Make the per-user install directory importable from this kernel.
# NOTE(review): the home directory layout and the python3.7 segment are
# hard-coded; this presumably breaks for another user or interpreter
# version - confirm, and consider site.getusersitepackages() instead.
sys.path.append(f'/home/{username}/.local/lib/python3.7/site-packages/')

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import demoji 
# Fetch the emoji code table that demoji uses for its lookups.
# NOTE(review): newer demoji releases reportedly bundle the codes and
# deprecate this call - confirm against the installed version.
demoji.download_codes()

  from ipykernel import kernelapp as app


In [4]:
# Directory holding the raw CSV files; load_data() also keeps its parquet
# copy in a "<dir_name>_parquet" folder underneath it.
data_path = "/project/ds5559/team1_sp22/data/russian-troll-tweets-master"
# Switch meant to control whether the parquet copy is rebuilt from the CSVs.
# NOTE(review): the load_data() call further down passes a literal False
# rather than this variable - confirm which one is intended.
reload = False

In [5]:
def load_data(_data_path, reload=True):
    """Load the tweet dataset stored under ``_data_path`` as a Spark DataFrame.

    When ``reload`` is true, every ``*.csv`` file directly inside
    ``_data_path`` is read (header row, inferred schema, malformed rows
    dropped), tagged with a ``source_file`` column holding the path it was
    read from, and the union of all files is written as parquet to
    ``<_data_path>/<dir_name>_parquet``, overwriting any previous copy.
    When ``reload`` is false, that parquet copy is read back instead.

    Args:
        _data_path: Directory containing the CSV files and the parquet copy.
        reload: Rebuild the parquet copy from the CSV files when true; read
            the existing parquet copy when false.

    Returns:
        The DataFrame built from the CSV files (``reload`` true) or read
        from the parquet copy (``reload`` false).

    Raises:
        FileNotFoundError: ``reload`` is true but ``_data_path`` contains no
            ``*.csv`` files.
    """
    dir_name = os.path.basename(os.path.normpath(_data_path))
    print(dir_name)

    # Build the parquet location from the argument. The previous version used
    # the notebook-level `data_path` global here, so calling the function with
    # any other directory read/wrote the wrong place.
    parquet_path = f"{_data_path}/{dir_name}_parquet"

    if reload:
        # Sorted so files are processed (and logged) in a stable order.
        csv_files = sorted(glob.glob(f'{_data_path}/*.csv'))
        if not csv_files:
            # Fail with a clear message instead of an UnboundLocalError on
            # the never-assigned accumulator below.
            raise FileNotFoundError(f'No CSV files found in {_data_path}')

        _df = None
        for file in csv_files:
            print(file)
            file_df = spark.read.csv(
                file, header=True, inferSchema=True, mode="DROPMALFORMED"
            ).withColumn("source_file", lit(file))
            # union() pairs columns by position - assumes every CSV shares
            # the same column order (TODO confirm for this dataset).
            _df = file_df if _df is None else _df.union(file_df)

        _df.write.format("parquet").mode("overwrite").save(parquet_path)
    else:
        _df = spark.read.parquet(parquet_path)

    print('Done loading.')
    return _df
        
        
# Honour the notebook-level `reload` switch instead of a hard-coded False, so
# flipping that one flag is enough to rebuild the parquet copy from the CSVs.
df = load_data(data_path, reload=reload)
# count() triggers a full pass over the data; keep the total for later ratios.
total_tweets = df.count()
print(f'There are {total_tweets} tweets in this dataset')

russian-troll-tweets-master
Done loading.
There are 2914254 tweets in this dataset


In [6]:
# Show column names and types of the loaded dataset (the recorded output
# lists every column as a nullable string).
df.printSchema()

root
 |-- external_author_id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- content: string (nullable = true)
 |-- region: string (nullable = true)
 |-- language: string (nullable = true)
 |-- publish_date: string (nullable = true)
 |-- harvested_date: string (nullable = true)
 |-- following: string (nullable = true)
 |-- followers: string (nullable = true)
 |-- updates: string (nullable = true)
 |-- post_type: string (nullable = true)
 |-- account_type: string (nullable = true)
 |-- retweet: string (nullable = true)
 |-- account_category: string (nullable = true)
 |-- new_june_2018: string (nullable = true)
 |-- alt_external_id: string (nullable = true)
 |-- tweet_id: string (nullable = true)
 |-- article_url: string (nullable = true)
 |-- tco1_step1: string (nullable = true)
 |-- tco2_step1: string (nullable = true)
 |-- tco3_step1: string (nullable = true)
 |-- source_file: string (nullable = true)



In [7]:
# Register the full dataset under the name "tweets" so it can be queried
# with spark.sql(); an existing view of that name is replaced.
df.createOrReplaceTempView("tweets")

In [8]:
# Restrict the registered "tweets" view to rows whose language is 'English'.
sqlDF = spark.sql("SELECT * FROM tweets where language = 'English' ")
english_tweets = sqlDF.count()

# Report the absolute count and its share of all tweets; the ':%' format
# spec renders the ratio as a percentage.
print(
    f'There are {english_tweets} english tweets in this dataset. '
    f'They account for {english_tweets/total_tweets:%} of the dataset.'
)

There are 2096049 english tweets in this dataset. They account for 71.924033% of the dataset.


In [None]:
# Expose whatever sqlDF currently holds as the SQL view "english_tweets",
# replacing any earlier view with that name.
sqlDF.createOrReplaceTempView("english_tweets")

In [None]:
# Rebinds sqlDF to at most 100 rows with only the content and source_file
# columns; every later cell that uses sqlDF then works on this sample.
# NOTE(review): this cell has no execution count, so the recorded outputs
# presumably came from the unsampled frame - confirm before relying on it.
sqlDF = spark.sql("SELECT content,source_file FROM english_tweets LIMIT 100")

In [9]:
import pyspark.sql.functions as func
from pyspark.sql.types import StringType

def convert_emojii(string):
    """Replace each emoji in ``string`` with its colon-delimited description.

    For example, three honeybee emoji become
    ``:honeybee::honeybee::honeybee:``.

    Args:
        string: Text to convert, or ``None``.

    Returns:
        The converted text, or ``None`` when ``string`` is ``None``.
    """
    # The tweet content column is nullable and Spark hands NULL values to
    # Python UDFs as None; pass them through instead of letting demoji fail.
    if string is None:
        return None
    return demoji.replace_with_desc(string, ":")
    
# Quick sanity check on a literal string before wiring the function into Spark.
test = convert_emojii("🐝🐝🐝")
print(test)

# Wrap the converter as a Spark UDF that yields strings.
convert_emojii_UDF = func.udf(convert_emojii, StringType())

# Add the converted text next to the original content column.
sqlDF = sqlDF.withColumn("curated_contenet", convert_emojii_UDF(col("content")))

# Preview up to 100 untruncated rows, then confirm the new column in the schema.
sqlDF.select("curated_contenet", "source_file").show(100, truncate=False)
sqlDF.printSchema()

:honeybee::honeybee::honeybee:
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------+
|curated_contenet                                                                                                                                                        |source_file                                                                       |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------+
|"#ATLCollegeFair + #ATLFair FEB 25TH  - GO KARTS  - BOWLING �  - TRAMPOLINE &amp; MORE  TEXT ""FAIR"" TO 678.755.9821   https://t.co/2Gq5W73cMN z19"                    |/project/ds5559/team1_sp22/data/russi

In [None]:
# NOTE(review): leftover scratch in a cell with no execution count. PySpark
# DataFrames presumably expose map only via `.rdd`, so this attribute access
# likely raises AttributeError - confirm and remove or finish the cell.
sqlDF.map