## Import Libraries

In [0]:
%sh pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-c9d9fa13-e3a7-403b-bb94-9d7fb24b1573/bin/python -m pip install --upgrade pip' command.


In [0]:
# Pyspark SQL
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType
from pyspark.sql.functions import col, sum, udf
import pyspark.sql.functions as F

# Sentiment Analyzer
# !pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Pyspark Machine Learning
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

## Load Data

In [0]:

# Define a function to mount

def mount_s3_bucket(access_key, secret_key, bucket_name, mount_folder):
  ACCESS_KEY_ID = access_key
  SECRET_ACCESS_KEY = secret_key
  ENCODED_SECRET_KEY = SECRET_ACCESS_KEY.replace("/", "%2F")

  print ("Mounting", bucket_name)

  try:
    # Unmount the data in case it was already mounted.
    dbutils.fs.unmount("/mnt/%s" % mount_folder)
    
  except:
    # If it fails to unmount it most likely wasn't mounted in the first place
    print ("Directory not unmounted: ", mount_folder)
    
  finally:
    # Lastly, mount our bucket.
    dbutils.fs.mount("s3a://%s:%s@%s" % (ACCESS_KEY_ID, ENCODED_SECRET_KEY, bucket_name), "/mnt/%s" % mount_folder)
    #dbutils.fs.mount("s3a://"+ ACCESS_KEY_ID + ":" + ENCODED_SECRET_KEY + "@" + bucket_name, mount_folder)
    print ("The bucket", bucket_name, "was mounted to", mount_folder, "\n")

In [0]:
# Set AWS programmatic access credentials
ACCESS_KEY = "****************"
SECRET_ACCESS_KEY = "*************************"

In [0]:
mount_s3_bucket(ACCESS_KEY, SECRET_ACCESS_KEY, 'weclouddata/twitter/', 'project')

Mounting weclouddata/twitter/
/mnt/project has been unmounted.
The bucket weclouddata/twitter/ was mounted to project 



In [0]:
%fs ls /mnt/project/

path,name,size,modificationTime
dbfs:/mnt/project/AI/,AI/,0,0
dbfs:/mnt/project/BlackFriday/,BlackFriday/,0,0
dbfs:/mnt/project/CSIS/,CSIS/,0,0
dbfs:/mnt/project/Do Not Use/,Do Not Use/,0,0
dbfs:/mnt/project/ElonMusk/,ElonMusk/,0,0
dbfs:/mnt/project/Inflation/,Inflation/,0,0
dbfs:/mnt/project/Iran/,Iran/,0,0
dbfs:/mnt/project/MTA/,MTA/,0,0
dbfs:/mnt/project/WorldCup/,WorldCup/,0,0
dbfs:/mnt/project/cancer/,cancer/,0,0


In [0]:
filePath = '/mnt/project/BlackFriday/2022/11/24/*/*'

###Create Spark Session

In [0]:
spark = (SparkSession
        .builder
        .appName('df')
        .getOrCreate())

print('Session created')


Session created


In [0]:
sc = spark.sparkContext

In [0]:

# Define schema
schema = StructType([
    StructField('id', StringType(), True),
    StructField('name', StringType(), True),
    StructField('username', StringType(), True),
    StructField('tweet', StringType(), True),
    StructField('followers_count', StringType(), True),
    StructField('location', StringType(), True),
    StructField('geo', StringType(), True),
    StructField('created_at', StringType(), True)
])

In [0]:
df = (spark.read.schema(schema).option('delimiter','\t').csv(filePath))

storing dataframe for faster access

In [0]:
df.cache()

# run the count action to materialize the cache
df.count()

Out[11]: 113814

In [0]:
display(df.head(10))

id,name,username,tweet,followers_count,location,geo,created_at
1595930122003320832,Alan L. Stewart - unetomaterouge,unetomaterouge,"RT @ProgIntl: BREAKING: Tomorrow, Amazon workers at 18 warehouses are going on strike in France and Germany to #MakeAmazonPay, with many ot…",2394,,,Thu Nov 24 23:59:26 +0000 2022
1595930122124550144,Bismillah BigWin,CityxWin0,RT @ChiefElrond: $50 | 24 Hours 🥏 RT & Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM ON BLA…,262,,,Thu Nov 24 23:59:26 +0000 2022
1595930122682478592,Fuck You I Quit,fuckyouiquit,This is the only Black Friday ad you need to see https://t.co/7Hj3CR4YlS,272510,Corporate Accounts Payable,,Thu Nov 24 23:59:26 +0000 2022
1595930123337105417,kumiii🐻 • freetag,chunvrwin,RT @jihanicorn: $150 | 2.250.000 IDR • 24 Hours 💜 - RT & Follow @CryptoCoinCoach + @NeblioTeam ____________________________ (BE ACTIVE ON…,53,🍀,,Thu Nov 24 23:59:27 +0000 2022
1595930124075294722,D_Adrian,DexaWinWin,RT @HeiraCrypto: $100 ~ Ends in 24 hrs. 🔶 RT - Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM…,45,,,Thu Nov 24 23:59:27 +0000 2022
1595930124305649664,$NEST 🎄$TTC freetag sepuasnya🍀,desnumber1,RT @May7ven: $50 — 24 Hours — ➖RT & Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM ON BLACK…,189,kwangya,,Thu Nov 24 23:59:27 +0000 2022
1595930125413285890,bigwin 🔥 $COOKIES 🍪,0xbigwinasn,RT @Beast_Cryptox: 💰 $70 ~ 24 HOURS 🐄🦥 ➖ RT & Follow @CryptoCoinCoach @NeblioTeam ----------------- (BE ACTIVE ON PROFILE) Tweet - #NEBL…,42,"Jawa Tengah, Indonesia",,Thu Nov 24 23:59:27 +0000 2022
1595930125903990784,Rex00o,Rex00o2,"RT @GFuelEnergy: 💛 𝗟𝗜𝗞𝗘 + 𝗥𝗧 to win a #BanjoKazooie x #GFUEL ""HONEY BERRY"" Tub!!! Picking 2 winners tomorrow bc we just RESTOCKED these bab…",23,,,Thu Nov 24 23:59:27 +0000 2022
1595930126499332099,CJ Tocco,DMAlCo241,"@ThisIsKyleR Working for doubletime and a half is MY tradition. Of course, the sweet paycheck gets gobbled up on Bl… https://t.co/kaDhOvmW5n",23,,,Thu Nov 24 23:59:27 +0000 2022
1595930127816609792,Martin Jones,BackPackJones,If y’all don’t get y’all grandma a tv for Black Friday,616,"Dallas, TX",,Thu Nov 24 23:59:28 +0000 2022


In [0]:
# Get the shape of the DataFrame
num_rows = df.count()
num_cols = len(df.columns)

print("Number of rows:", num_rows)
print("Number of columns:", num_cols)

Number of rows: 113814
Number of columns: 8


### EDA

dropping rows with null values 

In [0]:
df = df.dropna(subset=["tweet"])
display(df.head(10))

id,name,username,tweet,followers_count,location,geo,created_at
1595930122003320832,Alan L. Stewart - unetomaterouge,unetomaterouge,"RT @ProgIntl: BREAKING: Tomorrow, Amazon workers at 18 warehouses are going on strike in France and Germany to #MakeAmazonPay, with many ot…",2394,,,Thu Nov 24 23:59:26 +0000 2022
1595930122124550144,Bismillah BigWin,CityxWin0,RT @ChiefElrond: $50 | 24 Hours 🥏 RT & Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM ON BLA…,262,,,Thu Nov 24 23:59:26 +0000 2022
1595930122682478592,Fuck You I Quit,fuckyouiquit,This is the only Black Friday ad you need to see https://t.co/7Hj3CR4YlS,272510,Corporate Accounts Payable,,Thu Nov 24 23:59:26 +0000 2022
1595930123337105417,kumiii🐻 • freetag,chunvrwin,RT @jihanicorn: $150 | 2.250.000 IDR • 24 Hours 💜 - RT & Follow @CryptoCoinCoach + @NeblioTeam ____________________________ (BE ACTIVE ON…,53,🍀,,Thu Nov 24 23:59:27 +0000 2022
1595930124075294722,D_Adrian,DexaWinWin,RT @HeiraCrypto: $100 ~ Ends in 24 hrs. 🔶 RT - Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM…,45,,,Thu Nov 24 23:59:27 +0000 2022
1595930124305649664,$NEST 🎄$TTC freetag sepuasnya🍀,desnumber1,RT @May7ven: $50 — 24 Hours — ➖RT & Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM ON BLACK…,189,kwangya,,Thu Nov 24 23:59:27 +0000 2022
1595930125413285890,bigwin 🔥 $COOKIES 🍪,0xbigwinasn,RT @Beast_Cryptox: 💰 $70 ~ 24 HOURS 🐄🦥 ➖ RT & Follow @CryptoCoinCoach @NeblioTeam ----------------- (BE ACTIVE ON PROFILE) Tweet - #NEBL…,42,"Jawa Tengah, Indonesia",,Thu Nov 24 23:59:27 +0000 2022
1595930125903990784,Rex00o,Rex00o2,"RT @GFuelEnergy: 💛 𝗟𝗜𝗞𝗘 + 𝗥𝗧 to win a #BanjoKazooie x #GFUEL ""HONEY BERRY"" Tub!!! Picking 2 winners tomorrow bc we just RESTOCKED these bab…",23,,,Thu Nov 24 23:59:27 +0000 2022
1595930126499332099,CJ Tocco,DMAlCo241,"@ThisIsKyleR Working for doubletime and a half is MY tradition. Of course, the sweet paycheck gets gobbled up on Bl… https://t.co/kaDhOvmW5n",23,,,Thu Nov 24 23:59:27 +0000 2022
1595930127816609792,Martin Jones,BackPackJones,If y’all don’t get y’all grandma a tv for Black Friday,616,"Dallas, TX",,Thu Nov 24 23:59:28 +0000 2022


####Create Sentiment column using VADER

In [0]:
# define a function to get sentiment score using VADER
def getSentimentScore(tweetText):
    sia = SentimentIntensityAnalyzer()
    ss = sia.polarity_scores(tweetText)
    return float(ss['compound'])

# define a function to get sentiment
def getSentiment(score):
    return 1 if score >= 0 else 0

In [0]:
udfss=udf(getSentimentScore, FloatType())
df = df.withColumn('sentiment score',udfss('tweet'))

In [0]:
udfSentiment = udf(getSentiment, IntegerType())
df1 = df.withColumn('sentiment', udfSentiment('sentiment score'))

In [0]:
display(df1.head(5))

id,name,username,tweet,followers_count,location,geo,created_at,sentiment score,sentiment
1595930122003320832,Alan L. Stewart - unetomaterouge,unetomaterouge,"RT @ProgIntl: BREAKING: Tomorrow, Amazon workers at 18 warehouses are going on strike in France and Germany to #MakeAmazonPay, with many ot…",2394,,,Thu Nov 24 23:59:26 +0000 2022,0.0516000017523765,1
1595930122124550144,Bismillah BigWin,CityxWin0,RT @ChiefElrond: $50 | 24 Hours 🥏 RT & Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM ON BLA…,262,,,Thu Nov 24 23:59:26 +0000 2022,0.5318999886512756,1
1595930122682478592,Fuck You I Quit,fuckyouiquit,This is the only Black Friday ad you need to see https://t.co/7Hj3CR4YlS,272510,Corporate Accounts Payable,,Thu Nov 24 23:59:26 +0000 2022,0.0,1
1595930123337105417,kumiii🐻 • freetag,chunvrwin,RT @jihanicorn: $150 | 2.250.000 IDR • 24 Hours 💜 - RT & Follow @CryptoCoinCoach + @NeblioTeam ____________________________ (BE ACTIVE ON…,53,🍀,,Thu Nov 24 23:59:27 +0000 2022,0.8240000009536743,1
1595930124075294722,D_Adrian,DexaWinWin,RT @HeiraCrypto: $100 ~ Ends in 24 hrs. 🔶 RT - Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM…,45,,,Thu Nov 24 23:59:27 +0000 2022,0.7034000158309937,1


In [0]:
tweets=df1.select('sentiment','tweet')
display(tweets.head(10))

sentiment,tweet
1,"RT @ProgIntl: BREAKING: Tomorrow, Amazon workers at 18 warehouses are going on strike in France and Germany to #MakeAmazonPay, with many ot…"
1,RT @ChiefElrond: $50 | 24 Hours 🥏 RT & Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM ON BLA…
1,This is the only Black Friday ad you need to see https://t.co/7Hj3CR4YlS
1,RT @jihanicorn: $150 | 2.250.000 IDR • 24 Hours 💜 - RT & Follow @CryptoCoinCoach + @NeblioTeam ____________________________ (BE ACTIVE ON…
1,RT @HeiraCrypto: $100 ~ Ends in 24 hrs. 🔶 RT - Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM…
1,RT @May7ven: $50 — 24 Hours — ➖RT & Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM ON BLACK…
1,RT @Beast_Cryptox: 💰 $70 ~ 24 HOURS 🐄🦥 ➖ RT & Follow @CryptoCoinCoach @NeblioTeam ----------------- (BE ACTIVE ON PROFILE) Tweet - #NEBL…
1,"RT @GFuelEnergy: 💛 𝗟𝗜𝗞𝗘 + 𝗥𝗧 to win a #BanjoKazooie x #GFUEL ""HONEY BERRY"" Tub!!! Picking 2 winners tomorrow bc we just RESTOCKED these bab…"
1,"@ThisIsKyleR Working for doubletime and a half is MY tradition. Of course, the sweet paycheck gets gobbled up on Bl… https://t.co/kaDhOvmW5n"
1,If y’all don’t get y’all grandma a tv for Black Friday


###Text Cleaning Preprocessing

We'll be following these steps: Remove URLs such as http://cnn.com, Remove special characters, Substituting multiple spaces with single space, Lowercase all text, Trim the leading/trailing whitespaces

In [0]:
# clean the tweets as mentioned above
tweets_clean = tweets.withColumn('tweet', F.regexp_replace('tweet', r"http\S+", "")) \
                    .withColumn('tweet', F.regexp_replace('tweet', r"[^a-zA-z]", " ")) \
                    .withColumn('tweet', F.regexp_replace('tweet', r"\s+", " ")) \
                    .withColumn('tweet', F.lower('tweet')) \
                    .withColumn('tweet', F.trim('tweet')) 
display(tweets_clean.head(5))

sentiment,tweet
1,rt progintl breaking tomorrow amazon workers at warehouses are going on strike in france and germany to makeamazonpay with many ot
1,rt chiefelrond hours rt amp follow cryptocoincoach neblioteam be active on profile tweet nebl next gem on bla
1,this is the only black friday ad you need to see
1,rt jihanicorn idr hours rt amp follow cryptocoincoach neblioteam ____________________________ be active on
1,rt heiracrypto ends in hrs rt follow cryptocoincoach neblioteam be active on profile tweet nebl next gem


###Feature Transformer: Tokenizer

In [0]:
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens") 
tweets_tokenized = tokenizer.transform(tweets_clean)

display(tweets_tokenized.head(5))

sentiment,tweet,tokens
1,rt progintl breaking tomorrow amazon workers at warehouses are going on strike in france and germany to makeamazonpay with many ot,"List(rt, progintl, breaking, tomorrow, amazon, workers, at, warehouses, are, going, on, strike, in, france, and, germany, to, makeamazonpay, with, many, ot)"
1,rt chiefelrond hours rt amp follow cryptocoincoach neblioteam be active on profile tweet nebl next gem on bla,"List(rt, chiefelrond, hours, rt, amp, follow, cryptocoincoach, neblioteam, be, active, on, profile, tweet, nebl, next, gem, on, bla)"
1,this is the only black friday ad you need to see,"List(this, is, the, only, black, friday, ad, you, need, to, see)"
1,rt jihanicorn idr hours rt amp follow cryptocoincoach neblioteam ____________________________ be active on,"List(rt, jihanicorn, idr, hours, rt, amp, follow, cryptocoincoach, neblioteam, ____________________________, be, active, on)"
1,rt heiracrypto ends in hrs rt follow cryptocoincoach neblioteam be active on profile tweet nebl next gem,"List(rt, heiracrypto, ends, in, hrs, rt, follow, cryptocoincoach, neblioteam, be, active, on, profile, tweet, nebl, next, gem)"


###Feature Transformer: Stopword Removal

In [0]:
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
tweets_stopword = stopword_remover.transform(tweets_tokenized)

display(tweets_stopword.head(5))

sentiment,tweet,tokens,filtered
1,rt progintl breaking tomorrow amazon workers at warehouses are going on strike in france and germany to makeamazonpay with many ot,"List(rt, progintl, breaking, tomorrow, amazon, workers, at, warehouses, are, going, on, strike, in, france, and, germany, to, makeamazonpay, with, many, ot)","List(rt, progintl, breaking, tomorrow, amazon, workers, warehouses, going, strike, france, germany, makeamazonpay, many, ot)"
1,rt chiefelrond hours rt amp follow cryptocoincoach neblioteam be active on profile tweet nebl next gem on bla,"List(rt, chiefelrond, hours, rt, amp, follow, cryptocoincoach, neblioteam, be, active, on, profile, tweet, nebl, next, gem, on, bla)","List(rt, chiefelrond, hours, rt, amp, follow, cryptocoincoach, neblioteam, active, profile, tweet, nebl, next, gem, bla)"
1,this is the only black friday ad you need to see,"List(this, is, the, only, black, friday, ad, you, need, to, see)","List(black, friday, ad, need, see)"
1,rt jihanicorn idr hours rt amp follow cryptocoincoach neblioteam ____________________________ be active on,"List(rt, jihanicorn, idr, hours, rt, amp, follow, cryptocoincoach, neblioteam, ____________________________, be, active, on)","List(rt, jihanicorn, idr, hours, rt, amp, follow, cryptocoincoach, neblioteam, ____________________________, active)"
1,rt heiracrypto ends in hrs rt follow cryptocoincoach neblioteam be active on profile tweet nebl next gem,"List(rt, heiracrypto, ends, in, hrs, rt, follow, cryptocoincoach, neblioteam, be, active, on, profile, tweet, nebl, next, gem)","List(rt, heiracrypto, ends, hrs, rt, follow, cryptocoincoach, neblioteam, active, profile, tweet, nebl, next, gem)"


###Feature Transformer: CountVectorizer (TF - Term Frequency)

In [0]:
# Apply count vectorizer
cv = CountVectorizer(vocabSize=2**8, inputCol="filtered", outputCol='cv')
cv_model = cv.fit(tweets_stopword)
tweets_cv = cv_model.transform(tweets_stopword)

display(tweets_cv.head(5))

sentiment,tweet,tokens,filtered,cv
1,rt progintl breaking tomorrow amazon workers at warehouses are going on strike in france and germany to makeamazonpay with many ot,"List(rt, progintl, breaking, tomorrow, amazon, workers, at, warehouses, are, going, on, strike, in, france, and, germany, to, makeamazonpay, with, many, ot)","List(rt, progintl, breaking, tomorrow, amazon, workers, warehouses, going, strike, france, germany, makeamazonpay, many, ot)","Map(vectorType -> sparse, length -> 256, indices -> List(0, 20, 41, 57, 252), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))"
1,rt chiefelrond hours rt amp follow cryptocoincoach neblioteam be active on profile tweet nebl next gem on bla,"List(rt, chiefelrond, hours, rt, amp, follow, cryptocoincoach, neblioteam, be, active, on, profile, tweet, nebl, next, gem, on, bla)","List(rt, chiefelrond, hours, rt, amp, follow, cryptocoincoach, neblioteam, active, profile, tweet, nebl, next, gem, bla)","Map(vectorType -> sparse, length -> 256, indices -> List(0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 85, 88), values -> List(2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
1,this is the only black friday ad you need to see,"List(this, is, the, only, black, friday, ad, you, need, to, see)","List(black, friday, ad, need, see)","Map(vectorType -> sparse, length -> 256, indices -> List(1, 2, 129, 150), values -> List(1.0, 1.0, 1.0, 1.0))"
1,rt jihanicorn idr hours rt amp follow cryptocoincoach neblioteam ____________________________ be active on,"List(rt, jihanicorn, idr, hours, rt, amp, follow, cryptocoincoach, neblioteam, ____________________________, be, active, on)","List(rt, jihanicorn, idr, hours, rt, amp, follow, cryptocoincoach, neblioteam, ____________________________, active)","Map(vectorType -> sparse, length -> 256, indices -> List(0, 3, 5, 6, 7, 9, 10, 71, 72, 73), values -> List(2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
1,rt heiracrypto ends in hrs rt follow cryptocoincoach neblioteam be active on profile tweet nebl next gem,"List(rt, heiracrypto, ends, in, hrs, rt, follow, cryptocoincoach, neblioteam, be, active, on, profile, tweet, nebl, next, gem)","List(rt, heiracrypto, ends, hrs, rt, follow, cryptocoincoach, neblioteam, active, profile, tweet, nebl, next, gem)","Map(vectorType -> sparse, length -> 256, indices -> List(0, 3, 4, 5, 6, 7, 8, 11, 12, 13, 36, 45, 48), values -> List(2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"


###Feature Transformer: TF-IDF Vectorization

In [0]:
# TF-IDF Vectorization
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5) 
idf_model = idf.fit(tweets_cv)
tweets_idf = idf_model.transform(tweets_cv)

display(tweets_idf.head(5)) # The dataframe is now ready for the following machine learning stage.

sentiment,tweet,tokens,filtered,cv,features
1,rt progintl breaking tomorrow amazon workers at warehouses are going on strike in france and germany to makeamazonpay with many ot,"List(rt, progintl, breaking, tomorrow, amazon, workers, at, warehouses, are, going, on, strike, in, france, and, germany, to, makeamazonpay, with, many, ot)","List(rt, progintl, breaking, tomorrow, amazon, workers, warehouses, going, strike, france, germany, makeamazonpay, many, ot)","Map(vectorType -> sparse, length -> 256, indices -> List(0, 20, 41, 57, 252), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 256, indices -> List(0, 20, 41, 57, 252), values -> List(0.3676543272010515, 3.1175924090018903, 3.5883125702634744, 3.9732819270253423, 5.352113566943643))"
1,rt chiefelrond hours rt amp follow cryptocoincoach neblioteam be active on profile tweet nebl next gem on bla,"List(rt, chiefelrond, hours, rt, amp, follow, cryptocoincoach, neblioteam, be, active, on, profile, tweet, nebl, next, gem, on, bla)","List(rt, chiefelrond, hours, rt, amp, follow, cryptocoincoach, neblioteam, active, profile, tweet, nebl, next, gem, bla)","Map(vectorType -> sparse, length -> 256, indices -> List(0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 85, 88), values -> List(2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 256, indices -> List(0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 85, 88), values -> List(0.735308654402103, 1.3188603556373257, 1.3732660863895567, 1.5056109771931794, 1.5058888597300504, 1.5203253570295348, 1.540835557564483, 1.5713939547037072, 1.5846222252600537, 1.6124985947883086, 1.6561496019170256, 1.6958706016840448, 4.103608188700917, 4.185831048958749))"
1,this is the only black friday ad you need to see,"List(this, is, the, only, black, friday, ad, you, need, to, see)","List(black, friday, ad, need, see)","Map(vectorType -> sparse, length -> 256, indices -> List(1, 2, 129, 150), values -> List(1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 256, indices -> List(1, 2, 129, 150), values -> List(0.4434163164512347, 0.46447935985153693, 4.599435736889472, 4.7272293066122515))"
1,rt jihanicorn idr hours rt amp follow cryptocoincoach neblioteam ____________________________ be active on,"List(rt, jihanicorn, idr, hours, rt, amp, follow, cryptocoincoach, neblioteam, ____________________________, be, active, on)","List(rt, jihanicorn, idr, hours, rt, amp, follow, cryptocoincoach, neblioteam, ____________________________, active)","Map(vectorType -> sparse, length -> 256, indices -> List(0, 3, 5, 6, 7, 9, 10, 71, 72, 73), values -> List(2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 256, indices -> List(0, 3, 5, 6, 7, 9, 10, 71, 72, 73), values -> List(0.735308654402103, 1.3188603556373257, 1.5056109771931794, 1.5058888597300504, 1.5203253570295348, 1.5713939547037072, 1.5846222252600537, 3.987426434411507, 3.9979277242321705, 3.9984076858444557))"
1,rt heiracrypto ends in hrs rt follow cryptocoincoach neblioteam be active on profile tweet nebl next gem,"List(rt, heiracrypto, ends, in, hrs, rt, follow, cryptocoincoach, neblioteam, be, active, on, profile, tweet, nebl, next, gem)","List(rt, heiracrypto, ends, hrs, rt, follow, cryptocoincoach, neblioteam, active, profile, tweet, nebl, next, gem)","Map(vectorType -> sparse, length -> 256, indices -> List(0, 3, 4, 5, 6, 7, 8, 11, 12, 13, 36, 45, 48), values -> List(2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 256, indices -> List(0, 3, 4, 5, 6, 7, 8, 11, 12, 13, 36, 45, 48), values -> List(0.735308654402103, 1.3188603556373257, 1.3732660863895567, 1.5056109771931794, 1.5058888597300504, 1.5203253570295348, 1.540835557564483, 1.6124985947883086, 1.6561496019170256, 1.6958706016840448, 3.449340446201889, 3.61609213437055, 3.6466897990038367))"


In [0]:
# rename column 'sentiment' to 'label'
tweets_idf = tweets_idf.withColumnRenamed("sentiment", "label")

#### Model Training: Logistic Regression

In [0]:
# split the data into traning and test sets
train_data, test_data = tweets_idf.randomSplit([0.7, 0.3], seed=1234)

lr = LogisticRegression(maxIter=10)

lr_model = lr.fit(train_data)

predictions = lr_model.transform(test_data)

display(predictions.head(5))

label,tweet,tokens,filtered,cv,features,rawPrediction,probability,prediction
0,a distraction from shopping in the black friday sales an epic thread combines gynaecological anatomy with vague,"List(a, distraction, from, shopping, in, the, black, friday, sales, an, epic, thread, combines, gynaecological, anatomy, with, vague)","List(distraction, shopping, black, friday, sales, epic, thread, combines, gynaecological, anatomy, vague)","Map(vectorType -> sparse, length -> 256, indices -> List(1, 2, 32, 64, 97, 246), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 256, indices -> List(1, 2, 32, 64, 97, 246), values -> List(0.4434163164512347, 0.46447935985153693, 3.4413326718080586, 3.8492760959306898, 4.286249796705796, 5.313822653950329))","Map(vectorType -> dense, length -> 2, values -> List(-2.287427416704114, 2.287427416704114))","Map(vectorType -> dense, length -> 2, values -> List(0.09216958344256845, 0.9078304165574316))",1.0
0,a dizzy nendroid man,"List(a, dizzy, nendroid, man)","List(dizzy, nendroid, man)","Map(vectorType -> sparse, length -> 256, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 256, indices -> List(), values -> List())","Map(vectorType -> dense, length -> 2, values -> List(-1.4900190958821828, 1.4900190958821828))","Map(vectorType -> dense, length -> 2, values -> List(0.18391886124942677, 0.8160811387505732))",1.0
0,adorepixsxoxo give me my black friday shopping allowance im going crazy this year findom findomme fins,"List(adorepixsxoxo, give, me, my, black, friday, shopping, allowance, im, going, crazy, this, year, findom, findomme, fins)","List(adorepixsxoxo, give, black, friday, shopping, allowance, im, going, crazy, year, findom, findomme, fins)","Map(vectorType -> sparse, length -> 256, indices -> List(1, 2, 41, 64, 78, 108, 114), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 256, indices -> List(1, 2, 41, 64, 78, 108, 114), values -> List(0.4434163164512347, 0.46447935985153693, 3.5883125702634744, 3.8492760959306898, 4.052661621082812, 4.381559976510121, 4.4924128559159735))","Map(vectorType -> dense, length -> 2, values -> List(0.9977199585861443, -0.9977199585861443))","Map(vectorType -> dense, length -> 2, values -> List(0.7306100591836439, 0.26938994081635614))",0.0
0,amazon workers across the world plan black friday strike to demand the tech giant pays fairly and ceases awful un,"List(amazon, workers, across, the, world, plan, black, friday, strike, to, demand, the, tech, giant, pays, fairly, and, ceases, awful, un)","List(amazon, workers, across, world, plan, black, friday, strike, demand, tech, giant, pays, fairly, ceases, awful, un)","Map(vectorType -> sparse, length -> 256, indices -> List(1, 2, 57), values -> List(1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 256, indices -> List(1, 2, 57), values -> List(0.4434163164512347, 0.46447935985153693, 3.9732819270253423))","Map(vectorType -> dense, length -> 2, values -> List(-0.9016749126855091, 0.9016749126855091))","Map(vectorType -> dense, length -> 2, values -> List(0.2887064239524982, 0.7112935760475019))",1.0
0,another bullshit from wbd firedavidzaslav,"List(another, bullshit, from, wbd, firedavidzaslav)","List(another, bullshit, wbd, firedavidzaslav)","Map(vectorType -> sparse, length -> 256, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 256, indices -> List(), values -> List())","Map(vectorType -> dense, length -> 2, values -> List(-1.4900190958821828, 1.4900190958821828))","Map(vectorType -> dense, length -> 2, values -> List(0.18391886124942677, 0.8160811387505732))",1.0


In [0]:
display(tweets.head(10))

sentiment,tweet
1,"RT @ProgIntl: BREAKING: Tomorrow, Amazon workers at 18 warehouses are going on strike in France and Germany to #MakeAmazonPay, with many ot…"
1,RT @ChiefElrond: $50 | 24 Hours 🥏 RT & Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM ON BLA…
1,This is the only Black Friday ad you need to see https://t.co/7Hj3CR4YlS
1,RT @jihanicorn: $150 | 2.250.000 IDR • 24 Hours 💜 - RT & Follow @CryptoCoinCoach + @NeblioTeam ____________________________ (BE ACTIVE ON…
1,RT @HeiraCrypto: $100 ~ Ends in 24 hrs. 🔶 RT - Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM…
1,RT @May7ven: $50 — 24 Hours — ➖RT & Follow: @CryptoCoinCoach + @NeblioTeam (BE ACTIVE ON PROFILE) Tweet #NEBL NEXT GEM ON BLACK…
1,RT @Beast_Cryptox: 💰 $70 ~ 24 HOURS 🐄🦥 ➖ RT & Follow @CryptoCoinCoach @NeblioTeam ----------------- (BE ACTIVE ON PROFILE) Tweet - #NEBL…
1,"RT @GFuelEnergy: 💛 𝗟𝗜𝗞𝗘 + 𝗥𝗧 to win a #BanjoKazooie x #GFUEL ""HONEY BERRY"" Tub!!! Picking 2 winners tomorrow bc we just RESTOCKED these bab…"
1,"@ThisIsKyleR Working for doubletime and a half is MY tradition. Of course, the sweet paycheck gets gobbled up on Bl… https://t.co/kaDhOvmW5n"
1,If y’all don’t get y’all grandma a tv for Black Friday


####Model Evaluation

In [0]:

# Evaluate the model using binary classification evaluation 
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction") 
roc_auc = evaluator.evaluate(predictions)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(predictions.count())

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.9250
ROC-AUC: 0.9007


With our logistic regression model, we have an accuracy score of 0.9250, and a ROC-AUC score of 0.9007.

### Save the data and the predictions into my bucket

In [0]:
mount_s3_bucket(ACCESS_KEY, SECRET_ACCESS_KEY, "b18-arunabho", "my_bucket")


Mounting b18-arunabho
/mnt/my_bucket has been unmounted.
The bucket b18-arunabho was mounted to my_bucket 



In [0]:
from pyspark.sql.types import ArrayType
from pyspark.ml.linalg import VectorUDT

# Function to identify columns that are Vectors or Arrays
def get_non_vector_cols(df):
    return [f.name for f in df.schema.fields if not isinstance(f.dataType, (ArrayType, VectorUDT))]

clean_predictions_columns = get_non_vector_cols(predictions)


# Save predictions
predictions.select(clean_predictions_columns).write \
    .option('header', 'false') \
    .option('delimiter', '\t') \
    .csv("dbfs:/mnt/my_bucket/main_tweets/")

In [0]:
df1.select(clean_tweets_columns).write .option('header', 'false') \
    .option('delimiter', '\t') \
    .csv("dbfs:/mnt/my_bucket/main_tweets/")

In [0]:
predictions.write.parquet('/mnt/my_bucket/BFpredictions/blackfridaypredictions.parquet')

In [0]:
print(clean_predictions_columns)

['label', 'tweet', 'prediction']
