## Install Libraries to be used
- !pip install -U pip setuptools wheel
- !pip install -U spacy
- !python -m spacy download en_core_web_sm
- !pip install emot --upgrade
- !pip install emoji --upgrade
- !pip install spacymoji
- !pip install spacytextblob

## Importing Libraries that will be used

In [22]:
import ast
import json
import random 
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import re
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import emoji
import spacy
from os import walk
from nltk.tokenize import word_tokenize

from spacytextblob.spacytextblob import SpacyTextBlob

# Load the fields list
import fields
fieldsFilter = fields.fields

# Declare Spacy NLP Module
# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")



Creat a small sample data

In [None]:
no_samples = "10000"
list_tweets = None

with open("./complete/20200916.json", "r") as myfile:
    list_tweets = list(myfile)

if int(no_samples) > len(list_tweets):
    no_samples = len(list_tweets)

sample = random.sample(list_tweets, int(no_samples))

file = open("sample.json", "w")
for i in sample:
  file.write(i)
file.close() #This close() is important

Processing functions

In [None]:
no_top_unique_words = "50" #@param {type:"string"}

result = Counter(" ".join(tweet_df['text'].values.tolist()).split(" ")).items()
df2 = pd.DataFrame(result)
df2.columns =['Word', 'Frequency']
df2 = df2[df2.Word != ""] #Deletes the empty spaces counted
df2 = df2.sort_values(['Frequency'], ascending=[False]) #Sort dataframe by frequency (Descending)

print('\033[1mTop '+no_top_unique_words+' most unique words used from the dataset\033[0m \n')
print(df2.head(int(no_top_unique_words)).to_string(index=False)) #Prints the top N unique words used
print("\n")
df3 = df2.head(int(no_top_unique_words))
df3.plot(y='Frequency', kind='pie', labels=df3['Word'], figsize=(9, 9), autopct='%1.1f%%', title='Top '+no_top_unique_words+' most unique words used from the dataset')

In [None]:
complete_tweets = []
with open('./tweets/sample_data.json', 'r') as f:
    for line in f:
        complete_tweets.append(json.loads(line))   

In [None]:
datfile = 's1_sa_dsaf'
fname = str(datfile).split('_')[1]
fname

In [10]:
# CLEANING  FUNCTIONS

def remove_emoticons(text):
    emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')
    return emoticon_pattern.sub(r'', text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_urls(text):
    result = re.sub(r"http\S+", "", text)
    return(result)

def remove_twitter_urls(text):
    clean = re.sub(r"pic.twitter\S+", "",text)
    return(clean)

def give_emoji_free_text(text):
    return emoji.get_emoji_regexp().sub(r'', text)

def remove_user_mentions(text):
    clean_tweet = re.sub("@[A-Za-z0-9_]+","", text)
    clean_tweet = re.sub("#[A-Za-z0-9_]+","", clean_tweet)
    return clean_tweet

def change_to_lowercase(text):
    return text.lower()    

def remove_returnchar(text):
    clean_tweet = text.replace('\r','')
    return clean_tweet

def remove_whitespaces(text):
    clean_tweet = text.replace(r'(\r\n)+|\r+|\n+|\t+','')
    return clean_tweet



In [11]:
def extractTweetText(datfile):
    
    #Initialize Temporary Array Storage
    tempStorage = []

    #Parse Json to a temporary storage
    with open('./tweets/'+datfile, 'r') as f:
        for line in f:
            tempStorage.append(json.loads(line))

    # Convert Array to a Pandas Data Frame
    tempFrame = pd.DataFrame(tempStorage)
    
    # Drop all columns except for text column
    tempFrame = tempFrame[['text']]
    
    #Get only the file name and remove the extension
    fname = str(datfile).split('.')[0]
    
    #Save dataframe to csv
    tempFrame.to_csv(r'./step_1/s1_'+str(fname)+'.csv')

In [12]:
# Clean and process tweet text by removing URL, EMOJI and User Mentions
def removeTwitterMeta(datfile):    
    tweet_df = pd.read_csv('./step_1/'+str(datfile),index_col=False) 
    #Replace the spaces and enters
    tweet_df = tweet_df.loc[:, tweet_df.columns.isin(fieldsFilter)]
    tweet_df['text'] = tweet_df['text'].apply(lambda x : remove_whitespaces(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : remove_urls(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : remove_twitter_urls(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : remove_emoticons(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : remove_emoji(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : give_emoji_free_text(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : remove_user_mentions(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : change_to_lowercase(x))

    #Get only the file name and remove the extension
    fname = str(datfile).split('_')[1]
    fname = str(fname).split('.')[0]
    tweet_df.to_csv(r'./step_2/s2_'+str(fname)+'.csv')


In [13]:
# Tokenize the text in the dataframe
def tokenizeText(data):
    tempStorage = []
    doc = nlp(data)
    for token in doc:
#         tempStorage.append({token.text,token.pos_})
          tempStorage.append(token.text)
    return tempStorage

In [14]:
def defineVocabulary(data):
    tempStorage = []
    doc = nlp(data)
    for token in doc:
        tempStorage.append({token.text,token.pos_})
    return tempStorage

In [15]:
def removeStopWords(data):    
    tempStorage = ast.literal_eval(data)    
    # Load stop words
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    # Loop inside text
    for x in tempStorage:
        if x in spacy_stopwords: tempStorage.remove(x) 
    return tempStorage

In [70]:
# @param data {type:"string"} text data
# p - polarity
def definePolarization(data):
    doc = nlp(data)
    return doc._.polarity

In [69]:
# @param data {type:"string"} text data
# s - subjectivity
def defineSubjectivity(data):
    doc = nlp(data)
    return doc._.subjectivity

In [73]:
# @param data {type:"string"} text data
# a - assessments
def defineAssessments(data):
    doc = nlp(data)
    return doc._.assessments

In [16]:
# Step 0
# Specify Raw Tweet Directory
tweetDir = './tweets'
_, _, filenames = next(walk(tweetDir))

for file in filenames:
    extractTweetText(file)

In [17]:
# Step 1
#Extract Text of Tweets and Save to CSV file

tweetDirS1 = './step_1'
_, _, filenames_s1 = next(walk(tweetDirS1))

for file in filenames_s1:
    removeTwitterMeta(file)

In [18]:
# Step 2
# Get The Files inside the step_2 folder for futher processing
tweetDirS2 = './step_2'
_, _, filenames_s2 = next(walk(tweetDirS2))

for file in filenames_s2:
    mydata = pd.read_csv('./step_2/'+file,index_col=False)
    del mydata['Unnamed: 0'] 
    mydata['ttext'] = mydata['text'].apply(lambda x : tokenizeText(x))
    
    #Get only the file name and remove the extension
    fname = str(file).split('_')[1]
    fname = str(fname).split('.')[0]
    mydata.to_csv(r'./step_3/s3_'+str(fname)+'.csv')


In [19]:
# Step 3
# Get The Files inside the step_3 folder and reduce the connecting words
tweetDirS3 = './step_3'
_, _, filenames_s3 = next(walk(tweetDirS3))

for file in filenames_s3:
    mydata = pd.read_csv('./step_3/'+file,index_col=False)
    del mydata['Unnamed: 0']
    mydata['nstop'] = mydata['ttext'].apply(lambda x : removeStopWords(x))
    
    #Get only the file name and remove the extension
    fname = str(file).split('_')[1]
    fname = str(fname).split('.')[0]
    mydata.to_csv(r'./step_4/s4_'+str(fname)+'.csv')

In [20]:
# Step 4
# Get The Files inside the step_4 folder check

tweetDirS4 = './step_4'
_, _, filenames_s4 = next(walk(tweetDirS4))

for file in filenames_s4:
    mydata = pd.read_csv('./step_4/'+file,index_col=False)
    del mydata['Unnamed: 0'] 
    mydata['wordDef'] = mydata['text'].apply(lambda x : defineVocabulary(x))

    #Get only the file name and remove the extension
    fname = str(file).split('_')[1]
    fname = str(fname).split('.')[0]
    mydata.to_csv(r'./step_5/s5_'+str(fname)+'.csv')

In [71]:
# Step 5
# Get The Files inside the step_5 folder add column for polarity, assessment and subjectivity

tweetDirS5 = './step_5'
_, _, filenames_s5 = next(walk(tweetDirS5))

for file in filenames_s5:
    mydata = pd.read_csv('./step_5/'+file,index_col=False)
    del mydata['Unnamed: 0'] 
    mydata['polarization'] = mydata['text'].apply(lambda x : definePolarization(x))
    mydata['subjectivity'] = mydata['text'].apply(lambda x : defineSubjectivity(x))
    mydata['assessment'] = mydata['text'].apply(lambda x : defineAssessments(x))
    
# Seperate wordscore in 3 seperate columns
mydata

Unnamed: 0,text,ttext,nstop,wordDef,polarization,subjectivity,assessment
0,so all you that say trump never downplayed cov...,"['so', 'all', 'you', 'that', 'say', 'trump', '...","['all', 'that', 'trump', 'downplayed', 'covid1...","[{'so', 'ADV'}, {'DET', 'all'}, {'PRON', 'you'...",0.200000,0.200000,"[([really], 0.2, 0.2, None)]"
1,coronavirus: yorkshire quartet out of t20 blas...,"['coronavirus', ':', 'yorkshire', 'quartet', '...","['coronavirus', ':', 'yorkshire', 'quartet', '...","[{'NOUN', 'coronavirus'}, {'PUNCT', ':'}, {'yo...",0.227273,0.545455,"[([positive], 0.22727272727272727, 0.545454545..."
2,cdc: almost all of the us kids and teens who'v...,"['cdc', ':', 'almost', 'all', 'of', 'the', 'us...","['cdc', ':', 'all', 'the', 'kids', 'teens', ""'...","[{'cdc', 'NOUN'}, {'PUNCT', ':'}, {'ADV', 'alm...",-0.166667,0.433333,"[([black], -0.16666666666666666, 0.43333333333..."
3,fellow south africans must one day tell ukuthi...,"['fellow', 'south', 'africans', 'must', 'one',...","['fellow', 'south', 'africans', 'one', 'day', ...","[{'ADJ', 'fellow'}, {'ADJ', 'south'}, {'africa...",0.000000,0.000000,[]
4,"ahmedabad adds 165 covid-19 cases, taking tall...","['ahmedabad', 'adds', '165', 'covid-19', 'case...","['ahmedabad', 'adds', '165', 'covid-19', 'case...","[{'ahmedabad', 'NOUN'}, {'VERB', 'adds'}, {'NU...",0.500000,0.500000,"[([more], 0.5, 0.5, None)]"
...,...,...,...,...,...,...,...
195,but the riverton fancy car gathering and the c...,"['but', 'the', 'riverton', 'fancy', 'car', 'ga...","['the', 'riverton', 'fancy', 'car', 'gathering...","[{'but', 'CCONJ'}, {'the', 'DET'}, {'ADJ', 'ri...",0.111905,0.289683,"[([parade], -0.25, 0.23333333333333334, None),..."
196,the majority of children who die from covid-19...,"['the', 'majority', 'of', 'children', 'who', '...","['majority', 'children', 'die', 'covid-19', 'c...","[{'the', 'DET'}, {'majority', 'NOUN'}, {'ADP',...",0.000000,0.000000,[]
197,oh god.,"['oh', 'god', '.']","['oh', 'god', '.']","[{'INTJ', 'oh'}, {'INTJ', 'god'}, {'.', 'PUNCT'}]",0.000000,0.000000,[]
198,one of the challenges the us faces during the...,"[' ', 'one', 'of', 'the', 'challenges', 'the',...","[' ', 'of', 'challenges', 'us', 'faces', 'the'...","[{'SPACE', ' '}, {'NUM', 'one'}, {'ADP', 'of'}...",0.000000,0.000000,[]


In [None]:
no_top_unique_words = "50" #@param {type:"string"}

result = Counter(" ".join(tweet_df['text'].values.tolist()).split(" ")).items()
df2 = pd.DataFrame(result)
df2.columns =['Word', 'Frequency']
df2 = df2[df2.Word != ""] #Deletes the empty spaces counted
df2 = df2.sort_values(['Frequency'], ascending=[False]) #Sort dataframe by frequency (Descending)

print('\033[1mTop '+no_top_unique_words+' most unique words used from the dataset\033[0m \n')
print(df2.head(int(no_top_unique_words)).to_string(index=False)) #Prints the top N unique words used
print("\n")
df3 = df2.head(int(no_top_unique_words))
df3.plot(y='Frequency', kind='pie', labels=df3['Word'], figsize=(9, 9), autopct='%1.1f%%', title='Top '+no_top_unique_words+' most unique words used from the dataset')

In [75]:
from spacytextblob.spacytextblob import SpacyTextBlob
# nlp.add_pipe('polarity')
txt = 'coronavirus: yorkshire quartet out of t20 blast group games after positive test'
doc = nlp(txt)
print("Polarity : ", doc._.polarity)
print("Subjectivity : ", doc._.subjectivity)

Polarity :  0.22727272727272727
Subjectivity :  0.5454545454545454


In [None]:
test = mydata.loc[1].text
doc = nlp(test)
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
      token.pos_,[child for child in token.children])




In [None]:
texts = ['some text']
for doc in nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):
  #do something

In [None]:
# Create Word Cloud
from wordcloud import WordCloud, STOPWORDS

wordStorage = []

mydata = pd.read_csv('./step_5/sample.csv',index_col=False)

for x in mydata.ttext:
    

wordcloud = WordCloud(
    max_font_size=200,
    width=2000,
    height=1000,
    background_color="white",
    stopwords=STOPWORDS | {"one"},
).generate(text)

image = wordcloud.to_image()
image.show()
# image.save("wordcloud.png")

Run for the entire dataset

In [124]:
tempCloud = []
wordlist = pd.read_csv('./step_5/s5_sample.csv',index_col=False)
wordlist = wordlist.loc[:, ['text']]
wordlist

Unnamed: 0,text
0,so all you that say trump never downplayed cov...
1,coronavirus: yorkshire quartet out of t20 blas...
2,cdc: almost all of the us kids and teens who'v...
3,fellow south africans must one day tell ukuthi...
4,"ahmedabad adds 165 covid-19 cases, taking tall..."
...,...
195,but the riverton fancy car gathering and the c...
196,the majority of children who die from covid-19...
197,oh god.
198,one of the challenges the us faces during the...


Tokenize the Dataset