## Install Libraries to be used
- !pip install -U pip setuptools wheel
- !pip install -U spacy
- !python -m spacy download en_core_web_sm
- !pip install emot --upgrade
- !pip install emoji --upgrade
- !pip install spacymoji
- !pip install spacytextblob

## Importing Libraries that will be used

In [None]:
# General Dependencies
import ast
import json
import random 
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import re
import os

# Load the fields list
import fields
fieldsFilter = fields.fields

# Wordcloud Dependencies
from PIL import Image
from wordcloud import WordCloud, STOPWORDS

# Emoji Dependencies
import unicodedata
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import emoji



In [None]:
# Declare Spacy NLP Module
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

# Load the small English spaCy pipeline ("en_core_web_sm", downloaded in the
# install notes at the top of this file).
nlp = spacy.load("en_core_web_sm")
# Append the 'spacytextblob' component by name. The sentiment helpers later in
# this file read doc._.polarity / doc._.subjectivity / doc._.assessments.
# NOTE(review): presumably the SpacyTextBlob import above is what registers this
# factory name and those attributes -- confirm against the spacytextblob docs.
nlp.add_pipe('spacytextblob')

Create a small sample dataset

In [None]:
def createDataSample(datafile = "",sample_count = 0):
    '''
        Creates a sample dataset from a given json file.

        Reads every line of `datafile`, draws `sample_count` distinct lines at
        random and writes them to "sample_<count>.json" in the current working
        directory. When more samples are requested than the file has lines,
        every line is used and <count> in the file name is the capped value.

        @param {'type':string} datafile - file path of the twitter json file (one record per line)
        @param {'type':string|int} sample_count - Number of samples to create
        @returns None - a confirmation message is printed
        @raises ValueError - if sample_count is negative or cannot be converted to int
    '''

    with open(datafile, "r") as source:
        list_tweets = source.readlines()

    # Cap the request at the number of available lines: random.sample raises
    # ValueError when asked for more items than the population holds.
    sample_count = min(int(sample_count), len(list_tweets))

    sample = random.sample(list_tweets, sample_count)

    # The context manager guarantees the file is flushed and closed even if a
    # write fails part-way through.
    with open("sample_" + str(sample_count) + ".json", "w") as out_file:
        out_file.writelines(sample)

    print("Sample File Created.")
    

Processing functions

In [None]:
# Number of top words to report (form field, kept as a string).
no_top_unique_words = "50" #@param {type:"string"}

# Count every space-separated token across all tweet texts.
all_text = " ".join(tweet_df['text'].values.tolist())
result = Counter(all_text.split(" ")).items()

# Build a Word/Frequency table, drop the empty token produced by consecutive
# spaces, and order it from most to least frequent.
df2 = pd.DataFrame(result)
df2.columns = ['Word', 'Frequency']
df2 = df2[df2['Word'] != ""]
df2 = df2.sort_values('Frequency', ascending=False)

# Keep the N most frequent rows for the printed table and the pie chart.
top_n = int(no_top_unique_words)
df3 = df2.head(top_n)
chart_title = 'Top '+no_top_unique_words+' most unique words used from the dataset'

# The ANSI escape codes render the heading in bold on terminals that support them.
print('\033[1m'+chart_title+'\033[0m \n')
print(df3.to_string(index=False))
print("\n")
df3.plot(y='Frequency', kind='pie', labels=df3['Word'], figsize=(9, 9), autopct='%1.1f%%', title=chart_title)

In [None]:
# CLEANING  FUNCTIONS

def remove_emoticons(text):
    """Strip every emoticon listed in EMOTICONS from `text`.

    The items yielded by iterating EMOTICONS are joined with "|" inside one
    group and used as a regular expression, so each item is interpreted as a
    regex pattern rather than as literal text.
    """
    alternatives = u'|'.join(EMOTICONS)
    return re.sub(u'(' + alternatives + u')', r'', text)

def remove_emoji(text):
    """Delete every run of characters that falls inside the listed code-point ranges.

    NOTE(review): the last range (U+24C2 .. U+1F251) is very wide and also
    covers many non-emoji characters, e.g. CJK ideographs -- confirm that this
    is acceptable for the data being cleaned.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    return re.sub("[" + code_point_ranges + "]+", r'', text, flags=re.UNICODE)

def remove_urls(text):
    """Drop every token that starts with "http" (up to the next whitespace) from `text`."""
    return re.sub(r"http\S+", "", text)

def remove_twitter_urls(text):
    """Remove "pic.twitter..." media links (up to the next whitespace) from `text`.

    Returns the cleaned string; text without such a link is returned unchanged.
    """
    # The dot is escaped so it matches only a literal ".". Unescaped, it matched
    # any character and e.g. turned "epic twitterstorm" into "e".
    return re.sub(r"pic\.twitter\S+", "", text)

def give_emoji_free_text(text):
    """Return `text` with every emoji known to the `emoji` package removed."""
    # emoji.get_emoji_regexp() is not available in current releases of the
    # emoji package; replace_emoji() is the supported way to strip emoji.
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace='')
    # Fallback for old releases that only provide the regexp helper.
    return emoji.get_emoji_regexp().sub(r'', text)

def remove_user_mentions(text):
    """Strip @mentions and, despite the function name, #hashtags from `text`."""
    # Mentions are removed first, then hashtags, each with its own pattern.
    for marker_pattern in ("@[A-Za-z0-9_]+", "#[A-Za-z0-9_]+"):
        text = re.sub(marker_pattern, "", text)
    return text

def change_to_lowercase(text):
    """Return `text` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_returnchar(text):
    """Return `text` with every carriage-return character ('\\r') deleted."""
    return text.replace('\r', '')

def remove_whitespaces(text):
    """Delete line breaks and tabs (\\r\\n, \\r, \\n, \\t) from `text`.

    The matched characters are removed, not replaced by a space, so words that
    were separated only by a line break or tab end up joined.
    """
    # str.replace() treats its argument as literal text, so the regular
    # expression was never applied; re.sub() interprets it as a pattern.
    return re.sub(r'(\r\n)+|\r+|\n+|\t+', '', text)



In [None]:
def extractTweetText(datfile):
    """Extract the 'text' field of every tweet in ./tweets/<datfile> into a CSV.

    Each line of the input file is parsed as one JSON document. Only the
    'text' column is kept, and the result is written to
    ./step_1/s1_<name>.csv, where <name> is the part of `datfile` before its
    first ".".
    """
    # One JSON document per line.
    with open('./tweets/'+datfile, 'r') as handle:
        records = [json.loads(row) for row in handle]

    # Build the frame and discard every column except the tweet text.
    text_only = pd.DataFrame(records)[['text']]

    # File name without its extension (everything before the first dot).
    stem = str(datfile).split('.')[0]

    text_only.to_csv(r'./step_1/s1_'+str(stem)+'.csv')

In [None]:
# Clean the tweet text: strip URLs, emoticons, user mentions and hashtags,
# turn emoji into words and lower-case the result
def removeTwitterMeta(datfile):
    """Clean the 'text' column of ./step_1/<datfile> and save it under ./step_2.

    The output file is ./step_2/s2_<name>.csv, where <name> is the part of
    `datfile` between its first "_" and the following ".".
    """
    frame = pd.read_csv('./step_1/'+str(datfile), index_col=False)

    # Keep only the columns whose names appear in fieldsFilter.
    frame = frame.loc[:, frame.columns.isin(fieldsFilter)]

    # Cleaning steps, applied to the text column one after another in this
    # order. remove_emoji and give_emoji_free_text are not part of the
    # pipeline; emoji are converted to words by emoji_to_word instead.
    cleaning_steps = (
        remove_whitespaces,
        remove_urls,
        remove_twitter_urls,
        emoji_to_word,
        remove_emoticons,
        remove_user_mentions,
        change_to_lowercase,
    )
    for step in cleaning_steps:
        frame['text'] = frame['text'].apply(step)

    # Get only the file name: drop the step prefix and the extension.
    stem = str(str(datfile).split('_')[1]).split('.')[0]
    frame.to_csv(r'./step_2/s2_'+str(stem)+'.csv')


In [None]:
# Tokenize one text value with the spaCy pipeline
def tokenizeText(data):
    """Return the text of every token that `nlp` produces for `data`, in order."""
    return [token.text for token in nlp(data)]

In [None]:
# Scratch check: run emoji_to_name on a sample tweet that ends with an emoji.
# emoji_to_name is defined in a later cell of this file, so that cell has to be
# executed before this one.
test = 'so all you that say trump never downplayed covid19, your really saying woodward lied, somehow got trumps voice, or Python is 👍'

emoji_to_name(test)

In [None]:
def defineVocabulary(data):
    """Return a (token text, part-of-speech tag) pair for every token in `data`.

    The pairs are tuples in token order. A set literal was used before, which
    does not keep the text/tag order and collapses to a single element when
    the token text equals its tag, so the word could not be told apart from
    its tag afterwards.
    """
    return [(token.text, token.pos_) for token in nlp(data)]

In [None]:
def generateWordcloudData(payload):
    """
      Render a word cloud from the 'text' column of a dataframe.

      This function accepts a pandas dataframe from any step of the processing with the named text column.
      payload - pandas dataframe with string text column.

      Returns the result of plt.show().
      Raises KeyError when payload has no 'text' column.
    """
    # Join all rows into one space-separated string. Missing values are
    # skipped and the remaining values are converted to str so that join()
    # cannot fail on non-string cells.
    combinedText = " ".join(payload['text'].dropna().astype(str))

    wordcloud = WordCloud(
                            max_font_size = 140,
                            width = 2000,
                            height = 1000,
                            background_color = "white",
                            contour_width = 1,
                            stopwords = STOPWORDS)
    wordcloud.generate(combinedText)

    # Draw the cloud without axes. No second figure is opened and no separate
    # image object is shown: only the matplotlib figure is displayed.
    plt.imshow(wordcloud)
    plt.axis("off")

    return plt.show()



In [None]:
def removeStopWords(data):
    """Return the tokens in `data` that are not spaCy English stop words.

    data - string holding a Python list literal of tokens (as stored in the
           CSV); it is parsed with ast.literal_eval.
    Returns a new list with the remaining tokens in their original order.
    """
    tokens = ast.literal_eval(data)
    # Load stop words
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    # Build a new list instead of calling remove() while looping: removing
    # during iteration shifts the remaining items and skipped the element
    # right after every removed stop word.
    return [token for token in tokens if token not in spacy_stopwords]

In [None]:
# @param data {type:"string"} text data
# p - polarity
def definePolarization(data):
    """Run `data` through the `nlp` pipeline and return the doc's `_.polarity` value."""
    return nlp(data)._.polarity

# @param data {type:"string"} text data
# s - subjectivity
def defineSubjectivity(data):
    """Run `data` through the `nlp` pipeline and return the doc's `_.subjectivity` value."""
    return nlp(data)._.subjectivity

# @param data {type:"string"} text data
# a - assessments
def defineAssessments(data):
    """Run `data` through the `nlp` pipeline and return the doc's `_.assessments` value."""
    return nlp(data)._.assessments

In [None]:
def emoji_to_word(text):
    """Pass `text` through emoji.demojize with empty start and end delimiters."""
    no_delimiters = ("", "")
    return emoji.demojize(text, delimiters=no_delimiters)

In [None]:
# Create a sample dataset to process: request 5000 randomly chosen lines from
# the given tweet file (createDataSample writes them to a "sample_<count>.json" file).

createDataSample(datafile =  './complete/20200730.json', sample_count = 5000)

In [None]:
# Step 0
# Extract the tweet text of every raw file in ./tweets (results go to ./step_1)
tweetDir = './tweets'
# os.walk yields (dirpath, dirnames, filenames) tuples; the first one describes
# tweetDir itself, so this lists only the files directly inside it.
# The bare name `walk` is not imported anywhere in this file; `os` is.
_, _, filenames = next(os.walk(tweetDir))

for file in filenames:
    extractTweetText(file)

In [None]:
# Step 1
# Clean the extracted tweet text of every CSV in ./step_1 (results go to ./step_2)

tweetDirS1 = './step_1'
# First os.walk entry = the directory itself; keep only its file names.
# The bare name `walk` is not imported anywhere in this file; `os` is.
_, _, filenames_s1 = next(os.walk(tweetDirS1))

for file in filenames_s1:
    removeTwitterMeta(file)

In [None]:
# Step 2
# Tokenize the cleaned text of every CSV in ./step_2 (results go to ./step_3)
tweetDirS2 = './step_2'
# First os.walk entry = the directory itself; keep only its file names.
# The bare name `walk` is not imported anywhere in this file; `os` is.
_, _, filenames_s2 = next(os.walk(tweetDirS2))

for file in filenames_s2:
    mydata = pd.read_csv('./step_2/'+file,index_col=False)
    # Drop the unnamed index column that the previous step's to_csv() wrote.
    del mydata['Unnamed: 0']
    mydata['ttext'] = mydata['text'].apply(lambda x : tokenizeText(x))

    #Get only the file name and remove the step prefix and the extension
    fname = str(file).split('_')[1]
    fname = str(fname).split('.')[0]
    mydata.to_csv(r'./step_3/s3_'+str(fname)+'.csv')


In [None]:
# Step 3
# Remove the stop words from the token lists of every CSV in ./step_3
# (results go to ./step_4)
tweetDirS3 = './step_3'
# First os.walk entry = the directory itself; keep only its file names.
# The bare name `walk` is not imported anywhere in this file; `os` is.
_, _, filenames_s3 = next(os.walk(tweetDirS3))

for file in filenames_s3:
    mydata = pd.read_csv('./step_3/'+file,index_col=False)
    # Drop the unnamed index column that the previous step's to_csv() wrote.
    del mydata['Unnamed: 0']
    # 'ttext' holds the token list as text; removeStopWords parses it again.
    mydata['nstop'] = mydata['ttext'].apply(lambda x : removeStopWords(x))

    #Get only the file name and remove the step prefix and the extension
    fname = str(file).split('_')[1]
    fname = str(fname).split('.')[0]
    mydata.to_csv(r'./step_4/s4_'+str(fname)+'.csv')

In [None]:
# Step 4
# Add the per-token vocabulary (defineVocabulary) for every CSV in ./step_4
# (results go to ./step_5)

tweetDirS4 = './step_4'
# First os.walk entry = the directory itself; keep only its file names.
# The bare name `walk` is not imported anywhere in this file; `os` is.
_, _, filenames_s4 = next(os.walk(tweetDirS4))

for file in filenames_s4:
    mydata = pd.read_csv('./step_4/'+file,index_col=False)
    # Drop the unnamed index column that the previous step's to_csv() wrote.
    del mydata['Unnamed: 0']
    mydata['wordDef'] = mydata['text'].apply(lambda x : defineVocabulary(x))

    #Get only the file name and remove the step prefix and the extension
    fname = str(file).split('_')[1]
    fname = str(fname).split('.')[0]
    mydata.to_csv(r'./step_5/s5_'+str(fname)+'.csv')

In [None]:
# Step 5
# Add polarity, subjectivity and assessment columns for every CSV in ./step_5
# (results go to ./step_6)

tweetDirS5 = './step_5'
# First os.walk entry = the directory itself; keep only its file names.
# The bare name `walk` is not imported anywhere in this file; `os` is.
_, _, filenames_s5 = next(os.walk(tweetDirS5))

for file in filenames_s5:
    mydata = pd.read_csv('./step_5/'+file,index_col=False)
    # Drop the unnamed index column that the previous step's to_csv() wrote.
    del mydata['Unnamed: 0']
    mydata['polarization'] = mydata['text'].apply(lambda x : definePolarization(x))
    mydata['subjectivity'] = mydata['text'].apply(lambda x : defineSubjectivity(x))
    mydata['assessment'] = mydata['text'].apply(lambda x : defineAssessments(x))

    #Get only the file name and remove the step prefix and the extension
    fname = str(file).split('_')[1]
    fname = str(fname).split('.')[0]
    mydata.to_csv(r'./step_6/s6_'+str(fname)+'.csv')
# TODO: separate the word score into 3 separate columns
# Display the frame of the last processed file.
mydata

In [None]:
texts = ['some text']
# Placeholder for batch processing with nlp.pipe, with the listed pipeline
# components disabled. A loop body that holds only a comment is a syntax error,
# so an explicit `pass` keeps the cell runnable.
for doc in nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):
    pass  # do something

In [None]:
#Varya
# Update wordcloud
# install wordcloud
# current problem: reads whole text at once

from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
from wordcloud import WordCloud, STOPWORDS

# Load the mask image first: it has to exist before it is handed to WordCloud
# (it was referenced before assignment, which raises NameError).
mask = np.array(Image.open("cov19.png")) #jpg also ok
stopwords = set(STOPWORDS)

wordcloud = WordCloud(
    max_font_size=140,
    width=2000,
    height=1000,
    background_color="white",
    mask=mask,
    contour_width=1,
    stopwords=stopwords)

with open('preprocessed_feb.txt') as file:
    text = file.read() # :( the whole file is read into memory at once

wordcloud.generate(text)

# First figure: the word cloud. Second figure: the mask image it was shaped with.
plt.imshow(wordcloud)#, interpolation='bilinear')
plt.axis("off")
plt.figure()
plt.imshow(mask)#, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis("off")
plt.show()
#write to img
image = wordcloud.to_image()
image.show()
    

In [None]:
#Varya
# convert emoji to names
import unicodedata
def emoji_to_name(text):
    """Replace every character of Unicode category 'So' with its Unicode name.

    Each such symbol is replaced by a space followed by its upper-case Unicode
    name (e.g. "©" becomes " COPYRIGHT SIGN"). The name of every symbol found
    is also printed. Returns the converted string.
    """
    # The loop walks the original string object; rebinding `text` inside the
    # loop does not disturb the iteration.
    for symbol in text:
        if unicodedata.category(symbol) == 'So':
            name = unicodedata.name(symbol) #returns all names uppercase
            print(name)
            # Replace within `text` itself (the name `emo` was never defined).
            text = text.replace(symbol, ' '+name)
    return text