## Install Libraries to be used
- !pip install -U pip setuptools wheel
- !pip install -U spacy
- !python -m spacy download en_core_web_sm
- !pip install emot --upgrade
- !pip install emoji --upgrade

## Importing Libraries that will be used

In [23]:
import time
from datetime import datetime
import json
import random 
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import re
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import emoji
import spacy
from os import walk
import timeit


# Load the fields list
import fields
fieldsFilter = fields.fields

# Declare Spacy NLP Module
# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")



Creat a small sample data

In [None]:
no_samples = "200"
list_tweets = None

with open("./tweets/20200916.json", "r") as myfile:
    list_tweets = list(myfile)

if int(no_samples) > len(list_tweets):
    no_samples = len(list_tweets)

sample = random.sample(list_tweets, int(no_samples))

file = open("sample_data.json", "w")
for i in sample:
  file.write(i)
file.close() #This close() is important

Processing functions

In [None]:
no_top_unique_words = "50" #@param {type:"string"}

result = Counter(" ".join(tweet_df['text'].values.tolist()).split(" ")).items()
df2 = pd.DataFrame(result)
df2.columns =['Word', 'Frequency']
df2 = df2[df2.Word != ""] #Deletes the empty spaces counted
df2 = df2.sort_values(['Frequency'], ascending=[False]) #Sort dataframe by frequency (Descending)

print('\033[1mTop '+no_top_unique_words+' most unique words used from the dataset\033[0m \n')
print(df2.head(int(no_top_unique_words)).to_string(index=False)) #Prints the top N unique words used
print("\n")
df3 = df2.head(int(no_top_unique_words))
df3.plot(y='Frequency', kind='pie', labels=df3['Word'], figsize=(9, 9), autopct='%1.1f%%', title='Top '+no_top_unique_words+' most unique words used from the dataset')

In [None]:
complete_tweets = []
with open('./tweets/sample_data.json', 'r') as f:
    for line in f:
        complete_tweets.append(json.loads(line))   

In [None]:
datfile = 's1_sa_dsaf'
fname = str(datfile).split('_')[1]
fname

In [11]:
# CLEANING  FUNCTIONS

def remove_emoticons(text):
    emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')
    return emoticon_pattern.sub(r'', text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_urls(text):
    result = re.sub(r"http\S+", "", text)
    return(result)

def remove_twitter_urls(text):
    clean = re.sub(r"pic.twitter\S+", "",text)
    return(clean)

def give_emoji_free_text(text):
    return emoji.get_emoji_regexp().sub(r'', text)

def remove_user_mentions(text):
    clean_tweet = re.sub("@[A-Za-z0-9_]+","", text)
    clean_tweet = re.sub("#[A-Za-z0-9_]+","", clean_tweet)
    return clean_tweet

def change_to_lowercase(text):
    return text.lower()    

# def remove_connecting_words(text):    
#     clean_tweet = re.sub('\s+(a|an|and|the|is|from|as|our|it|in|the|i|by|at|to|of|or|he|on|be|this|up|so|are|has|if|was|for|we)(\s+)', '\2', text)
#     return clean_tweet

def remove_returnchar(text):
    clean_tweet = text.replace('\r','')
    return clean_tweet

def remove_newlines(text):
    clean_tweet = text.replace('\n','')
    return clean_tweet



In [3]:
def extractTweetText(datfile):
    
    #Initialize Temporary Array Storage
    tempStorage = []

    #Parse Json to a temporary storage
    with open('./tweets/'+datfile, 'r') as f:
        for line in f:
            tempStorage.append(json.loads(line))

    # Convert Array to a Pandas Data Frame
    tempFrame = pd.DataFrame(tempStorage)
    
    # Drop all columns except for text column
    tempFrame = tempFrame[['text']]
    
    #Get only the file name and remove the extension
    fname = str(datfile).split('.')[0]
    
    #Save dataframe to csv
    tempFrame.to_csv(r'./step_1/s1_'+str(fname)+'.csv')

In [4]:
# Clean and process tweet text by removing URL, EMOJI and User Mentions
def removeTwitterMeta(datfile):    
    tweet_df = pd.read_csv('./step_1/'+str(datfile),index_col=False) 
    #Replace the spaces and enters
    tweet_df = tweet_df.loc[:, tweet_df.columns.isin(fieldsFilter)]
    tweet_df['text'] = tweet_df['text'].apply(lambda x : remove_newlines(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : remove_returnchar(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : remove_urls(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : remove_twitter_urls(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : remove_emoticons(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : remove_emoji(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : give_emoji_free_text(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : remove_user_mentions(x))
    tweet_df['text'] = tweet_df['text'].apply(lambda x : change_to_lowercase(x))

    #Get only the file name and remove the extension
    fname = str(datfile).split('_')[1]
    fname = str(fname).split('.')[0]
    tweet_df.to_csv(r'./step_2/s2_'+str(fname)+'.csv')


In [25]:
# Tokenize the text in the dataframe

def tokenizeText(data):
    tempStorage = []
    doc = nlp(data)
    for token in doc:
        tempStorage.append({token.text,token.pos_})
    return tempStorage

In [13]:
# Step 0
# Specify Raw Tweet Directory
tweetDir = './tweets'
_, _, filenames = next(walk(tweetDir))


for file in filenames:
    extractTweetText(file)

In [15]:
# Step 1
#Extract Text of Tweets and Save to CSV file

tweetDirS1 = './step_1'
_, _, filenames_s1 = next(walk(tweetDirS1))

for file in filenames_s1:
    removeTwitterMeta(file)

In [26]:
# Step 2
# Get The Files inside the step_2 folder for futher processing
tweetDirS2 = './step_2'
_, _, filenames_s2 = next(walk(tweetDirS2))
filenames_s2



['s2_sample.csv']

In [16]:



mydata = pd.read_csv('./step_2/s2_sample.csv',index_col=False)

In [19]:
mydata

        
tokenizeText(mydata.loc[0].text)
mydata['ttext'] = mydata['text'].apply(lambda x : tokenizeText(x))



In [None]:
doc = nlp(mydata.loc[0].text)

for token in doc:
    print(token.text, token.pos_)

In [20]:
mydata

Unnamed: 0.1,Unnamed: 0,text,ttext
0,0,so all you that say trump never downplayed cov...,"[{ADV, so}, {all, DET}, {PRON, you}, {that, DE..."
1,1,coronavirus: yorkshire quartet out of t20 blas...,"[{NOUN, coronavirus}, {PUNCT, :}, {NOUN, yorks..."
2,2,cdc: almost all of the us kids and teens who'v...,"[{NOUN, cdc}, {PUNCT, :}, {ADV, almost}, {all,..."
3,3,fellow south africans must one day tell ukuthi...,"[{ADJ, fellow}, {ADJ, south}, {africans, NOUN}..."
4,4,"ahmedabad adds 165 covid-19 cases, taking tall...","[{NOUN, ahmedabad}, {adds, VERB}, {165, NUM}, ..."
...,...,...,...
195,195,but the riverton fancy car gathering and the c...,"[{but, CCONJ}, {the, DET}, {ADJ, riverton}, {A..."
196,196,the majority of children who die from covid-19...,"[{the, DET}, {NOUN, majority}, {of, ADP}, {chi..."
197,197,oh god.,"[{INTJ, oh}, {god, INTJ}, {PUNCT, .}]"
198,198,one of the challenges the us faces during the...,"[{ , SPACE}, {NUM, one}, {of, ADP}, {the, DET}..."


In [21]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

for stop_word in list(spacy_stopwords)[:10]:
    print(stop_word)

if
beyond
with
afterwards
such
along
just
now
back
hers


Run for the entire dataset

Tokenize the Dataset

Train the Classifier Model

Run with the sample Dataset using Classifier Model