## Mount drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

# graph files folder
drive.mount('/content/drive')
data = '/content/drive/My Drive/covid_project/graph_files'
w2vdata = '/content/drive/My Drive/covid_project/word2vec'
print(data)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/covid_project/graph_files


# Import libraries

In [2]:
import json
!pip install jsonlines
import jsonlines
import csv
import os
!pip install tweet-preprocessor
import preprocessor as p
import pandas as pd



# Creating a file .csv with only tweets text

In [3]:
filename = data+'/sampled_tweets.jsonl'
out_f = w2vdata+'/tweets_dataset.csv'

with jsonlines.open(filename, 'r') as json_file:
  with open(out_f, 'w') as out:
    field_names = ['text']
    writer = csv.DictWriter(out, fieldnames=field_names)

    # If retweet take the original clean and complete text 
    for line in json_file:
      if ('retweeted_status' in line):
          x = line['retweeted_status']['full_text']
          writer.writerow({'text':x})
      else:
        x = line['full_text']
        writer.writerow({'text':x})

out.close()

# Preprocessing tweets text

In [4]:
tweets_csv = pd.read_csv(w2vdata+'/tweets_dataset.csv', header = None, names=['text'], delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')

tweets_csv.count()

text    43422
dtype: int64

In [5]:
#Dropping duplicates and NaN from the dataframe
tweets_csv = tweets_csv.dropna()
tweets_csv = tweets_csv.drop_duplicates()

tweets_csv.head()

Unnamed: 0,text
0,"""BREAKING: The University of Liverpool has clo..."
1,I am forever grateful to live in this city. It...
2,"#coronavirus https://t.co/9o570wMHF7"""
3,What in the sam Hell? https://t.co/PO0pBhcWH6
4,This is HARD AS FUCK. https://t.co/B7mFPnNEZf


In [6]:
tweets_csv.count()

text    30673
dtype: int64

In [7]:
def preprocess_tweet(row):
    text = row['text']
    text = p.clean(text)
    return text

In [8]:
#Tweet have to be cleaned to normal text
tweets_csv['text'] = tweets_csv.apply(preprocess_tweet, axis=1)
tweets_csv.head()

Unnamed: 0,text
0,"""BREAKING: The University of Liverpool has clo..."
1,I am forever grateful to live in this city. It...
2,""""
3,What in the sam Hell?
4,This is HARD AS FUCK.


In [9]:
from gensim.parsing.preprocessing import remove_stopwords
def stopword_removal(row):
    text = row['text']
    text = remove_stopwords(text)
    return text

In [10]:
#Removing stop words
tweets_csv['text'] = tweets_csv.apply(stopword_removal, axis=1)

In [11]:
tweets_csv.head()

Unnamed: 0,text
0,"""BREAKING: The University Liverpool closed tod..."
1,I forever grateful live city. It's right thing...
2,""""
3,What sam Hell?
4,This HARD AS FUCK.


In [12]:
#Removing alphanumeric words, one character words and other words we are not considering useful
tweets_csv['text'] = tweets_csv['text'].str.lower().str.replace('[^\w\s]',' ').str.replace('\w*\d\w*',' ').str.replace(r'\b[a-zA-Z]\b|\b[a-zA-Z][a-zA-Z]\b', ' ').str.replace(r'\b_*',' ').str.replace('\s\s+', ' ')

In [13]:
tweets_csv.head()

Unnamed: 0,text
0,breaking the university liverpool closed toda...
1,forever grateful live city right thing scouse...
2,
3,what sam hell
4,this hard fuck


In [14]:
#Saving results on a .csv file
tweets_csv.dropna()
tweets_csv.to_csv(w2vdata+'/tweets_preprocessed.csv', index=False, header=False)

In [15]:
# Delete empty rows
with open(w2vdata+'/tweets_preprocessed.csv', 'r', encoding='utf-8', errors='ignore') as inp, open(w2vdata+'/tweets_rows_deleted.csv', 'w', encoding='utf-8') as out:
    writer = csv.writer(out)
    for row in csv.reader(inp):
      if len(row[0]) > 1:
        writer.writerow(row)
inp.close()
out.close()

# Making Dictionary

In [16]:
from nltk import FreqDist

# dictionary with occurrencies
word_dist = FreqDist()
with open(w2vdata+'/tweets_rows_deleted.csv', 'r', encoding='utf-8') as inp:
  for row in csv.reader(inp):
    word_dist.update(row[0].split())

dic = dict(word_dist)

# print first n elements in dic
n = 10
first_n = {k: dic[k] for k in list(dic)[:n]}
print(first_n)

{'breaking': 241, 'the': 2204, 'university': 58, 'liverpool': 10, 'closed': 190, 'today': 544, 'rest': 88, 'academic': 13, 'year': 202, 'forever': 18}


In [17]:
#Alphabetic order dictionary
dic = {key:val for key, val in dic.items() if val != 1 and val != 2}
dic_items = dic.items()
sorted_dic = sorted(dic_items)
print(f'#words: {len(sorted_dic)}')

# print first n elements in orderd dic
n = 10
print(sorted_dic[:n])

#words: 8341
[('abandon', 5), ('abandoned', 9), ('abandoning', 4), ('abbott', 4), ('abc', 5), ('ability', 37), ('able', 101), ('ably', 3), ('aboard', 10), ('abortion', 20)]


In [18]:
#Saving dictionary
with open(w2vdata+'/dictionary.csv', 'w') as csv_file:  
    writer = csv.writer(csv_file)
    for elem in sorted_dic:
        writer.writerow([elem[0],elem[1]])
csv_file.close()

### Most recurrent words

In [19]:
bst = []

values = sorted(dic.values())

for elem in range(1,20):
  i = - elem
  bst.append(values[i])
print(bst)

[2261, 2204, 1917, 1562, 1290, 1061, 932, 920, 825, 775, 759, 752, 741, 698, 691, 657, 653, 643, 616]


In [20]:
words = []

for val in bst:
  for word in dic:
    if dic[word] == val:
      words.append(word)

print(words)

['amp', 'the', 'people', 'this', 'cases', 'health', 'new', 'coronavirus', 'virus', 'need', 'you', 'like', 'spread', 'help', 'time', 'trump', 'china', 'home', 'pandemic']
