## Mount drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import libraries

In [2]:
import json
!pip install jsonlines
import jsonlines
import csv
import os
!pip install tweet-preprocessor
import preprocessor as p
import pandas as pd



# Creating a file .csv with only tweets text

In [3]:
filename = '/content/drive/My Drive/covid_project/graph_files/sampled_tweets.jsonl'
with jsonlines.open(filename, 'r') as json_file:
  out_f = '/content/drive/My Drive/covid_project/word2vec/tweets_dataset.csv'
  with open(out_f, 'w') as out:
    field_names = ['text']
    writer = csv.DictWriter(out, fieldnames=field_names)
    for line in json_file:
      if ('retweeted_status' in line):
          x = line['retweeted_status']['full_text']
          writer.writerow({'text':x})
      else:
        x = line['full_text']
        writer.writerow({'text':x})
out.close()

# Preprocessing tweets text

In [4]:
tweets_csv = pd.read_csv('/content/drive/My Drive/covid_project/word2vec/tweets_dataset.csv', header = None, names=['text'], delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')

tweets_csv.count()

text    44419
dtype: int64

In [5]:
#Dropping duplicates and NaN from the dataframe
tweets_csv = tweets_csv.dropna()
tweets_csv = tweets_csv.drop_duplicates()

tweets_csv.head()

Unnamed: 0,text
0,please read this thread before you judge how s...
1,A photo of Chinese medical workers and law enf...
2,"""I’m just going to assume this guy dropped a b..."
3,#Coronavirus #COVID19 is serious. We are not h...
4,"""Coronavirus Thanos"


In [6]:
tweets_csv.count()

text    30832
dtype: int64

In [7]:
def preprocess_tweet(row):
    text = row['text']
    text = p.clean(text)
    return text

In [8]:
#Tweet have to be cleaned to normal text
tweets_csv['text'] = tweets_csv.apply(preprocess_tweet, axis=1)
tweets_csv.head()

Unnamed: 0,text
0,please read this thread before you judge how s...
1,A photo of Chinese medical workers and law enf...
2,"""Im just going to assume this guy dropped a bu..."
3,"is serious. We are not hyping it. """
4,"""Coronavirus Thanos"


In [9]:
from gensim.parsing.preprocessing import remove_stopwords
def stopword_removal(row):
    text = row['text']
    text = remove_stopwords(text)
    return text

In [10]:
#Removing stop words
tweets_csv['text'] = tweets_csv.apply(stopword_removal, axis=1)

In [11]:
tweets_csv.head()

Unnamed: 0,text
0,read thread judge strangers navigating moment
1,A photo Chinese medical workers law enforcemen...
2,"""Im going assume guy dropped bunch F bombs mor..."
3,"serious. We hyping it. """
4,"""Coronavirus Thanos"


In [12]:
#Removing alphanumeric words, one character words and other words nwe're not considering useful
tweets_csv['text'] = tweets_csv['text'].str.lower().str.replace('[^\w\s]',' ').str.replace('\w*\d\w*',' ').str.replace(r'\b[a-zA-Z]\b|\b[a-zA-Z][a-zA-Z]\b', ' ').str.replace(r'\b_*',' ').str.replace('\s\s+', ' ')

In [13]:
tweets_csv.head()

Unnamed: 0,text
0,read thread judge strangers navigating moment
1,photo chinese medical workers law enforcement...
2,going assume guy dropped bunch bombs morning ...
3,serious hyping
4,coronavirus thanos


In [14]:
#Saving results on a .csv file
tweets_csv.to_csv('/content/drive/My Drive/covid_project/word2vec/tweets_preprocessed.csv')

In [15]:
# Deleting rows with less than 5 words
with open('/content/drive/My Drive/covid_project/word2vec/tweets_preprocessed.csv', 'r', encoding='utf-8', errors='ignore') as inp, open('/content/drive/My Drive/covid_project/word2vec/tweets_rows_deleted.csv', 'w', encoding='utf-8') as out:
    writer = csv.writer(out)
    for row in csv.reader(inp):
      if len(row[1]) > 5:
        writer.writerow(row)
inp.close()
out.close()

# Making Dictionary

In [16]:
import csv
lis = []
with open('/content/drive/My Drive/covid_project/word2vec/tweets_rows_deleted.csv', 'r', encoding='utf-8') as inp:
  for row in csv.reader(inp):
    lis.append(row[1])

In [17]:
#creating dictionary
from nltk import FreqDist
word_dist = FreqDist()
for s in lis:
    word_dist.update(s.split())
dic = dict(word_dist)
print(dic)



In [18]:
#ordering dictionary
dic = {key:val for key, val in dic.items() if val != 1 and val != 2}
dic_items = dic.items()
sorted_dic = sorted(dic_items)

In [19]:
#Saving dictionary
with open('/content/drive/My Drive/covid_project/word2vec/dictionary.csv', 'w') as csv_file:  
    writer = csv.writer(csv_file)
    for elem in sorted_dic:
        writer.writerow([elem[0],elem[1]])
csv_file.close()

In [20]:
#looking for most recurrent words
bst = []

values = sorted(dic.values())

for elem in range(1,20):
  i = - elem
  bst.append(values[i])
print(bst)

[2140, 2111, 1838, 1511, 1335, 1052, 894, 883, 817, 794, 747, 745, 727, 715, 681, 643, 611, 545, 531]


In [21]:
def getKeysByValue(dictOfElements, valueToFind):
    listOfKeys = list()
    listOfItems = dictOfElements.items()
    for item  in listOfItems:
        if item[1] == valueToFind:
            listOfKeys.append(item[0])
    return  listOfKeys

In [22]:
words = []
for elem in bst:
  words.append(getKeysByValue(dic,elem))
print(words)

[['amp'], ['the'], ['people'], ['this'], ['cases'], ['health'], ['coronavirus'], ['new'], ['need'], ['virus'], ['you'], ['like'], ['spread'], ['help'], ['time'], ['home'], ['trump'], ['covid'], ['world', 'pandemic']]
