In [1]:
import pandas as pd
import contractions
import re
import emoji
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
import glob
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [4]:
FOLDER_NAME = "data_*/*.pkl"   

In [5]:
full_dataframe = pd.DataFrame()
for each_file in glob.glob(FOLDER_NAME):
    df = pd.read_pickle(each_file)
    full_dataframe = pd.concat([full_dataframe, df])
print(full_dataframe)

                                                    0  \
0   RT @POTUS: As we celebrate the progress we’ve ...   
1   This #EarthDay, I'm happy to be meeting with P...   
2   RT @Khan__sir_patna: All of people wishes and ...   
3   RT @CapsCoalition: Biden Signs Executive Order...   
4   RT @tamannaahspeaks: Animals source their food...   
..                                                ...   
95  RT @SpeakerPelosi: For more than half a centur...   
96  RT @PearlJam: “Station this is Eddie Vedder of...   
97  RT @FDACS: In honor of National Gardening Mont...   
98             #earthday recycling doesn’t even exist   
99  RT @ShekyIhs: 😁 Happy Earth day :)) #SaveSoil ...   

                                 1  
0   Fri Apr 22 15:41:33 +0000 2022  
1   Fri Apr 22 15:41:33 +0000 2022  
2   Fri Apr 22 15:41:33 +0000 2022  
3   Fri Apr 22 15:41:33 +0000 2022  
4   Fri Apr 22 15:41:33 +0000 2022  
..                             ...  
95  Fri Apr 22 17:06:11 +0000 2022  
96  Fri Apr 22 17:0

In [6]:
full_dataframe.columns = ['text', 'date']

In [18]:
full_dataframe.reset_index(inplace=True)
full_dataframe.drop(columns=['index', 'date'], inplace=True)

In [19]:
full_dataframe.head()

Unnamed: 0,level_0,text
0,0,RT @POTUS: As we celebrate the progress we’ve ...
1,1,"This #EarthDay, I'm happy to be meeting with P..."
2,2,RT @Khan__sir_patna: All of people wishes and ...
3,3,RT @CapsCoalition: Biden Signs Executive Order...
4,4,RT @tamannaahspeaks: Animals source their food...


In [20]:
full_dataframe.tail()

Unnamed: 0,level_0,text
30194,30194,RT @SpeakerPelosi: For more than half a centur...
30195,30195,RT @PearlJam: “Station this is Eddie Vedder of...
30196,30196,RT @FDACS: In honor of National Gardening Mont...
30197,30197,#earthday recycling doesn’t even exist
30198,30198,RT @ShekyIhs: 😁 Happy Earth day :)) #SaveSoil ...


In [22]:
df = full_dataframe.copy()

In [23]:
def remove_urls(text):
    ''' This method takes in text to remove urls and website links, if any'''
    url_pattern = r'(www.|http[s]?://)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_pattern, '', text)
    return text

def get_account_tags(text):
    ''' This method account tags'''
    tag_pattern = r'@([A-Za-z0-9_]+)'
    text = re.findall(tag_pattern, text)
    return text

def get_hash_tags(text):
    ''' This method gets hashtags'''
    tag_pattern = r'#([A-Za-z0-9]+)'
    text = re.findall(tag_pattern, text)
    return text

def get_emoji_list(text):
    dict_loc_emoji = emoji.emoji_lis(text)
    if dict_loc_emoji:
        list_emojis = [each['emoji'] for each in dict_loc_emoji]
        return list_emojis
    return None

def preprocess_text(text):
    # remove accounts and hash tags
    cleaned_text = re.sub(r'(@|#)([A-Za-z0-9_]+)', '', text)
    # Keep only English alphabetts, digits and spaces
    cleaned_text = re.sub(r'[^a-zA-Z\d\s]+', '', cleaned_text)
    word_list = []
    for each_word in cleaned_text.split(' '):
        word_list.append(contractions.fix(each_word).lower())
    word_list = [
        wnl.lemmatize(each_word.strip()) for each_word in word_list
        if each_word not in STOPWORDS and each_word.strip() != ''
    ]
    return " ".join(word_list)

In [24]:
df['text_cleaned'] = df.text.apply(remove_urls)
df['account_tags'] = df.text_cleaned.apply(get_account_tags)
df['hash_tags'] = df.text_cleaned.apply(get_hash_tags)
df['text_preprocessed'] = df.text_cleaned.apply(preprocess_text)
df['emoji_lists'] = df.text_cleaned.apply(get_emoji_list)

In [25]:
df.head()

Unnamed: 0,level_0,text,text_cleaned,account_tags,hash_tags,text_preprocessed,emoji_lists
0,0,RT @POTUS: As we celebrate the progress we’ve ...,RT @POTUS: As we celebrate the progress we’ve ...,[POTUS],[],rt celebrate progress we have made earth day m...,
1,1,"This #EarthDay, I'm happy to be meeting with P...","This #EarthDay, I'm happy to be meeting with P...",[PennEnvironment],"[EarthDay, GetTheLeadOut]",i am happy meeting philly city council behalf ...,
2,2,RT @Khan__sir_patna: All of people wishes and ...,RT @Khan__sir_patna: All of people wishes and ...,[Khan__sir_patna],[EarthDay],rt people wish happy earth day,
3,3,RT @CapsCoalition: Biden Signs Executive Order...,RT @CapsCoalition: Biden Signs Executive Order...,[CapsCoalition],[EarthDay],rt biden sign executive order natural capital ...,
4,4,RT @tamannaahspeaks: Animals source their food...,RT @tamannaahspeaks: Animals source their food...,"[tamannaahspeaks, SadhguruJV, cpsavesoil]",[MyconnectwithSoil],rt animal source food soil soil dying beautifu...,


In [None]:
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# get data directory (using getcwd() is needed to support running example in generated IPython notebook)
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

# Read the whole text.
text = ' '.join(list(df.text_preprocessed))
# print(text)
# read the mask / color image taken from
# http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010
alice_coloring = np.array(Image.open(path.join(d, "earth_image.jpg")))
stopwords = set(STOPWORDS)
stopwords.add("rt")
stopwords.add("day")
stopwords.add("earth")

wc = WordCloud(background_color="white",
               max_words=1000,
               mask=alice_coloring,
               stopwords=stopwords,
               max_font_size=40,
               repeat=False,
               include_numbers=True,
               random_state=42)
# generate word cloud
wc.generate(text)

# create coloring from image
image_colors = ImageColorGenerator(alice_coloring)

plt.figure(figsize=(80, 50), facecolor=None)
plt.axis("off")
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.show()