In [1]:
import json
# import glob
from tqdm import tqdm
import numpy as np
import unidecode
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords, words
import re
import contractions
from bs4 import BeautifulSoup
import pandas as pd
from nltk.metrics.distance  import edit_distance
import emoji
import plotly.express as px

In [2]:
messages = open('./Data/result.json', 'r', encoding='utf8')
data = json.load(messages)

## Preprocessing

In [3]:
len(data['messages'][10000].keys())

7

In [4]:
def dataset_extraction(data):
    df = []
    crypto = ['shib', 'doge', 'shiba', 'dogecoin']
    print("Extracting Messages...")
    for i in tqdm(range(len(data['messages']))):
        frame = []
        st = data['messages'][i]
        
        if type(st['text']) == str:
            if any(curr in st['text'].lower() for curr in crypto):
                frame.append(st['date'].split("T")[0])
                frame.append(st['text'])

        if type(st['text']) == list: 
            b = str()
            
            for j in st['text']: 
                if type(j) == dict:
                    a = j['text']
                    b += a
                else: 
                    b += j
                    
            if any(curr in b.lower() for curr in crypto):
                frame.append(st['date'].split("T")[0])
                frame.append(b)
        
        if len(frame): 
            df.append(frame)
        

    return df

In [5]:
messages = dataset_extraction(data)

Extracting Messages...


100%|████████████████████████████████████████████████████████████████████████| 49436/49436 [00:00<00:00, 309742.40it/s]


In [6]:
def convert_data_to_df(messages): 
    df = pd.DataFrame(messages, columns = ['Date', 'Messages'])
    return df

In [7]:
df = convert_data_to_df(messages)

In [8]:
pre = df.copy()

In [9]:
pre

Unnamed: 0,Date,Messages
0,2021-05-01,Doge is going craY
1,2021-05-01,Sell target of doge
2,2021-05-01,Doge
3,2021-05-01,Dogecoin!!! Que hago?
4,2021-05-01,"Anyway, is doge a good crypto for long term in..."
...,...,...
3056,2021-05-15,How is Shib?
3057,2021-05-15,How is shib
3058,2021-05-15,Shiba swap is gonna launch soon... It will def...
3059,2021-05-15,Whole crypto market is being dominated by bear...


In [10]:
def preprocessing(df):
    df = demojizing(df)
    df = convert_accented_chars(df)
    df = remove_case_sensitive(df)
    df = remove_htmls_and_urls(df)
    df = remove_extra_spaces_between_words(df)
    df = expand_contractions(df)
    df = remove_stop_words(df)
    df = lemmatization(df)
    df = removing_non_english(df)
    df = spelling_corrections(df)
    return df

def demojizing(dataframe): 
    tqdm.pandas()
    print("Decoding Emojis in Text...")
    dataframe["Messages"] = dataframe["Messages"].progress_apply(lambda txt: emoji.demojize(txt))
    return dataframe

def convert_accented_chars(dataframe): 
    tqdm.pandas()
    print("Converting Accented Characters...")
    dataframe["Messages"] = dataframe["Messages"].progress_apply(lambda txt: unidecode.unidecode(txt))
    return dataframe

def remove_case_sensitive(dataframe):
    tqdm.pandas()
    print("Removing Case Sensitive Characters...")
    dataframe['Messages'] = dataframe['Messages'].progress_apply(lambda txt: str(txt).lower())
    return dataframe

def remove_htmls_and_urls(dataframe):
    tqdm.pandas()
    print("Removing HTMLs and URLs...")
    dataframe['Messages'] = dataframe['Messages'].progress_apply(lambda txt: re.sub(r"http\S+", "", txt))
    dataframe['Messages'] = dataframe['Messages'].progress_apply(lambda txt: BeautifulSoup(txt, 'lxml').get_text())
    return dataframe
    
def remove_extra_spaces_between_words(dataframe):
    tqdm.pandas()
    print("Removing Extra Whitespaces...")
    dataframe['Messages'] = dataframe['Messages'].progress_apply(lambda txt: re.sub(" +"," ", txt))
    return dataframe

def expand_contractions(dataframe):
    tqdm.pandas()
    print("Expanding Contractions...")
    dataframe['Messages'] = dataframe['Messages'].progress_apply(lambda txt: contractions.fix(txt))
    return dataframe

def spelling_corrections(dataframe):
    
    def spell_check(message, correct_words):
        new_message = ""
        for word in message.split(" "):
            if word.isalpha() and (word not in correct_words) and (word.lower() not in ["doge", "dogecoin", "shibe", "shiba", "shib", "shiba inu"]):
                temp = [(edit_distance(word, w),w) for w in correct_words if w[0]==word[0]]
                new_message = new_message + sorted(temp, key = lambda val:val[0])[0][1] + " "
            else:
                new_message =  new_message + word + " "
        return new_message

    tqdm.pandas()
    print("Performing Spelling Corrections...")
    
    slangs = ["doge", "dogecoin", "dogecoins", "shib", "shiba", "shiba inu", "shibe inu", 
          "dollar", "dolar", "$", "ps", "p.s.", "app", "money", "tarde", "telegram", "whatsapp", 
          "buy", "issue", "crypto", "usdc", "bank", "account", "portfolio", "Elon", "Musk", "shibaa",
          "profit", "cro", "€", "inr", "mill", "cdc", "tbh", "hi", "hey", "plz", "wbu", "%",
          "crypto.com", "email", "usdt", "cent", "ct", "mil", "ppl", "btc", "curr"]

    a = [w for w in wordnet.all_lemma_names()]
    a = list(set(a).union(set(slangs)))
    correct_words = list(set(words.words()).union(set(a)))    
    dataframe['Messages'] = dataframe['Messages'].progress_apply(lambda txt: spell_check(txt, correct_words))
    return dataframe

def removing_non_english(dataframe): 
    def word_in_english(message, correct_words): 
        st = []
        for word in message.split(" "): 
            if wordnet.synsets(word) or word.lower() in correct_words:
                st.append(1)
            else: 
                st.append(0)

        cutoff_value = 0.45
        
        if sum(st)/len(st) >= cutoff_value: 
            return message
        else: 
            return ""
        
    tqdm.pandas()
    print("Checking English Messages...")
    slangs = ["doge", "dogecoin", "dogecoins", "shib", "shiba", "shiba inu", "shibe inu", 
              "dollar", "dolar", "$", "ps", "p.s.", "app", "money", "tarde", "telegram", "whatsapp", 
              "buy", "issue", "crypto", "usdc", "bank", "account", "portfolio", "Elon", "Musk", "shibaa",
              "profit", "cro", "€", "inr", "mill", "cdc", "tbh", "hi", "hey", "plz", "wbu", "%",
              "crypto.com", "email", "usdt", "cent", "ct", "mil", "ppl", "btc", "curr"]
    
    correct_words = list(set(words.words()).union(set(slangs)))    
    dataframe["Messages"] = dataframe["Messages"].progress_apply(lambda txt: word_in_english(txt, correct_words))
    return dataframe[dataframe['Messages'] > ""]

def remove_stop_words(dataframe): 

    def Remove_Stopwords(message, stop_words_list):
        tokens = message.split(" ")
        clean_message = [word for word in tokens if not word in stop_words_list]
        return [(" ").join(clean_message)]

    tqdm.pandas()
    print("Removing Stop Words...")
    deselect_stop_words = ['not', 'nor', 'no', 'against', 'don', "don't", 
          'should', "should've", 'aren', "aren't", 'couldn', 
          "couldn't", 'didn', "didn't", 'doesn', "doesn't", 
          'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 
          'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't", 
          'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 
          'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 
          'wouldn', "wouldn't"]

    stop_words_list = set([stopwords.words('english').remove(word) for word in deselect_stop_words])
    dataframe['Messages'] = dataframe['Messages'].progress_apply(lambda txt: Remove_Stopwords(txt, stop_words_list))
    return dataframe

def lemmatization(dataframe): 
    def lemmatize(message):
        word_lemma = WordNetLemmatizer()
        lemmatize_word = [word_lemma.lemmatize(word) for word in message]
        return (" ").join(lemmatize_word)
    
    tqdm.pandas()
    print("Lemmatizing Words in Messages...")
    dataframe['Messages']= dataframe['Messages'].progress_apply(lambda txt: lemmatize(txt))
    return dataframe

In [None]:
df_pre = preprocessing(df)
df_pre

Decoding Emojis in Text...


100%|████████████████████████████████████████████████████████████████████████████| 3061/3061 [00:01<00:00, 2320.01it/s]


Converting Accented Characters...


100%|███████████████████████████████████████████████████████████████████████████| 3061/3061 [00:00<00:00, 52919.57it/s]


Removing Case Sensitive Characters...


100%|██████████████████████████████████████████████████████████████████████████| 3061/3061 [00:00<00:00, 306846.50it/s]


Removing HTMLs and URLs...


100%|██████████████████████████████████████████████████████████████████████████| 3061/3061 [00:00<00:00, 307096.05it/s]
100%|████████████████████████████████████████████████████████████████████████████| 3061/3061 [00:01<00:00, 2887.29it/s]


Removing Extra Whitespaces...


100%|███████████████████████████████████████████████████████████████████████████| 3061/3061 [00:00<00:00, 47217.48it/s]


Expanding Contractions...


100%|███████████████████████████████████████████████████████████████████████████| 3061/3061 [00:00<00:00, 36108.98it/s]


Removing Stop Words...


100%|██████████████████████████████████████████████████████████████████████████| 3061/3061 [00:00<00:00, 127874.87it/s]


Lemmatizing Words in Messages...


100%|████████████████████████████████████████████████████████████████████████████| 3061/3061 [00:02<00:00, 1361.28it/s]


Checking English Messages...


100%|██████████████████████████████████████████████████████████████████████████████| 3061/3061 [02:44<00:00, 18.59it/s]


Performing Spelling Corrections...


  8%|█████▉                                                                       | 227/2954 [04:47<1:53:56,  2.51s/it]

In [None]:
df_pre.to_csv(r"Preprocessed_data.csv")