# 1. Load relevant packages

In [10]:
# Commonly used
import numpy as np
import pandas as pd
import os
import string
from collections import Counter
import re
from datetime import datetime, timedelta
from codecs import encode
import pytz

# A package for preprocessing(officially used in SemEval NLP competition)
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

from sklearn.utils import shuffle

# 2. Load an uncleaned dataset and randomly sample 15 tweets per language for preprocessing

### Load an uncleaned tweet dataset first

In [66]:
# Directory that holds the 2017 Hong Kong tweet pickles. Set the
# TWEET_2017_PATH environment variable to point somewhere else; when it is
# unset, the original location is used.
tweet_2017_path = os.environ.get('TWEET_2017_PATH', r'F:\CityU\Datasets\Hong Kong Tweets 2017')
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
final_uncleaned = pd.read_pickle(os.path.join(tweet_2017_path, 'final_uncleaned.pkl'))

### Construct Chinese tweets and English tweets dataframe

In [67]:
# Partition the raw tweets by the language code stored in the 'lang' column.
en_tweets = final_uncleaned[final_uncleaned['lang'].eq('en')]
zh_tweets = final_uncleaned[final_uncleaned['lang'].eq('zh')]

### Randomly sample 15 tweets from each dataframe

In [68]:
# Draw 15 tweets per language. The fixed random_state makes the draw
# repeatable, so re-running the notebook yields the same sample.
en_tweets_sample = en_tweets.sample(n=15, random_state=42)
zh_tweets_sample = zh_tweets.sample(n=15, random_state=42)

In [69]:
# Sanity check: the Chinese sample should contain 15 rows.
zh_tweets_sample.shape

(15, 44)

In [70]:
# Sanity check: the English sample should contain 15 rows.
en_tweets_sample.shape

(15, 44)

In [71]:
# Stack the English and Chinese samples into a single frame.
all_sample_tweets = pd.concat([en_tweets_sample, zh_tweets_sample])

In [72]:
# Mix the two languages together. The fixed random_state keeps the row order
# the same on every run.
final_sample_tweets = shuffle(all_sample_tweets, random_state=42)

In [73]:
# Keep only the columns that are shown in the review table below.
final_sample_tweets = final_sample_tweets[['user_id_str', 'lang', 'text', 'url', 'lat', 'lon']]

### Show what the dataframe looks like

In [74]:
# Display the shuffled sample; the raw 'text' column still contains
# mis-decoded characters and <U+XXXX> markers at this point.
final_sample_tweets

Unnamed: 0,user_id_str,lang,text,url,lat,lon
1,5759862.0,zh,æäºäºæä¸æ¯æéå¯ä»¥æ²æ·¡â¯åå¹´ä...,https://t.co/C2vBDbADEJ,22.345455,114.206915
314,122704400.0,en,Motto in HK day 2 #motto #gdragon #gdragonhkco...,https://t.co/5hvW7GPRWy,22.321389,113.943889
2,4053995000.0,en,"06:48 Temp. 22.9Â°C, Hum. 73%, Dewp. 16.7Â°C, ...",,22.406059,114.224178
62,15402480.0,zh,@miniyoshima @happynatalie @_lamdavid @uituit ...,,22.338452,114.18791
5,3146791000.0,en,"@akari_oozora Source: ""Three statues"" by yewen...",https://t.co/bzNRNCYiRB,22.309999,113.910004
170,8296432.0,zh,é·è¤¸åºåæå°èªå·±èº«æææä¿¡å¿ @ H...,https://t.co/fZw6T7OT1x,22.317157,113.930282
51,14065960.0,zh,"Sai Kung Town â¢ Sai Kung, Hong Kong â¢ è¥¿è...",https://t.co/JjzW6gKjaO,22.382405,114.274583
99,218915600.0,en,I'm at Ho Hung Kee in Causeway Bay https://t.c...,https://t.co/wo18j9ecPW,22.280171,114.183922
98,67988130.0,zh,The price of dim sum in hkg í ½í±\n#onedimsum...,https://t.co/9sT9uVeqVr,22.325091,114.16865
69,53893150.0,zh,<U+6CD5><U+5F0F><U+5343><U+5C64><U+9165><U+86C...,https://t.co/GzuwqjcG0X,22.310546,114.226005


### We can see that there are many strange strings in the text (mis-decoded characters and `<U+XXXX>` markers). How do we clean them?

# 3. Clean the text

In [75]:
def remove_u_plus(text):
    """Strip every literal ``U+00`` marker from ``text``.

    For example ``<U+00E2>`` becomes ``<E2>``. Everything else is returned
    unchanged.
    """
    return re.sub(r'U\+00', '', text)


def show_emoji_in_tweet(text, emoji_dictionary):
    r"""Replace encoded emoji sequences in a raw tweet with the emoji themselves.

    The tweet is first normalised: ``U+00`` markers are stripped, the text is
    rewritten with ``unicode_escape`` so that only ASCII remains, escaped line
    breaks are dropped and every ``\xNN`` escape becomes ``<NN>``. Each
    encoding listed in ``emoji_dictionary`` is then swapped for its emoji.

    Args:
        text: raw tweet text.
        emoji_dictionary: DataFrame with the columns ``'R_Encoding'``,
            ``'R_Encoding_lower'`` and ``'emoji'``.
            # presumably the encodings use the same ``<NN>`` byte notation
            # produced here -- verify against emoji.pkl.

    Returns:
        The normalised text with every known encoding replaced by its emoji.
    """
    without_u = remove_u_plus(text)
    escaped = without_u.encode('unicode_escape').decode('utf-8')
    # Remove escaped carriage returns and newlines (a literal backslash plus r/n).
    # NOTE: they are deleted, not replaced by a space, so words that sat on
    # adjacent lines end up joined together.
    escaped = re.sub(pattern='\\\\r', repl='', string=escaped)
    escaped = re.sub(pattern='\\\\n', repl='', string=escaped)
    # Rewrite \xNN escapes as <NN>.
    escaped = re.sub(pattern='\\\\x([a-z0-9]{2})', repl='<\\1>', string=escaped)
    rows = zip(emoji_dictionary['R_Encoding'],
               emoji_dictionary['R_Encoding_lower'],
               emoji_dictionary['emoji'])
    for encoding, encoding_lower, emoji in rows:
        # str.replace substitutes literally: an encoding is never interpreted
        # as a regular expression and an emoji is never parsed as a
        # replacement template. A missing encoding is simply a no-op.
        escaped = escaped.replace(encoding, emoji)
        escaped = escaped.replace(encoding_lower, emoji)
    return escaped


def show_chinese_step1(text, emoji_dataset):
    r"""Decode ``<U+XXXX>`` code-point markers into the characters they name.

    The text is lower-cased, every ``<u+`` is rewritten to the escape prefix
    ``\u`` and every ``>`` is removed. The result is split on whitespace and
    each token is handled separately:

    * a token containing any emoji from ``emoji_dataset['emoji']`` is kept
      as it is (decoding it with ``unicode_escape`` would garble the emoji);
    * any other token is decoded with ``unicode_escape``.

    Args:
        text: tweet text, normally the output of ``show_emoji_in_tweet``.
        emoji_dataset: mapping or DataFrame whose ``'emoji'`` entry lists the
            emoji strings to protect.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Literal replacements. Passing '\u' to re.sub as a replacement template
    # raises "bad escape \u" on Python 3.7+.
    escaped = text.lower().replace('<u+', '\\u').replace('>', '')
    # Build the emoji list once instead of once per token.
    emojis = list(emoji_dataset['emoji'])
    decoded_tokens = []
    for token in escaped.split():
        # any() keeps a token exactly once, even when several emoji match it.
        if any(emoji in token for emoji in emojis):
            decoded_tokens.append(token)
        else:
            decoded_tokens.append(token.encode('utf-8').decode('unicode_escape'))
    return " ".join(decoded_tokens)


def show_chinese_step2(text):
    r"""Turn ``<NN`` byte markers into the UTF-8 characters they spell out.

    Every ``<`` is rewritten to ``\x`` so the markers become hex escapes. The
    text is then decoded with ``unicode_escape``, re-encoded with
    ``raw_unicode_escape`` to get bytes back, and finally decoded as UTF-8.
    Invalid escapes and invalid UTF-8 sequences are ignored.

    Args:
        text: tweet text, normally the output of ``show_chinese_step1``.

    Returns:
        The decoded text. Characters that ``raw_unicode_escape`` writes as
        ``\uXXXX`` stay in that escaped form.
    """
    # Literal replacement. Passing '\x' to re.sub as a replacement template
    # raises "bad escape \x" on Python 3.7+.
    hex_escaped = text.replace('<', '\\x')
    raw_bytes = encode(hex_escaped.encode().decode('unicode_escape', 'ignore'), 'raw_unicode_escape')
    return raw_bytes.decode('utf-8', 'ignore')


def show_chinese_step3(text):
    r"""Decode the ``\uXXXX`` escapes that are still present in ``text``.

    A literal ``\u`` followed by four characters from ``[a-z0-9]`` is
    replaced by the result of decoding it with ``unicode_escape`` (errors
    ignored). Everything else is returned unchanged.

    Args:
        text: tweet text, normally the output of ``show_chinese_step2``.

    Returns:
        The text with those escapes decoded.
    """
    def _decode(match):
        return match.group(0).encode('utf-8').decode('unicode_escape', 'ignore')

    # One pass with a callable replacement: the decoded character is inserted
    # literally, so an escape that decodes to a backslash (\u005c) no longer
    # breaks re.sub's replacement-template parsing.
    return re.sub(pattern='\\\\u[a-z0-9]{4}', repl=_decode, string=text)


# Shared ekphrasis pipeline used by the English cleaning path. It is built once,
# when this cell runs; the "Reading twitter - ..." lines printed below the cell
# apparently come from this constructor loading its word statistics.
text_processor = TextPreProcessor(
    # terms that will be normalized, i.e. replaced by a placeholder token
    # NOTE(review): 'url' appears twice in this list; the duplicate looks
    # redundant -- confirm and drop it.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own;
    # the tokenizer should take a string as input and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries used to replace tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons]
)


def preprocessing_for_english(text_preprocessor, raw_text):
    """Run a tweet through the preprocessor and strip ASCII punctuation.

    Args:
        text_preprocessor: object exposing ``pre_process_doc(str)`` that
            returns a list of tokens (e.g. the ekphrasis ``TextPreProcessor``).
        raw_text: the tweet; it is converted with ``str()`` before processing.

    Returns:
        The tokens joined by single spaces, with every character from
        ``string.punctuation`` removed.
    """
    tokens = text_preprocessor.pre_process_doc(str(raw_text))
    preprocessed_text = ' '.join(tokens)
    # re.escape keeps every punctuation mark literal inside the character
    # class. Unescaped, the backslash in string.punctuation only escaped the
    # following ']' and was itself never removed from the text.
    punctuation_class = '[{}]'.format(re.escape(string.punctuation))
    return re.sub(punctuation_class, '', preprocessed_text)


def clean_english_tweet_for_review(text, emoji_dictionary):
    """Clean one English tweet for manual review.

    The encoded emoji are restored first, then the result is passed through
    the module-level ``text_processor`` and stripped of punctuation.

    Args:
        text: raw tweet text.
        emoji_dictionary: emoji lookup table handed to ``show_emoji_in_tweet``.

    Returns:
        The cleaned tweet as a single string.
    """
    emoji_restored = show_emoji_in_tweet(text, emoji_dictionary)
    return preprocessing_for_english(text_processor, emoji_restored)


def clean_chinese_tweet_for_review(text, emoji_dictionary):
    """Clean one Chinese tweet for manual review.

    The encoded emoji are restored first, then the three ``show_chinese_step``
    functions are applied in order to recover the Chinese characters.

    Args:
        text: raw tweet text.
        emoji_dictionary: emoji lookup table; it is also passed to
            ``show_chinese_step1``.

    Returns:
        The decoded tweet as a single string.
    """
    restored = show_emoji_in_tweet(text, emoji_dictionary)
    restored = show_chinese_step1(restored, emoji_dictionary)
    restored = show_chinese_step2(restored)
    return show_chinese_step3(restored)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


### Load the emoji dataset, which is available here:
https://github.com/bright1993ff66/Social-Media-Data-Analysis/blob/master/Datasets/emoji.pkl

In [76]:
# Emoji lookup table. The cleaning functions read its 'R_Encoding',
# 'R_Encoding_lower' and 'emoji' columns.
# NOTE: unpickling can execute arbitrary code -- only load this file from a trusted source.
emoji_dict = pd.read_pickle(os.path.join(tweet_2017_path, 'emoji.pkl'))

### Clean the English tweets

In [77]:
# Clean every English tweet. Only the 'text' column is needed, so the cleaner
# is mapped over that Series directly.
en_tweets_sample['cleaned_text'] = en_tweets_sample['text'].apply(
    lambda raw_text: clean_english_tweet_for_review(raw_text, emoji_dict))

In [78]:
# Narrow the frame to the raw text, its cleaned form and the URL for a side-by-side review.
en_tweets_sample = en_tweets_sample[['text', 'cleaned_text', 'url']]

In [79]:
# Compare the raw 'text' with 'cleaned_text' for the English sample.
en_tweets_sample

Unnamed: 0,text,cleaned_text,url
121,It's on! #ArtBaselHK #AlanCristeaGallery 1B30 ...,it s on hashtag art basel hk hashtag hashtag...,https://t.co/2dgLXWGEpt
133,Some #streetfood to work up an appetite. í ½í¸...,some hashtag street food hashtag to work up an...,https://t.co/DdZE5YdE2f
117,@chiewata take care bro,user take care bro,
5,"@akari_oozora Source: ""Three statues"" by yewen...",user source three statues by yewenyi on fli...,https://t.co/bzNRNCYiRB
85,ASIA | AS â China is pessimistic about Kore...,allcaps asia allcaps as e2 number number ...,https://t.co/A7DHzAJHTv
88,ASIA â China's Love-Hate Relationship With T...,allcaps asia allcaps e2 number number chin...,https://t.co/7aA5u5aodm
179,I can't believe today's my last workout sessio...,i can not believe today s my last workout ses...,https://t.co/WHyDUGSIBj
314,Motto in HK day 2 #motto #gdragon #gdragonhkco...,motto in hk day number hashtag motto hashtag h...,https://t.co/5hvW7GPRWy
10,How do you process human connection? Be yourse...,how do you process human connection be yourse...,https://t.co/mUI6TnNvBa
2,"06:48 Temp. 22.9Â°C, Hum. 73%, Dewp. 16.7Â°C, ...",time temp number c2 b0 c hum percent dewp ...,


### Clean the Chinese tweets

In [80]:
# Clean every Chinese tweet. Only the 'text' column is needed, so the cleaner
# is mapped over that Series directly.
zh_tweets_sample['cleaned_text'] = zh_tweets_sample['text'].apply(
    lambda raw_text: clean_chinese_tweet_for_review(raw_text, emoji_dict))

In [81]:
# Narrow the frame to the raw text, its cleaned form and the URL for a side-by-side review.
zh_tweets_sample = zh_tweets_sample[['text', 'cleaned_text', 'url']]

In [82]:
# Compare the raw 'text' with 'cleaned_text' for the Chinese sample.
zh_tweets_sample

Unnamed: 0,text,cleaned_text,url
1,æäºäºæä¸æ¯æéå¯ä»¥æ²æ·¡â¯åå¹´ä...,有些事情不是時間可以沖淡⋯十年了⋯心痛仍感受如初 @ 鑽石山墳埸思親堂 https://t....,https://t.co/C2vBDbADEJ
134,Happy Easter!<U+5FA9><U+6D3B><U+7BC0><U+5FEB><...,happy easter!復活節快樂!#復活節要有復活蛋 @ 香港迪士尼樂園 | hong ...,https://t.co/O43UcPb1VB
256,#reuniondinner2017 #<U+5718><U+5E74><U+98EF>20...,#reuniondinner2017 #團年飯2017 @ 天后廟道 https://t.c...,https://t.co/WcMxmEqY9I
51,"Sai Kung Town â¢ Sai Kung, Hong Kong â¢ è¥¿è...","sai kung town • sai kung, hong kong • 西貢市 • 香港...",https://t.co/JjzW6gKjaO
67,ç¹¼ä¸æ¬¡æåå¯¶çä¹å¾ï¼ä»æææåé...,繼上次月光寶盒之後，今晚有月光餅盒比大家頂住癮先。仲帶咗特別版既推石頭的人比大家! 最開心又...,https://t.co/vMo4q0WClE
136,<U+6930><U+5B50><U+70AD> &amp; 70% <U+7DA0><U+...,椰子炭 &amp; 70% 綠茶雪糕 @ a fun store https://t.co/...,https://t.co/1ks5nnaOHX
98,The price of dim sum in hkg í ½í±\n#onedimsum...,the price of dim sum in hkg 👌#onedimsum #dimsu...,https://t.co/9sT9uVeqVr
69,<U+6CD5><U+5F0F><U+5343><U+5C64><U+9165><U+86C...,"法式千層酥蛋糕配士多啤梨雪糕。這是個非常符合港人口味和期望的「妹灰」,足可見樂師傅的心思。-...",https://t.co/GzuwqjcG0X
115,I'm at Nina Tower å¦å¿å»£å ´ in Tsuen Wan ht...,i'm at nina tower 如心廣場 in tsuen wan https://t....,https://t.co/teai9iAiV4
13,Live Band çä¿åå¾é  #ê¹íì° #íì° #...,live band 真係冇得頂 #김태연 #태연 #sone #persona #hongk...,https://t.co/cLHY2ye0D5


### For the Chinese tweets, you can use Google Translate to translate them into English so that you can generate tweet representations