In [134]:
# Commonly used
import numpy as np
import pandas as pd
import os
import string
import time
import csv
from collections import Counter
import re
from datetime import datetime, timedelta
from codecs import encode
import pytz
from wordcloud import STOPWORDS
import nltk
from nltk.tokenize import word_tokenize

# A package for preprocessing(officially used in SemEval NLP competition)
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# For machine translation
from googletrans import Translator
from google.cloud import storage
from google.cloud import translate

from sklearn.utils import shuffle

In [45]:
# Set the credential environment in jupyter notebook
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="XXXXX"

In [46]:
# Instantiates a client
translate_client = translate.Client()

In [102]:
time_zone_hk = pytz.timezone('Asia/Shanghai')

In [17]:
dataset_path = r'XXXXX'
cross_sectional = r'XXXXX'
desktop = r'XXXXX'
before_and_after = r'XXXXX'
tweet_combined_path = r'XXXXX'
tweet_2018_path = r'XXXXX'
tweet_2017_path = r'XXXXX'

In [18]:
def remove_u_plus(text):
    result = re.sub(pattern=r'U\+00', repl=r'', string=text)
    return result


def show_emoji_in_tweet(text, emoji_dictionary):
    without_u = remove_u_plus(text)
    old_text = without_u
    old_text = old_text.encode('unicode_escape').decode('utf-8')
    result1 = re.sub(pattern='\\\\r', repl='', string=old_text)
    result2 = re.sub(pattern='\\\\n', repl='', string=result1)
    result3 = re.sub(pattern='\\\\x([a-z0-9]{2})', repl = '<\\1>', string=result2)
    old_text = result3
    for _, row in emoji_dictionary.iterrows():
        if row['R_Encoding'] in old_text:
            new_text = re.sub(pattern=row['R_Encoding'], repl=row['emoji'], string=old_text)
            old_text = new_text
        else:
            pass
        if row['R_Encoding_lower'] in old_text:
            new_text = re.sub(pattern=row['R_Encoding_lower'], repl=row['emoji'], string=old_text)
            old_text = new_text
        else:
            pass
    return old_text

def show_chinese_step1(text, emoji_dataset):
    result1 = re.sub('\<u\+', '\\'+'u', text.lower())
    result2 = re.sub('\>', '', result1)
    all_chars = result2.split()
    new_all_chars = []
    for char in all_chars:
        emoji_in_char = False
        for emoji in list(emoji_dataset['emoji']):
            if emoji in char:
                emoji_in_char = True
                new_char = char.encode('utf-8').decode('utf-8')
                new_all_chars.append(new_char)
            else:
                pass
        if not emoji_in_char:
            new_char = char.encode('utf-8').decode('unicode_escape')
            new_all_chars.append(new_char)
    return " ".join(new_all_chars)

def show_chinese_step2(text):
    result1 = re.sub('<', '\\x', text)
    result2 = encode(result1.encode().decode('unicode_escape', 'ignore'), 'raw_unicode_escape')
    result3 = result2.decode('utf-8', 'ignore')
    return result3


def show_chinese_step3(text):
    patterns = re.findall(pattern='\\\\u[a-z0-9]{4}', string=text)
    old_text = text
    for pattern in patterns:
        new_pattern = pattern.encode('utf-8').decode('unicode_escape', 'ignore')
        new_text = re.sub(pattern='\\'+pattern, repl=new_pattern, string=old_text)
        old_text = new_text
    return old_text

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

def preprocessing_for_english(text_preprocessor, raw_text):
    preprocessed_text = ' '.join(text_preprocessor.pre_process_doc(str(raw_text)))
    # remove punctuations
    result = re.sub(u'[{}]'.format(string.punctuation), u'', preprocessed_text)
    return result


def clean_english_tweet_for_review(text, emoji_dictionary):
    text_with_emoji = show_emoji_in_tweet(text, emoji_dictionary)
    processed_text = preprocessing_for_english(text_processor, text_with_emoji)
    return processed_text


def clean_chinese_tweet_for_review(text, emoji_dictionary):
    tweet_with_emoji = show_emoji_in_tweet(text, emoji_dictionary)
    step1 = show_chinese_step1(tweet_with_emoji, emoji_dictionary)
    step2 = show_chinese_step2(step1)
    step3 = show_chinese_step3(step2)
    return step3

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [52]:
def show_translated_chinese(text):
    result1 = re.sub('\<u\+', '\\' + 'u', text.lower())
    result2 = re.sub('\>', '', result1)
    result3 = re.sub('\<([a-z0-9]{2})*', '', result2)
    all_chars = result3.split()
    for index, char in enumerate(all_chars):
        try:
            if translate_client.detect_language(char)['language'][:2] != 'en':
                all_chars[index] = translate_client.translate(char, target_language='en')['translatedText']
            else:
                all_chars[index] = char.encode('utf-8').decode('utf-8')
        except:
            pass
    return ' '.join(all_chars)

def preprocessing_for_chinese(text_preprocessor, raw_text):
    preprocessed_text = ' '.join(text_preprocessor.pre_process_doc(str(raw_text)))
    # remove punctuations
    result1 = re.sub(u'[{}]'.format(string.punctuation), u'', preprocessed_text)
    # remove hashtag
    result2 = re.sub(r'hashtag', u'', result1)
    # remove url
    result3 = re.sub(r'url', '', result2)
    # replace the multiple blanks to one blank
    result4 = re.sub('\\s+', u' ', result3)
    # remove the digits
    result5 = re.sub(r'number', u'', result4)
    return result5

def get_translated_text(text_string):
    # remove hashtag
    result1 = re.sub('#', '', string=text_string)
    # remove @
    result2 = re.sub('@', '', string=result1)
    result3 = show_translated_chinese(result2)
    processed_text = preprocessing_for_chinese(text_processor, result3)
    return processed_text

In [95]:
def build_selected_tweet_dataframe(dataframe, english_or_not = True):
    selected_columns = ['created_at', 'id_str', 'lang', 'lat', 'lon',
       'place_id', 'place_lat', 'place_lon', 'place_name', 'text', 'time_zone',
       'truncated', 'url', 'user_created_at', 'user_id_str', 'user_lang',
       'user_url', 'verified', 'hk_time', 'year', 'month', 'month_plus_year',
       'day', 'TPU_longitudinal', 'TPU_cross_sectional', 'cleaned_text']
    
    if english_or_not:
        result_dataframe = dataframe[selected_columns]
        return result_dataframe
    else:
        translated_text_list = list(dataframe['translated_text'])
        filtered_dataframe = dataframe[selected_columns]
        filtered_dataframe_copy = filtered_dataframe.copy()
        filtered_dataframe_copy['cleaned_text'] = translated_text_list
        return filtered_dataframe_copy

In [83]:
def transform_string_time_to_datetime(string):
    """
    :param string: the string which records the time of the posted tweets(this string's timezone is HK time)
    :return: a datetime object which could get access to the year, month, day easily
    """
    datetime_object = datetime.strptime(string, '%Y-%m-%d %H:%M:%S+08:00')
    final_time_object = datetime_object.replace(tzinfo=time_zone_hk)
    return final_time_object

In [19]:
emoji_dict = pd.read_pickle(os.path.join(tweet_2017_path, 'emoji.pkl'))

In [24]:
tweet_combined_dataframe = pd.read_csv(os.path.join(tweet_combined_path, 'tweet_combined_in_hk_withoutbot_step2.csv'), 
                                      encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC, dtype='str', index_col=0)

In [25]:
tweet_combined_dataframe.head()

Unnamed: 0,Unnamed: 0.1,created_at,id_str,lang,lat,lon,place_id,place_lat,place_lon,place_name,...,user_lang,user_url,verified,hk_time,year,month,month_plus_year,day,TPU_longitudinal,TPU_cross_sectional
0,12,Sat May 07 06:18:59 +0000 2016,7.28831e+17,en,22.2788499,114.18462,,22.271674,114.185178,,...,en,http://www.facebook.com/derekhysteric525,False,2016-05-07 14:18:59+08:00,2016,5,2016_5,7,146,146 - 147
1,19,Sat May 07 07:02:19 +0000 2016,7.28842e+17,en,22.31530176,113.9348316,,22.2465325,114.064237,,...,en,http://manishmaurya89.blogspot.com/?m=1,False,2016-05-07 15:02:19+08:00,2016,5,2016_5,7,951,950 - 951
2,20,Sat May 07 07:02:34 +0000 2016,7.28842e+17,en,22.27680815,113.9161873,,22.2465325,114.064237,,...,en,http://kotakitam.wordpress.com,False,2016-05-07 15:02:34+08:00,2016,5,2016_5,7,943,941 - 943
3,21,Sat May 07 07:03:11 +0000 2016,7.28843e+17,en,22.27564274,114.1711743,,22.271674,114.185178,,...,en,http://www.denkipenki.vsco.co,False,2016-05-07 15:03:11+08:00,2016,5,2016_5,7,131,131
4,32,Sat May 07 07:05:37 +0000 2016,7.28843e+17,en,22.30277224,114.0117169,,22.2465325,114.064237,,...,en,http://about.me/natalia.segura,False,2016-05-07 15:05:37+08:00,2016,5,2016_5,7,971,971 - 974


## 1. Get the English Tweets and the Chinese Tweets

In [26]:
english_tweets = tweet_combined_dataframe.loc[tweet_combined_dataframe['lang'] == 'en']
chinese_tweets = tweet_combined_dataframe.loc[tweet_combined_dataframe['lang'] == 'zh']

## 2. Clean the English Tweets and Chinese Tweets

### Clean the English tweets

In [41]:
%%time
english_tweets['cleaned_text'] = english_tweets.apply(
    lambda row: clean_english_tweet_for_review(row['text'], emoji_dict), axis = 1)

Wall time: 17h 19min 38s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [42]:
en_tweets_sample = english_tweets[['text', 'cleaned_text', 'url']].sample(10)
en_tweets_sample

Unnamed: 0,text,cleaned_text,url
414677,Here we go... typhoon signal no. 8 is up.\r\r\...,here we go repeated typhoon signal no number...,https://t.co/w4JgKFOEEE
321352,"When I passed by here, came across four uncles...",when i passed by here came across four uncles...,https://t.co/UOhrwPHpu9
403290,#cyberformulagpx #aoiogre #an21 at #tsimshatsu...,hashtag cyber formula gpx hashtag hashtag aoi ...,https://t.co/nG7JbAa8m4
234004,Could this be #santa car?\r\r\r\nOr his ex-gir...,could this be hashtag santa hashtag car or hi...,https://t.co/3WCmB8FywL
297127,2500m swim session today with 4 different stro...,2 5 0 0 m swim session today with number diffe...,https://t.co/obJn5z17ef
321824,Only chocolate í ¼í½« makes me happy during fi...,only chocolate 🍫 makes me happy during final 😭...,https://t.co/uKZRuRFBwy
42622,#sunset on my #trip to #hongkong #instagramhub...,hashtag sunset hashtag on my hashtag trip hash...,https://t.co/slVmpcYiyg
30595,#brexitin5words or one picture? @drjesimon #re...,hashtag brexit in5 words hashtag or one pictur...,https://t.co/8ZfdnAbAku
264787,Just some #BioQueens enjoying themselves on a ...,just some hashtag bio queens hashtag enjoying ...,https://t.co/WW7r0vdGRP
3525,"Pacific Coffee at night, Mui Wo ferry pier. #c...",pacific coffee at night mui wo ferry pier ha...,https://t.co/MgOzNU3eFm


In [43]:
english_tweets.to_csv(os.path.join(tweet_combined_path, 'tweet_combined_english.csv'), encoding='utf-8', 
                      quoting=csv.QUOTE_NONNUMERIC)

### Clean the Chinese tweets

In [30]:
%%time
chinese_tweets['cleaned_text'] = chinese_tweets.apply(lambda row: clean_chinese_tweet_for_review(
    row['text'], emoji_dict), axis = 1)

Wall time: 5h 42min 22s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [38]:
zh_tweets_sample = chinese_tweets[['text', 'cleaned_text', 'url', 'year', 'hk_time', 'created_at']].sample(10)
zh_tweets_sample

Unnamed: 0,text,cleaned_text,url,year,hk_time,created_at
286025,é ­åæè¬åäºï¼éå·¦å»å°±ç®ï¼åä½¿æ...,頭先所講嘅事，過左去就算，唔使放喺心，睇波！btw，今日夢想沙龍乜事？對南區成隊飲左redb...,https://t.co/i0ZWZCXFTd,2017,2017-10-01 19:08:25+08:00,Sun Oct 01 11:08:25 +0000 2017
384814,í ½í± @ Ocean Park Hong Kong é¦æ¸¯æµ·æ´å¬å...,👌 @ ocean park hong kong 香港海洋公園 https://t.co/y...,https://t.co/yfrQFSEblU,2018,2018-06-10 11:14:13+08:00,Sun Jun 10 03:14:13 +0000 2018
159369,<U+5EFA><U+8A2D><U+4E2D><U+3002><U+5C1A><U+572...,建設中。尚在討論建設中。尚在先決定後咨詢建設中。尚在建設衰敗中...... @ west k...,https://t.co/9vuJN0AxkC,2017,2017-01-22 18:39:03+08:00,Sun Jan 22 10:39:03 +0000 2017
395975,I'm at è±è¨éèä» in Tsim Sha Tsui https:...,i'm at 豐記雞蛋仔 in tsim sha tsui https://t.co/krt...,https://t.co/KRtF9ijaxj,2018,2018-07-15 12:12:17+08:00,Sun Jul 15 04:12:17 +0000 2018
422639,@n_t_lie @Dualman @miniyoshima @uituit @28481k...,@n_t_lie @dualman @miniyoshima @uituit @28481k...,https://t.co/VtYoWqvN8a,2018,2018-10-10 16:11:24+08:00,Wed Oct 10 08:11:24 +0000 2018
166,<U+9EC3><U+91D1><U+6BD2><U+8CE4><U+86D9> (@ Go...,黃金毒賤蛙 (@ golden computer arcade 黃金電腦商場 in sham...,https://t.co/KaeBx77XF2,2016,2016-05-07 17:39:55+08:00,Sat May 07 09:39:55 +0000 2016
80804,<U+5C0D><U+9762><U+6709><U+96BB><U+9CE5> @ <U+...,對面有隻鳥 @ 元朗西鐵站 https://t.co/9j0h0ivzjl,https://t.co/9j0h0IVZJl,2016,2016-09-21 14:40:03+08:00,Wed Sep 21 06:40:03 +0000 2016
353194,é¢éåªçºä¸æ¬¡åè¨\r\r\r\nè½é¨å²å¢æ¿...,離開只為下次再臨落雨啲嘢濕哂要返屋企洗嘢😑#逃脫 @ 東龍 https://t.co/d1p...,https://t.co/D1pZJmcWz9,2018,2018-03-04 14:02:36+08:00,Sun Mar 04 06:02:36 +0000 2018
116563,Today's lunch is <U+8C6C><U+6252><U+5305><U+54...,today's lunch is 豬扒包和奶茶。 @ 蘭芳園 (中環) https://t....,https://t.co/fTIofoCX0X,2016,2016-11-19 11:16:50+08:00,Sat Nov 19 03:16:50 +0000 2016
24345,I'm at Main St Station <U+7F8E><U+570B><U+5C0F...,i'm at main st station 美國小鎮大街火車站 in lantau isl...,https://t.co/7voQvQ0NsJ,2016,2016-06-16 09:53:48+08:00,Thu Jun 16 01:53:48 +0000 2016


In [39]:
chinese_tweets.to_csv(os.path.join(tweet_combined_path, 'tweet_combined_chinese.csv'), encoding='utf-8', 
                      quoting=csv.QUOTE_NONNUMERIC)

## 3. Translate the Chinese Tweets to English Tweets

In [48]:
chinese_tweets_copy = chinese_tweets.copy()

In [53]:
chinese_tweets_copy['translated_text'] = chinese_tweets_copy.apply(
    lambda row: get_translated_text(row['cleaned_text']), axis=1)

In [61]:
chinese_tweets_copy.to_csv(os.path.join(tweet_combined_path, 'tweets_combined_chinese_translated.csv'), encoding='utf-8', 
                          quoting=csv.QUOTE_NONNUMERIC)

# 4. Combine the Processed Chinese Tweets and English Tweets

In this section, before combining the English tweets and Chinese tweets together, we should consider the following steps:

1. delete rows of which the ```translated_text``` column is None
2. Use the ```translated_text``` as the ```cleaned_text``` to combine two tweet dataset together

After finishing the above two steps, order the combined data again and sort it by time. Finally, save the combined dataframe to the local directory.

In [71]:
chinese_tweets_copy[chinese_tweets_copy['translated_text'].isnull()]

Unnamed: 0,Unnamed: 0.1,created_at,id_str,lang,lat,lon,place_id,place_lat,place_lon,place_name,...,verified,hk_time,year,month,month_plus_year,day,TPU_longitudinal,TPU_cross_sectional,cleaned_text,translated_text


In [94]:
english_tweets.columns

Index(['Unnamed: 0.1', 'created_at', 'id_str', 'lang', 'lat', 'lon',
       'place_id', 'place_lat', 'place_lon', 'place_name', 'text', 'time_zone',
       'truncated', 'url', 'user_created_at', 'user_id_str', 'user_lang',
       'user_url', 'verified', 'hk_time', 'year', 'month', 'month_plus_year',
       'day', 'TPU_longitudinal', 'TPU_cross_sectional', 'cleaned_text'],
      dtype='object')

In [96]:
english_tweet_dataframe = build_selected_tweet_dataframe(english_tweets, english_or_not=True)
chinese_tweet_dataframe = build_selected_tweet_dataframe(chinese_tweets_copy, english_or_not=False)

In [98]:
combined_tweet_dataframe = pd.concat([english_tweet_dataframe, chinese_tweet_dataframe], axis=0)

In [99]:
combined_tweet_dataframe.shape

(431074, 26)

In [100]:
combined_tweet_dataframe.head()

Unnamed: 0,created_at,id_str,lang,lat,lon,place_id,place_lat,place_lon,place_name,text,...,user_url,verified,hk_time,year,month,month_plus_year,day,TPU_longitudinal,TPU_cross_sectional,cleaned_text
0,Sat May 07 06:18:59 +0000 2016,7.28831e+17,en,22.2788499,114.18462,,22.271674,114.185178,,#Working #Saturday #Afternoon! #Final #Touch i...,...,http://www.facebook.com/derekhysteric525,False,2016-05-07 14:18:59+08:00,2016,5,2016_5,7,146,146 - 147,hashtag working hashtag hashtag saturday hasht...
1,Sat May 07 07:02:19 +0000 2016,7.28842e+17,en,22.31530176,113.9348316,,22.2465325,114.064237,,I'm at Hong Kong International Airport <U+9999...,...,http://manishmaurya89.blogspot.com/?m=1,False,2016-05-07 15:02:19+08:00,2016,5,2016_5,7,951,950 - 951,i am at hong kong international airport u nu...
2,Sat May 07 07:02:34 +0000 2016,7.28842e+17,en,22.27680815,113.9161873,,22.2465325,114.064237,,The cable car ride... #cablecar #mountain #360...,...,http://kotakitam.wordpress.com,False,2016-05-07 15:02:34+08:00,2016,5,2016_5,7,943,941 - 943,the cable car ride repeated hashtag cable car...
3,Sat May 07 07:03:11 +0000 2016,7.28843e+17,en,22.27564274,114.1711743,,22.271674,114.185178,,Love Roses! <ed><U+00A0><U+00BD><ed><U+00B8><U...,...,http://www.denkipenki.vsco.co,False,2016-05-07 15:03:11+08:00,2016,5,2016_5,7,131,131,love roses 😍 😍 😍 🌹 🌷 💐 hashtag flower hashtag...
4,Sat May 07 07:05:37 +0000 2016,7.28843e+17,en,22.30277224,114.0117169,,22.2465325,114.064237,,Preparing for a photo shoot #glamourshots #bla...,...,http://about.me/natalia.segura,False,2016-05-07 15:05:37+08:00,2016,5,2016_5,7,971,971 - 974,preparing for a photo shoot hashtag glamour sh...


In [103]:
combined_tweet_dataframe_copy = combined_tweet_dataframe.copy()

combined_tweet_dataframe_copy['hk_time'] = combined_tweet_dataframe_copy.apply(lambda row: transform_string_time_to_datetime(row['hk_time']), axis=1)

In [104]:
combined_tweet_dataframe_copy_sorted = combined_tweet_dataframe_copy.sort_values(by='hk_time')

In [106]:
combined_tweet_dataframe_copy_sorted[['hk_time', 'text', 'cleaned_text', 'lang']].head()

Unnamed: 0,hk_time,text,cleaned_text,lang
0,2016-05-07 14:12:59+08:00,#Working #Saturday #Afternoon! #Final #Touch i...,hashtag working hashtag hashtag saturday hasht...,en
1,2016-05-07 14:56:19+08:00,I'm at Hong Kong International Airport <U+9999...,i am at hong kong international airport u nu...,en
2,2016-05-07 14:56:34+08:00,The cable car ride... #cablecar #mountain #360...,the cable car ride repeated hashtag cable car...,en
3,2016-05-07 14:57:11+08:00,Love Roses! <ed><U+00A0><U+00BD><ed><U+00B8><U...,love roses 😍 😍 😍 🌹 🌷 💐 hashtag flower hashtag...,en
4,2016-05-07 14:59:37+08:00,Preparing for a photo shoot #glamourshots #bla...,preparing for a photo shoot hashtag glamour sh...,en


In [107]:
combined_tweet_dataframe_copy_sorted.to_csv(os.path.join(tweet_combined_path, 'tweet_combined_cleaned_translated.csv'), 
                                           encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

In [121]:
stopwords = list(set(STOPWORDS))
strange_terms = ['allcaps', 'repeated', 'elongated', 'repeat', 'user', 'percent_c', 'hong kong', 'hong',
                 'kong', 'u_u', 'u_u_number', 'u_u_u_u', 'u_number', 'elongate', 'u_number_u',
                 'u', 'number', 'm', 'will', 'hp', 'grad', 'ed', 'boo', 'url', 'hashtag']
unuseful_terms = stopwords + strange_terms
unuseful_terms_set = set(unuseful_terms)

In [135]:
english_words = set(nltk.corpus.words.words())
english_words_lower = set(word.lower() for word in english_words)
# emoji_dict = pd.read_pickle(os.path.join(read_data.tweet_2017, 'emoji.pkl'))
emoji_list = list(emoji_dict['emoji'])
english_words_lower.update(emoji_list)
# Add station names to the english word set
station_location = pd.read_csv(os.path.join(tweet_2017_path, 'station_location.csv'))
station_names_list = list(station_location['Name'])
names_lower = [word_tokenize(name.lower()) for name in station_names_list]
words = []
for word_list in names_lower:
    for word in word_list:
        words.append(word)
english_words_lower.update(words)

In [136]:
len(english_words_lower)

235928

In [137]:
processed_list = []

for index, text in enumerate(list(combined_tweet_dataframe_copy_sorted['cleaned_text'])):
    text_list = text.split()
    processed_text_list_step1 = [text_step1 for text_step1 in text_list if text_step1 not in unuseful_terms_set]
    processed_text_list_step2 = [text_step2 for text_step2 in text_list if text_step2 in english_words_lower]
    if len(processed_text_list_step2) != 0:
        processed_strings = ' '.join(processed_text_list_step2)
        processed_list.append(processed_strings)
    else:
        processed_list.append(text)
        
    if (index + 1) % 10000 == 0:
        print("The first {} has been processed!".format(index+1))
    else:
        pass

The first 10000 has been processed!
The first 20000 has been processed!
The first 30000 has been processed!
The first 40000 has been processed!
The first 50000 has been processed!
The first 60000 has been processed!
The first 70000 has been processed!
The first 80000 has been processed!
The first 90000 has been processed!
The first 100000 has been processed!
The first 110000 has been processed!
The first 120000 has been processed!
The first 130000 has been processed!
The first 140000 has been processed!
The first 150000 has been processed!
The first 160000 has been processed!
The first 170000 has been processed!
The first 180000 has been processed!
The first 190000 has been processed!
The first 200000 has been processed!
The first 210000 has been processed!
The first 220000 has been processed!
The first 230000 has been processed!
The first 240000 has been processed!
The first 250000 has been processed!
The first 260000 has been processed!
The first 270000 has been processed!
The first 

In [140]:
combined_tweet_dataframe_copy_sorted['cleaned_text'] = processed_list

In [141]:
combined_tweet_dataframe_copy_sorted.to_csv(os.path.join(tweet_combined_path, 'tweet_combined_cleaned_translated.csv'), 
                                           encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)