In [41]:
# Import library
import pandas as pd
import numpy as np
import pymongo
import json
import datetime
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

In [48]:
# Load json data into dataframe (read directly from Mongodb)

# Declare MONGODB Instance (localhost)
mongo_client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
mongo_db = mongo_client['twitterdb_update']
mongo_collection = mongo_db['twitter_trump']

# read all data into pandas dataframe
df = pd.DataFrame(list(mongo_collection.find()))

In [49]:
df.shape

(3006, 9)

In [50]:
df.head()

Unnamed: 0,_id,created_at,id,screen_name:,lang,is_quote_status,is_retweet_status,full_text,quote_text
0,5ebe3bd2403dbfa81a78369a,2020-05-15 06:50:51,1261187267852451840,FalconhunterNRA,en,False,True,ICYMI: #FLYNNATTORNEY: Entrapment Plan Orchest...,
1,5ebe3bd6403dbfa81a78369b,2020-05-15 06:50:54,1261187283530788867,RafaelGarciaLAF,es,False,True,"#Trump dice que los médicos y enfermeras ""corr...",
2,5ebe3bdc403dbfa81a78369c,2020-05-15 06:51:00,1261187309204103168,ZA1194,en,False,False,@Neganwillclocku @AngelaBelcamino @realDonaldT...,
3,5ebe3bdd403dbfa81a78369d,2020-05-15 06:51:01,1261187312005926913,gary_burch,en,False,True,"When this pandemic is all over, the four count...",
4,5ebe3bdd403dbfa81a78369e,2020-05-15 06:51:01,1261187312429314048,Praveenkumarur3,hi,False,False,500 अरब डॉलर की बचत होगी.'' ट्रंप ने चीन से सा...,


### Things to clean:
- Checking any missing/ duplicated value
- Extract data from json object inside cell for columns `_id`, `created_at`, `id`
- Rename columns

### Text cleaning:
- For `full_text`, `quote_text` column, text preprocessing is required for sentiment analysis
- Convert to lowercase, remove noise and stopword, tokenization

Before data cleaning, create another copy of dataframe.

In [51]:
df_clean = df.copy()

#### Checking missing value

In [52]:
# Checking missing value
df_clean.isnull().sum()

_id                  0
created_at           0
id                   0
screen_name:         0
lang                 0
is_quote_status      0
is_retweet_status    0
full_text            0
quote_text           0
dtype: int64

No missing value is found.

#### Checking duplicated value

In [53]:
# Checking duplicated value
df_clean.duplicated().sum()

0

No duplicate row is found.

#### Drop Columns
`_id` is id from mongo database, thus, it is not required in this project and could be dropped.

In [54]:
df_clean.drop('_id', axis =1, inplace =True)

#### Rename Columns
- Change `id` to `user_id`  
- Change `screen_name:` to `username`
- Change `lang` to `language`

In [55]:
df_clean.rename(columns = {'id':'user_id', 'screen_name:':'username', 'lang':'language'}, inplace = True)

In [56]:
df_clean.sample(10)

Unnamed: 0,created_at,user_id,username,language,is_quote_status,is_retweet_status,full_text,quote_text
122,2020-05-15 06:54:59,1261188309784682496,Ukrinform_News,en,False,False,#Trump sends nomination of new ambassador to #...,
2178,2020-05-15 14:23:05,1261301079544823811,banke718,en,False,True,God Bless @realDonaldTrump and his draining th...,
1564,2020-05-15 13:59:00,1261295018704277504,faznet,de,False,False,Den Rücktritt des #WTO-Chefs nutzt #Trump für ...,
1248,2020-05-15 07:42:32,1261200277182570496,CyprusTel,en,False,False,British Writer Pens The Best Description Of Tr...,
597,2020-05-15 07:12:56,1261192829163802624,DocMcQuinnn,en,False,True,Fuhrer Democrat Gov. Gretchen Whitmer seeks re...,
32,2020-05-15 06:51:49,1261187513990803456,qwertyuiioasdg,zh,False,True,#特朗普 延長行政令，繼續封殺 #華為、 #中興 等中企\n全文👉https://t.co/...,
2771,2020-05-15 14:37:24,1261304681411817473,GhostofReason1,en,False,True,"@JonLemire Hmm, maybe because #Trump wants to ...",
291,2020-05-15 07:01:17,1261189895659421698,GTRao,en,False,True,#Trump\n#Taiwan's #TSMC Announces Intention t...,
729,2020-05-15 07:18:55,1261194334571036674,iamlmch,en,False,False,I wonder how many people have watched @netflix...,
2020,2020-05-15 14:18:56,1261300034030755840,vladivos7,fr,True,True,Vivement le grand ménage aussi en #France !\n#...,Les noms des membres de l’administration Obama...


#### Define Data Cleaning Functions
For tweet cleaning, may refer [here](https://towardsdatascience.com/extracting-twitter-data-pre-processing-and-sentiment-analysis-using-python-3-0-7192bd8b47cf).

In [57]:
# Convert all characters to lowercase
def to_lowercase(text):
    text  = text.lower()
    return text

In [58]:
#Emoji patterns
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)

In [59]:
# Remove Emoji
def remove_emoji(text):
    text = emoji_pattern.sub(r'', text)
    return text

In [60]:
# Remove Mentions
def remove_mention(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    return text

In [61]:
# Remove Hashtag
def remove_hashtag(text):
    text = re.sub(r'#\w+', '', text)
    return text

In [62]:
# Remove URL
def remove_url(text):
    text = re.sub(r'\b(?:(?:https?|ftp)://)?\w[\w-]*(?:\.[\w-]+)+\S*', ' ', text)
    return text

In [63]:
# Replace consecutive non-ASCII characters with a space
def replace_nonASCII(text):
    text = re.sub(r'[^\x00-\x7F]+',' ', text)
    return text

In [64]:
# Remove punctuation
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

In [65]:
# Remove stopwords
stopwordsList = set(stopwords.words('english')) 
def remove_stopwords(text):
    text = [word for word in text if word not in stopwordsList]
    return text

In [66]:
# Tokenize words
def tokenize(text):
    word_tokens = word_tokenize(text)
    return word_tokens

In [67]:
# Combine all the functions
def datapreprocessing(review):
    
    # Convert the text into lowercase
    review = to_lowercase(review)
    
    # Clean tweet
    review = remove_mention(review)
    review = remove_url(review)
    review = remove_hashtag(review)
    review = remove_emoji(review)
    review = replace_nonASCII(review)
        
    # Remove punctuation
    review = remove_punct(review)
    
    # Tokenization
    review = word_tokenize(review)
    
    # Remove stopwords
    review = remove_stopwords(review)
    
    #return review
    return " ".join(review)

Before text preprocessing, we would like to remain all the text in **English** only.

In [68]:
df_clean['language'].value_counts()

en     2174
fr      163
de      163
und     139
it      104
es       77
nl       53
hi       47
tr       28
ja       12
ta        9
gu        5
in        5
th        4
pl        4
pt        3
zh        3
fa        1
ru        1
ca        1
bn        1
ar        1
lv        1
tl        1
ne        1
fi        1
el        1
ml        1
cy        1
te        1
Name: language, dtype: int64

In [69]:
df_clean = df_clean.loc[df_clean['language'] == 'en']

In [70]:
cols = ['full_text', 'quote_text']

for c in cols:
    df_clean[c] = df_clean[c].apply(lambda x: datapreprocessing(x))

In [71]:
df_clean.sample(10)

Unnamed: 0,created_at,user_id,username,language,is_quote_status,is_retweet_status,full_text,quote_text
330,2020-05-15 07:02:51,1261190290255368193,UncommonKat,en,False,False,highest death toll earth thats lack leadership...,
1721,2020-05-15 14:02:24,1261295873587245056,jimquigley48,en,False,True,truth matter,
659,2020-05-15 07:16:22,1261193690556571648,bitcoinconnect,en,False,False,watch new covid deaths iowa businesses prepare...,
2025,2020-05-15 14:19:08,1261300083875737600,tamyanne,en,False,True,paul manafort released dems uproar evidence le...,
2455,2020-05-15 14:29:38,1261302724856963073,ibsuzyf,en,False,True,democrat come people,
2782,2020-05-15 14:37:45,1261304768422535169,RealSaleemJuma,en,False,False,male feminist simp based republican chad lmao ...,
2924,2020-05-15 15:05:25,1261311732942684160,ChartrandTyson,en,False,True,china taking advantage united states many many...,
2354,2020-05-15 14:27:10,1261302104016257025,Thor_2000,en,False,True,way hell ever pardon trump,
128,2020-05-15 06:55:08,1261188348619698176,marco_vrg,en,False,False,faces darkest winter planning falters,
1093,2020-05-15 07:35:33,1261198520788283397,Subhank25219819,en,False,True,us president threatened slap new taxes america...,


In [26]:
#df_clean.to_csv('dataset/data_clean.csv', index = False)
df_clean.to_csv('C:/Users/ngwei/Desktop/mlsingapore.csv', index = False)


In [72]:
df_clean.sample(10)

Unnamed: 0,created_at,user_id,username,language,is_quote_status,is_retweet_status,full_text,quote_text
1915,2020-05-15 14:07:45,1261297217329147904,bitcoinconnect,en,False,False,china pushes back virus europe wakes wolf warr...,
279,2020-05-15 07:00:59,1261189820421943297,han_asly,en,True,False,covid response summarised,
687,2020-05-15 07:17:21,1261193940671377408,RavesTrump,en,False,False,trump says testing may frankly overrated well ...,
2079,2020-05-15 14:20:17,1261300374218117121,voteblueforgood,en,False,True,using epidemic tool install,
1536,2020-05-15 13:58:27,1261294878182400000,VendbienJon,en,False,False,may biggest worst hypocrites always blame some...,
737,2020-05-15 07:19:22,1261194446160441344,OsthIntWast,en,False,True,insane administration impose limits perchlorat...,
990,2020-05-15 07:32:27,1261197738240180224,redirectloop,en,False,False,clearly energy spend intellect unless u r sugg...,
2861,2020-05-15 15:03:35,1261311269589704705,barryslee,en,False,False,actual thing word made repeat people believe t...,
1618,2020-05-15 14:00:05,1261295291371859968,Mtherfckerjones,en,False,True,states covidrelated illness kids cuomo says,
2744,2020-05-15 14:36:53,1261304550914310146,xyzcuriosa,en,False,False,ur loser australia would love president rebuil...,


In [73]:
df_clean.shape

(2174, 8)