In [2]:
from pymongo import MongoClient
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared WordNet lemmatizer instance; the cleaning loop calls wnl.lemmatize() on every token
wnl = WordNetLemmatizer()
# WordNet corpus reader: penn_to_wn() returns its POS constants (wn.ADJ, wn.NOUN, wn.ADV, wn.VERB)
from nltk.corpus import wordnet as wn

# Emoticon lexicons, split into negative and positive entries (each list also
# ends with a few emoji).
# Fix: the ':-<' entry previously carried a stray trailing newline (':-<\n'),
# so it could only match when the emoticon was immediately followed by a line
# break.
# NOTE(review): ':?C' (here) and ':?)' (in pos_emo) look like mis-encoded
# characters -- confirm against the list these were copied from.
neg_emo = ['>:[', ':-(', ':(', ':-c', ':c', ':-<', ':?C', ':<', ':-[', ':[', ':{',
           ':-||', ':@', ":'-(", ":'(", 'D:<', 'D:', 'D8', 'D;', 'D=', 'DX', 'v.v',
           "D-':", '>:O', ':-O', ':O', '∞o∞', '∞O∞', ':O', 'o_O', 'o_0', 'o.O', '8-0',
           '>:\\', '>:/', ':-/', ':-.', ':/', ':\\', '=/', '=\\', ':L', '=L', ':S', '>.<',
           ':-|', ':$', ':-X', ':X', ':-#', ':#', '>:)', '>;)', '>:-)', '}:-)', '}:)',
           '3:-)', '3:)', ':-&', ':&', '#-)', '%-)', '%)', ':-###..', ':###..', '<:-|',
           '</3', ':-\\\\', ':\\\\', ':-/', ':/', ')-:', '):', ';-(', ';(', '😩','🙏','😂','😭']
# NOTE(review): '0:3 ' and '^5 ' carry a trailing space, so they only match
# when a space follows -- confirm whether that is intended.
pos_emo =[':-)', ':)', ':o)', ':]', ':3', ':c)',':>','=]','8)','=)',':}',':^)',':?)',':-D',
          ':D','8-D','8D','x-D','xD','X-D','XD','=-D','=D','=-3','=3','B^D',":-))",'(-:',
          '(:','B-)',';-)',';)',':-P',':P','<3',":'-)",":')",':*',':^*',"('}{' )",'*-)','*)',
          ';-]',';]',';D',';^)',':-,','>:P','X-P','x-p','xp','XP',':-p',':p','=p',':-\xde',
          ':\xde',':-b',':b','O:-)','0:-3','0:3 ','0:-)','0:)','0;^)','o/\\o','^5 ','>_>^ ^<_<',
          '|;-)','|-O','👌','😘','💋','😍']

# totalEmo: positive followed by negative emoticons (the lists contain some duplicates)
totalEmo = pos_emo + neg_emo
# totalEmo2: the same entries de-duplicated into a set
totalEmo2 = set(totalEmo)

# _______________________________________
# Characters stripped from tweet text: ASCII punctuation plus assorted
# symbols and emoji (the list keeps its original order and duplicates).
punc = [
    '!', '#', '"', '%', '$', "'", '&', ')', '(', '+', '*', '-', ',', '/', '.', ';', ':',
    '=', '<', '?', '>', '[', ']', '\\', '_', '^', '`', '{', '}',
    '💥', '🔥', '🌷', '😉', '😌',
    '|', '~', '↺',
    '🌲', '🏁', '🙋', '🍷', '👩', '🍸', '🍹',
    '⏬', '⏬', '⏬', '⏩', '📹',
]
# exclude: set form of punc, for fast per-character membership tests
exclude = {*punc}

# _______________________________________
# Stopwords
# The corpus is read through nltk.corpus instead of the imported `stopwords`
# name: this assignment rebinds `stopwords`, so running the statement a second
# time (e.g. re-executing the notebook cell) would otherwise call .words() on
# the previous result and fail. A set is used because the cleaning loop only
# does membership tests, once per token.
stopwords = set(nltk.corpus.stopwords.words('english'))
#_______________________________________
# Regex patterns deleted from tweet text by remove(): URLs and @mentions.
# Raw strings keep the backslash in "\w" from being read as a (invalid)
# string escape sequence.
ignore_regex = [
    r"https?[\w:/.?=]*",   # URLs: "http"/"https" plus any run of word chars and : / . ? =
    r"@\w*",               # @mentions: "@" plus any run of word characters
]


def remove(text):
    """Return *text* with every match of the ``ignore_regex`` patterns deleted.

    The patterns are applied in list order and each match is replaced by the
    empty string, so whitespace around a removed URL or mention is left as is.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; *text* itself is not modified.
    """
    for pattern in ignore_regex:
        text = re.sub(pattern, '', text)
    return text

# Open a client using MongoClient's default connection settings
client = MongoClient()

# Database handle: "tweetsdb_2"
db = client["tweetsdb_2"]

# Source collection: "rawtweets"
# Change the name if you used another collection to store tweets.
tweets = db["rawtweets"]

# Drop the "modtweet" collection so the modified tweets are written to an empty collection
db["modtweet"].drop()
    
# _______________________________________
# Modifier for the Pos Tager

def is_noun(tag):
    """Return True if *tag* is one of the noun POS tags (NN, NNS, NNP, NNPS)."""
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    return tag in noun_tags

def is_verb(tag):
    """Return True if *tag* is one of the verb POS tags (VB, VBD, VBG, VBN, VBP, VBZ)."""
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    return tag in verb_tags

def is_adverb(tag):
    """Return True if *tag* is one of the adverb POS tags (RB, RBR, RBS)."""
    adverb_tags = ('RB', 'RBR', 'RBS')
    return tag in adverb_tags

def is_adjective(tag):
    """Return True if *tag* is one of the adjective POS tags (JJ, JJR, JJS)."""
    adjective_tags = ('JJ', 'JJR', 'JJS')
    return tag in adjective_tags
# WordNet POS constants: wn.ADJ is 'a', wn.VERB is 'v', wn.NOUN is 'n', wn.ADV is 'r'
def penn_to_wn(tag):
    """Translate a POS tag into the matching WordNet POS constant.

    The tag is tested as adjective, noun, adverb and verb, in that order, and
    the constant of the first category that accepts it is returned. Tags that
    belong to none of the four categories yield None.
    """
    category_checks = (
        (is_adjective, wn.ADJ),
        (is_noun, wn.NOUN),
        (is_adverb, wn.ADV),
        (is_verb, wn.VERB),
    )
    for belongs_to, wordnet_pos in category_checks:
        if belongs_to(tag):
            return wordnet_pos
    return None

# _______________________________________
######## Start Cleaning ############
# _______________________________________

# Fetch only the 'text' field of every document in the raw-tweet collection.
tw = tweets.find({}, {'text': 1, '_id': 0})

# Standalone "rt" retweet marker in any letter case. It is replaced by a space
# (not the empty string) so the words on either side are never glued together.
_RT_MARKER = re.compile(r'\brt\b', re.IGNORECASE)


def _build_emoticon_regex(emoticons):
    """Compile a single regex that matches any entry of *emoticons*.

    Longer entries come first so that ':-))' wins over ':-)'. When an entry
    starts or ends with a letter or digit, the match must not continue into a
    neighbouring word character on that side; this keeps 'xp' from matching
    inside "express" and ':3' from matching inside "10:30".
    """
    alternatives = []
    for emo in sorted({e for e in emoticons if e}, key=lambda e: (-len(e), e)):
        pattern = re.escape(emo)
        if emo[0].isalnum():
            pattern = r'(?<!\w)' + pattern
        if emo[-1].isalnum():
            pattern += r'(?!\w)'
        alternatives.append(pattern)
    if not alternatives:
        # Never-matching pattern, so an empty lexicon does not match everywhere.
        return re.compile(r'(?!)')
    return re.compile('|'.join(alternatives))


_EMOTICON_RE = _build_emoticon_regex(totalEmo2)


def _lemmatize(tokens):
    """POS-tag *tokens* and return their WordNet lemmas as a list."""
    lemmas = []
    for word, tag in nltk.pos_tag(tokens):
        wn_pos = penn_to_wn(tag)
        if wn_pos is None:
            # No WordNet equivalent for this tag: use the lemmatizer's default POS.
            lemmas.append(wnl.lemmatize(word))
        else:
            lemmas.append(wnl.lemmatize(word, wn_pos))
    return lemmas


def _clean_tweet(text):
    """Return the cleaned, lemmatized form of one tweet's text.

    Steps: drop retweet markers, URLs and @mentions; pull out the emoticons;
    lowercase and strip punctuation; tokenize and lemmatize; append the
    emoticons after the words; drop stopwords. Tokens are joined with single
    spaces, without leading or trailing blanks.
    """
    # 1. Retweet markers first, then URLs and @mentions via remove().
    text = remove(_RT_MARKER.sub(' ', text))

    # 2. Emoticons are matched on the original-case text: entries such as
    #    ':D' or 'XD' contain capitals and can never match lowercased input.
    #    Each distinct emoticon is kept once, in order of first appearance,
    #    which makes the output order deterministic.
    emoticons = list(dict.fromkeys(_EMOTICON_RE.findall(text)))
    text = _EMOTICON_RE.sub(' ', text)

    # 3. Lowercase and strip punctuation characters.
    text = ''.join(ch for ch in text.lower() if ch not in exclude)

    # 4. Tokenize, lemmatize, re-attach emoticons, drop stopwords.
    lemmas = _lemmatize(nltk.word_tokenize(text))
    kept = [tok for tok in lemmas + emoticons if tok and tok not in stopwords]
    return ' '.join(kept)


for doc in tw:  # doc: dictionary object, e.g., {'text': 'Apple iPhone 6 - 64GB - Silver (AT&amp;T)'}
    raw_text = doc.get('text')
    if not isinstance(raw_text, str):
        # No usable 'text' value in this document: nothing to clean.
        continue

    lema_string = _clean_tweet(raw_text)

    print('')
    print('modified: ')
    print(lema_string)

    # saving data to a new Collection called "modtweet"
    db.modtweet.insert_one({"text": lema_string})

print('Cleaning done')
    



modified: 
 mira whatsapp aaaaaaaa 

modified: 
 whatsapp developer offer endtoend encryption nokia phone however user see new display 

modified: 
 size 2634 available pick updelivery whatsapp2348091222253 bbm59240cf9 polo 3500 and… 

modified: 
 whatsapp number choose perfect profile pic initiate chat … 

modified: 
 accord police juma gun 0930pm whatsapp 25426 active 2205pm rip jj … 

modified: 
 whatsapp number choose perfect profile pic initiate chat … 

modified: 
 open member 👉 

modified: 
 help hi pre order bts young forever daynight ver due 19th may 12 noon rm92wmrm97em whatsapp 0197748160 tq 

modified: 
 whatsapp developer offer endtoend encryption nokia phone neurogadget 

modified: 
 open member amp ps info whatsapp 087889355679 

modified: 
 create meme following message hate people never chat steal whatsapp profile picture 

modified: 
 help hi pre order bts young forever daynight ver due 19th may 12 noon rm92wmrm97em whatsapp 0197748160 tq 

modified: 
 conversation m