In [20]:
import numpy as np
import pandas as pd
import re
from spellchecker import SpellChecker
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import demoji
from bs4 import BeautifulSoup
demoji.download_codes()

from datetime import datetime
from tqdm import tqdm

# English stopwords, kept as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Module-level NLP helpers: WordNet lemmatizer, \w+ tokenizer and spell checker.
lemma = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
check = SpellChecker()

# Load the raw (not yet cleaned) tweets that the cleaning steps operate on.
RAW_TWEETS_PATH = 'tweets_only_eng_and_US_without_cleaning_2020-09-24.csv'
data = pd.read_csv(RAW_TWEETS_PATH)


########################### FUNCTIONS ###################################
def correct_text(text):
    """Spell-correct a list of word tokens.

    Parameters
    ----------
    text : list of str
        Clean word tokens (no punctuation or other characters).

    Returns
    -------
    list of str
        The tokens in their original order, duplicates preserved, with every
        occurrence of a misspelled token replaced by the spell checker's
        suggestion. Tokens for which the checker offers no usable suggestion
        are dropped. The input list is left unmodified.
    """
    misspelled = check.unknown(text)
    if not misspelled:
        return list(text)

    # Look up each distinct misspelling only once.
    suggestions = {word: check.correction(word) for word in misspelled}

    corrected = []
    for token in text:
        # unknown() may report words lower-cased, so try the token as written
        # first and fall back to its lower-cased form.
        key = token if token in misspelled else token.lower()
        if key not in misspelled:
            corrected.append(token)
            continue
        suggestion = suggestions[key]
        # Skip tokens without a usable suggestion (None, or a suggestion that
        # is itself an unknown word) instead of keeping a known misspelling.
        if suggestion is not None and suggestion not in misspelled:
            corrected.append(suggestion)
    return corrected

def lemmatize_text(text):
    """Return a new list holding `lemma.lemmatize(token)` for each token of `text`."""
    lemmas = []
    for token in text:
        lemmas.append(lemma.lemmatize(token))
    return lemmas

# removing URL links (http or www pattern)
def rm_URL_lowcase(record: str) -> str:
    """Delete every URL (``http://``, ``https://`` or ``www.`` prefix) from `record`.

    Despite the name, no lower-casing is performed: only the URLs are removed
    and the remaining text is returned as is.
    """
    url_pattern = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
    return url_pattern.sub('', record)

# removing hashtags and @
def rm_hashtags(record: str) -> str:
    """Replace each ``@`` and ``#`` character in `record` with a single space.

    Only the marker characters are replaced; the word that follows them stays.
    """
    marker = re.compile(r'@|#')
    return marker.sub(' ', record)

# removing punctuation
def rm_punctuation(record: str) -> str:
    """Strip every character of `record` that is neither a word character nor whitespace."""
    non_word = re.compile(r'[^\w\s]')
    return non_word.sub('', record)


# removing stopwords - e.g. the, a, an, he
def rm_stopwords(record: str, stop_word_set=None) -> str:
    """Remove stopwords from `record`.

    The text is split on any run of whitespace (spaces, tabs, newlines), so a
    stopword that sits next to a line break is removed as well. The surviving
    words are joined back with single spaces.

    Parameters
    ----------
    record : str
        Text to filter. Matching is exact and case-sensitive.
    stop_word_set : collection of str, optional
        Words to drop. Defaults to the module-level `stop_words` set.

    Returns
    -------
    str
        `record` without the stopwords, words separated by one space.
    """
    if stop_word_set is None:
        stop_word_set = stop_words
    return ' '.join(word for word in record.split() if word not in stop_word_set)

# removing emoji
def rm_emoji(record: str) -> str:
    """Return the result of `demoji.replace(record, " ")`, i.e. emojis swapped for a space."""
    return demoji.replace(record, " ")

# Removing html coding
def rm_html(record: str) -> str:
    """Parse `record` with BeautifulSoup's ``html.parser`` and return only its text content."""
    return BeautifulSoup(record, 'html.parser').get_text()

Downloading emoji data ...
... OK (Got response in 0.91 seconds)
Writing emoji data to C:\Users\PC\.demoji\codes.json ...
... OK


In [21]:
from tqdm.auto import tqdm

# Register `progress_apply` on pandas objects. Calling the classmethod avoids
# instantiating an empty progress bar (which `tqdm_notebook().pandas()` did)
# and does not rely on the deprecated `tqdm_notebook` alias.
tqdm.pandas()

# String-level cleaning steps, applied in order to every tweet in data['text'].
# Each entry is (log label, str -> str function, show a progress bar?).
# The helper functions defined earlier in the notebook are reused instead of
# repeating their regular expressions inline.
# (Newline removal via `rm_newline` was already disabled and stays disabled.)
# NOTE(review): punctuation is stripped before stopwords are removed, and
# tokens such as "wouldnt" show up in the cleaned output — presumably because
# contractions lose their apostrophe before the stopword check. Confirm
# whether that ordering is intended.
string_steps = [
    ("Regex done at:", rm_URL_lowcase, False),
    ("XTML decoding deleted at:", rm_html, True),
    ("Hashtags done at:", rm_hashtags, False),
    ("Numbers replaced at:", lambda x: re.sub(r'[0-9]+', ' ', x), False),
    ("Floors replaced at:", lambda x: re.sub(r'\_', ' ', x), False),
    ("Emojis replaced at:", rm_emoji, True),
    ("Whitespaces done at:", str.rstrip, False),
    ("Lowered at:", str.lower, False),
    ("Punctuation done at:", rm_punctuation, False),
    ("Stop words bye at:", rm_stopwords, False),
]

for label, step, show_progress in string_steps:
    # The slow steps (HTML parsing, emoji replacement) get a progress bar.
    run = data['text'].progress_apply if show_progress else data['text'].apply
    data.loc[:, 'text'] = run(step)
    print(label, datetime.now())

# From here on each cell of 'text' holds a list of tokens instead of a string.
print("Start tokenization:", datetime.now())
data.loc[:, 'text'] = data['text'].apply(tokenizer.tokenize)
print("Tokenized at:", datetime.now())
data.loc[:, 'text'] = data['text'].apply(lemmatize_text)
print("Lemmatized at:", datetime.now())

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Regex done at: 2020-09-24 00:21:06.343851


HBox(children=(IntProgress(value=0, max=517138), HTML(value='')))



XTML decoding deleted at: 2020-09-24 00:23:02.999386
Hashtags done at: 2020-09-24 00:23:04.748858
Numbers replaced at: 2020-09-24 00:23:07.233368
Floors replaced at: 2020-09-24 00:23:08.259777


HBox(children=(IntProgress(value=0, max=517138), HTML(value='')))


Emojis replaced at: 2020-09-24 00:27:33.417735
Whitespaces done at: 2020-09-24 00:27:33.727566
Lowered at: 2020-09-24 00:27:34.103208
Punctuation done at: 2020-09-24 00:27:36.637769
Stop words bye at: 2020-09-24 00:27:39.385259
Start tokenization: 2020-09-24 00:27:39.409238
Tokenized at: 2020-09-24 00:27:48.385428
Lemmatized at: 2020-09-24 00:28:14.976252


In [22]:
from datetime import date

# Dated output name of the form cleaned_data_<YYYY-MM-DD>.csv.
# NOTE(review): the name is wrapped in a one-element list, not a bare string —
# confirm that is intended before passing it to a writer.
file_name = ["".join(["cleaned_data_", str(date.today()), ".csv"])]
file_name

['cleaned_data_2020-09-24.csv']

In [23]:
# Rename the 'Unnamed: 0' column to 'initial_index', then display the frame.
data = data.rename({'Unnamed: 0': 'initial_index'}, axis='columns')
data

Unnamed: 0,initial_index,tweet_created_at,user_created_at,text,user_id,user_name,followers_count,friends_count,user_lang,place_type,place_full_name,place_bounding_box,country,tweet_lang,retweet_count,favorite_count
0,1,2016-08-12 10:04:02+00:00,Thu Oct 15 00:28:04 +0000 2009,"[barackobama, fbi, lorettalynch, all, collusio...",82496193,Red Octopus,531,677,en,city,"Baton Rouge, LA","{'type': 'Polygon', 'coordinates': [[[-91.2189...",United States,en,0,0
1,4,2016-08-12 10:04:30+00:00,Mon Aug 20 09:43:48 +0000 2012,"[cnn, newday, clear, trump, deliberately, thro...",769208504,Beverly Spence,2652,2976,en,city,"Baltimore, MD","{'type': 'Polygon', 'coordinates': [[[-76.7115...",United States,en,0,0
2,5,2016-08-12 10:04:46+00:00,Tue May 19 03:18:19 +0000 2009,"[realdonaldtrump, wouldnt, recognize, lie, cam...",41043316,"Asa DeMatteo, Ph.D.",183,98,en,city,"Palm Springs, CA","{'type': 'Polygon', 'coordinates': [[[-116.567...",United States,en,0,0
3,7,2016-08-12 10:04:48+00:00,Sun Aug 07 00:57:29 +0000 2016,"[kid, know, suing, someone, thats, beautiful, ...",762090248159371264,Rafael Alejandro,159,993,en,city,"Secaucus, NJ","{'type': 'Polygon', 'coordinates': [[[-74.0938...",United States,en,0,0
4,8,2016-08-12 10:04:48+00:00,Wed Oct 28 18:34:22 +0000 2009,"[hillaryclinton, cofounder, isi, crooked, evil...",85879639,tom b,68,268,en,city,"Irving, TX","{'type': 'Polygon', 'coordinates': [[[-97.0341...",United States,en,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517133,657300,2016-09-12 13:20:28+00:00,Sun Jul 10 16:13:13 +0000 2011,"[cnbc, squawkalley, realdonaldtrump, kudlow, m...",332888709,Linda Mannes Wildes,2097,2716,en,admin,"Florida, USA","{'type': 'Polygon', 'coordinates': [[[-87.6346...",United States,en,0,0
517134,657302,2016-09-12 13:20:32+00:00,Wed May 02 22:48:50 +0000 2012,"[trump, u, tax, weird, medical, report, whacke...",569487350,Sheryl Berghoff,1997,2599,en,city,"San Diego, CA","{'type': 'Polygon', 'coordinates': [[[-117.282...",United States,en,0,0
517135,657303,2016-09-12 13:20:33+00:00,Fri Apr 01 21:42:36 +0000 2016,"[carolcnn, msm, honest, watch, utube, video, r...",716017946166857728,Karen B,756,1229,en,city,"Coral Gables, FL","{'type': 'Polygon', 'coordinates': [[[-80.2971...",United States,en,0,0
517136,657305,2016-09-12 13:20:38+00:00,Wed Jun 10 08:43:55 +0000 2015,"[interesting, hillary, clinton, crowd, small, ...",3241116564,Robert Chaffin,2827,3970,en,city,"Fairbanks, AK","{'type': 'Polygon', 'coordinates': [[[-147.813...",United States,en,0,0


In [24]:
# Persist the cleaned frame. index=False keeps the positional row index out of
# the file; otherwise it is written as a nameless column that comes back as a
# stray 'Unnamed: 0' column when the CSV is read again.
data.to_csv('tweets_cleaned_text_and_tokenized_2020-09-24.csv', index=False)

In [30]:
import pandas as pd
import numpy as np
from city_state_dict import city_to_state_dict

def get_state(city):
    """Look up `city` in `city_to_state_dict`; return its value, or NaN when it is absent."""
    return city_to_state_dict.get(city, np.nan)

def check_Trump_Clinton(tokenized_text):
    """Label a tweet with the candidate it mentions.

    The keywords are tried in a fixed priority order and the first one found
    in `tokenized_text` (using the ``in`` operator) decides the label:
    'hillaryclinton', then 'realdonaldtrump', then 'hillary', then 'trump'.

    Returns "clinton" or "trump", or NaN when no keyword is found.
    """
    keyword_to_candidate = (
        ('hillaryclinton', "clinton"),
        ('realdonaldtrump', "trump"),
        ('hillary', 'clinton'),
        ('trump', "trump"),
    )
    for keyword, candidate in keyword_to_candidate:
        if keyword in tokenized_text:
            return candidate
    return np.nan

In [31]:
# Reload the cleaned tweets, using the 'initial_index' column as the row index.
# NOTE(review): after the CSV round trip the 'text' column presumably holds the
# string form of each token list rather than a list — confirm before running
# token-membership checks on it.
data = pd.read_csv('tweets_cleaned_text_and_tokenized_2020-09-24.csv', index_col='initial_index')
# Disabled below: row count, state / candidate labelling, and export of the
# labelled frame.
#print(len(data.index))
#data['states'] = data['place_full_name'].progress_apply(lambda row: get_state(row))
#data['candidate'] = data['text'].progress_apply(lambda row: check_Trump_Clinton(row))

#data.to_csv("tweets_with_states_and_candidates_2020-09-24.csv")

517138


HBox(children=(IntProgress(value=0, max=517138), HTML(value='')))




In [35]:
#data[data.states.notnull()]

Unnamed: 0_level_0,Unnamed: 0,tweet_created_at,user_created_at,text,user_id,user_name,followers_count,friends_count,user_lang,place_type,place_full_name,place_bounding_box,country,tweet_lang,retweet_count,favorite_count,states,candidate
initial_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
125788,99213,2016-08-18 04:11:46+00:00,Thu Jun 28 02:45:39 +0000 2012,"['time', 'rule', 'people', 'rule', 'special', ...",620571475,Dan Scavino Jr.,140824,114,en,poi,Manhattan,"{'type': 'Polygon', 'coordinates': [[[-73.9840...",United States,en,0,0,New York,trump
141158,111183,2016-08-19 00:47:39+00:00,Wed May 16 20:36:52 +0000 2012,"['oh', 'yeah', 'got', 'first', 'tattoo', 'last...",582227166,RT,254,199,en,poi,Atlantic City,"{'type': 'Polygon', 'coordinates': [[[-74.1751...",United States,en,0,0,New Jersey,
