In [1]:
import pandas as pd
import re
import unicodedata
import nltk
import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_rows', None)

In [3]:
df = pd.read_csv('train.csv')

In [4]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
df.shape

(7613, 5)

In [7]:
df['location'].fillna('unknown', inplace = True)

In [8]:
df = df.dropna()

In [9]:
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords


def basic_clean(original):
    '''
    The function takes in a string and does the basic clean to the string
    '''
    
    # convert text to all lower case for normalcy. 
    article = original.lower()
    
    # remove any accented, non-ACSII cahracters 
    article = unicodedata.normalize('NFKD', article)\
                .encode('ascii', 'ignore')\
                .decode('utf-8', 'ignore')
    
    # replace anthing that is not a letter, number, whitespace 
    article = re.sub(r"[^a-z0-9'\s]", '', article)
    
    return article                

# %%
def tokenize(original):
    '''
    This function takes in a string and returns a tokenized string. 
    '''
    # Create the object
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    # Use the tokenizer
    article = tokenizer.tokenize(original, return_str=True)
    
    return article

# %%
def stem(article):
    '''
    This function takes in a string and returns a string with words stemmed. 
    '''    
    # Create the nltk stemmer object
    ps = nltk.porter.PorterStemmer()
    
    # Use list comprehension to stemmingly transform all the words in the article
    stems = [ps.stem(word) for word in article.split()]
    
    # Join the stemmed words back to a string
    stemmed_article = ' '.join(stems)
    
    return stemmed_article

# %%
def lemmatize(article):
    '''
    This function takes in a string and returns a string with words lemmatized. 
    '''
    
    # Create the nltk lemmatizer object
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use list comprehension to lemmatizedly transform all the words in the article
    lemmas = [wnl.lemmatize(word) for word in article.split()]
    
    # Join the lemmatized words back to a string
    lemmatized_article = ' '.join(lemmas)
    
    return lemmatized_article

# %%
def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    This function takes in a string, optional extra_words and exlude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in the text
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list
    stopword_list = stopword_list.union(set(extra_words))
    
    # Split words in the string
    words = string.split()
    
    # Create a list of words from the string with stopwords removed and assign to variable
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a varibale
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

# %%
def prep_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with the option
    to pass lists for extra_words and exlucde_words and returns a df with the text article title, 
    original text, stemmed text, lemmatized text, cleaned-tokenized-lemmatized-stopwords removed text.  
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, extra_words=extra_words, exclude_words=exclude_words)\
                            .apply(lemmatize)
    
    df['stemmed'] = df[column].apply(basic_clean).apply(stem)
    
    df['lemmatized'] = df[column].apply(basic_clean).apply(lemmatize)
    
    return df[['id', 'keyword', 'location', column, 'stemmed', 'lemmatized', 'clean']]

In [10]:
tweets = prep_data(df, 'text', extra_words=[], exclude_words=[])

In [11]:
tweets.head()

Unnamed: 0,id,keyword,location,text,stemmed,lemmatized,clean
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,bbcmtd wholesal market ablaz httptcolhyxeohy6c,bbcmtd wholesale market ablaze httptcolhyxeohy6c,bbcmtd wholesale market ablaze httptcolhyxeohy6c
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,we alway tri to bring the heavi metal rt httpt...,we always try to bring the heavy metal rt http...,always try bring heavy metal rt httptcoyao1e0xngw
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,africanbaz break newsnigeria flag set ablaz in...,africanbaze breaking newsnigeria flag set abla...,africanbaze breaking newsnigeria flag set abla...
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,cri out for more set me ablaz,cry out for more set me ablaze,cry set ablaze
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,on plu side look at the sky last night it wa a...,on plus side look at the sky last night it wa ...,plus side look sky last night ablaze httptcoqq...


In [12]:
words = [re.sub(r'([^a-z0-9\s]|\s.\s)', '', doc).split() for doc in df.clean]

In [13]:
words = pd.DataFrame({'words': words})

In [14]:
words.head()

Unnamed: 0,words
0,"[bbcmtd, wholesale, market, ablaze, httptcolhy..."
1,"[always, try, bring, heavy, metal, rt, httptco..."
2,"[africanbaze, breaking, newsnigeria, flag, set..."
3,"[cry, set, ablaze]"
4,"[plus, side, look, sky, last, night, ablaze, h..."


In [16]:
tweets.head()

Unnamed: 0,id,keyword,location,text,stemmed,lemmatized,clean
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,bbcmtd wholesal market ablaz httptcolhyxeohy6c,bbcmtd wholesale market ablaze httptcolhyxeohy6c,bbcmtd wholesale market ablaze httptcolhyxeohy6c
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,we alway tri to bring the heavi metal rt httpt...,we always try to bring the heavy metal rt http...,always try bring heavy metal rt httptcoyao1e0xngw
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,africanbaz break newsnigeria flag set ablaz in...,africanbaze breaking newsnigeria flag set abla...,africanbaze breaking newsnigeria flag set abla...
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,cri out for more set me ablaz,cry out for more set me ablaze,cry set ablaze
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,on plu side look at the sky last night it wa a...,on plus side look at the sky last night it wa ...,plus side look sky last night ablaze httptcoqq...


In [18]:
tweets.isnull().sum()

id            0
keyword       0
location      0
text          0
stemmed       0
lemmatized    0
clean         0
dtype: int64

In [19]:
words.isnull().sum()

words    0
dtype: int64

In [20]:
# Add bigrams columns for visualizations
words['bigrams'] = [list(nltk.ngrams(wordlist, 2)) for wordlist in words.words]
words.head()

Unnamed: 0,words,bigrams
0,"[bbcmtd, wholesale, market, ablaze, httptcolhy...","[(bbcmtd, wholesale), (wholesale, market), (ma..."
1,"[always, try, bring, heavy, metal, rt, httptco...","[(always, try), (try, bring), (bring, heavy), ..."
2,"[africanbaze, breaking, newsnigeria, flag, set...","[(africanbaze, breaking), (breaking, newsniger..."
3,"[cry, set, ablaze]","[(cry, set), (set, ablaze)]"
4,"[plus, side, look, sky, last, night, ablaze, h...","[(plus, side), (side, look), (look, sky), (sky..."


In [21]:
# Add trigrams columns for visualizations
words['trigrams'] = [list(nltk.ngrams(wordlist, 3)) for wordlist in words.words]
words.head()

Unnamed: 0,words,bigrams,trigrams
0,"[bbcmtd, wholesale, market, ablaze, httptcolhy...","[(bbcmtd, wholesale), (wholesale, market), (ma...","[(bbcmtd, wholesale, market), (wholesale, mark..."
1,"[always, try, bring, heavy, metal, rt, httptco...","[(always, try), (try, bring), (bring, heavy), ...","[(always, try, bring), (try, bring, heavy), (b..."
2,"[africanbaze, breaking, newsnigeria, flag, set...","[(africanbaze, breaking), (breaking, newsniger...","[(africanbaze, breaking, newsnigeria), (breaki..."
3,"[cry, set, ablaze]","[(cry, set), (set, ablaze)]","[(cry, set, ablaze)]"
4,"[plus, side, look, sky, last, night, ablaze, h...","[(plus, side), (side, look), (look, sky), (sky...","[(plus, side, look), (side, look, sky), (look,..."


In [22]:
# Add fourgrams columns for visualizations
words['fourgrams'] = [list(nltk.ngrams(wordlist, 4)) for wordlist in words.words]
words.head()

Unnamed: 0,words,bigrams,trigrams,fourgrams
0,"[bbcmtd, wholesale, market, ablaze, httptcolhy...","[(bbcmtd, wholesale), (wholesale, market), (ma...","[(bbcmtd, wholesale, market), (wholesale, mark...","[(bbcmtd, wholesale, market, ablaze), (wholesa..."
1,"[always, try, bring, heavy, metal, rt, httptco...","[(always, try), (try, bring), (bring, heavy), ...","[(always, try, bring), (try, bring, heavy), (b...","[(always, try, bring, heavy), (try, bring, hea..."
2,"[africanbaze, breaking, newsnigeria, flag, set...","[(africanbaze, breaking), (breaking, newsniger...","[(africanbaze, breaking, newsnigeria), (breaki...","[(africanbaze, breaking, newsnigeria, flag), (..."
3,"[cry, set, ablaze]","[(cry, set), (set, ablaze)]","[(cry, set, ablaze)]",[]
4,"[plus, side, look, sky, last, night, ablaze, h...","[(plus, side), (side, look), (look, sky), (sky...","[(plus, side, look), (side, look, sky), (look,...","[(plus, side, look, sky), (side, look, sky, la..."


In [38]:
words.bigrams.value_counts().head()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[]                                                                                                                                                                                                                                                                                  41
[(11yearold, boy), (boy, charged), (charged, manslaughter), (manslaughter, toddler), (toddler, report), (report, 11yearold), (11yearold, boy), (boy, charged), (charged, manslaughter), (manslaughter, fatal), (fatal, sh)]                                                         10
[(horrible, sinking), (sinking, feeling), (feeling, youuave), (youuave, home), (home, phone), (phone, realise), (realise, 3g), (3g, whole), (whole, time)]                                                                                                                           7
[(bestnaijamade, 16yr), (16yr, old), (old, pkk), (pkk, suicide), (suicide, bomber), (bomber, detonated), (detonated, bomb), (bomb, httptcoksawlyux02), (httptcoksaw

In [39]:
words.words.value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[11yearold, boy, charged, manslaughter, toddler, report, 11yearold, boy, charged, manslaughter, fatal, sh]                                                         10
[horrible, sinking, feeling, youuave, home, phone, realise, 3g, whole, time]                                                                                        7
[prophet, peace, upon, saidsave, hellfire, even, giving, half, date, charity]                                                                                       6
[came, land, engulfed, tribal, war, turned, land, peace, ie, madinah, prophetmuhammad, islam]                                                                       6
[bestnaijamade, 16yr, old, pkk, suicide, bomber, detonated, bomb, httptcoksawlyux02, bestnaijamade, bestnaijamade, bestnaijamade, beu]                              6
[madhya, pradesh, train, derailment, village, youth, saved, many, life]                                                                                             5
[vid