In [516]:
import nltk
import re, string
import pandas as pd 
import itertools
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from jupyterthemes import jtplot
jtplot.style(theme='solarizedl')

In [517]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import inflect
from spellchecker import SpellChecker


In [531]:
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [532]:
df['text'] = " "+df.text
df.drop_duplicates(subset=['text'], inplace=True)
df['hashtags'] = df.text.str.findall(r'#.*?(?=\s|$)')

In [535]:
#### text preprocessing specifically formatted for tweets but will work on any text
def tweet_preprocess(df): 
    df = " "+df
    """combine regex and tokenization text processing for tweet text processing"""
    # dealing with those damn contractions
    contractions_dict = {
                          " aint": "are not",
                          " arent": "are not",
                          " cant": "can not",
                          " cause": "because",
                          " couldve": "could have",
                          " couldnt": "could not",
                          " didnt": "did not",
                          " doesnt": "does not",
                          " dont": "do not",
                          " hadnt": "had not",
                          " hasnt": "has not",
                          " havent": "have not",
                          " hed": "he would",
                          " hes": "he is",
                          " howd": "how did",
                          " howdy": "how do you",
                          " howll": "how will",
                          " hows": "how is",
                          " id": "i would",
                          " ida": "i would have",
                          " im": "i am",
                          " ive": "i have",
                          " isnt": "is not",
                          " itd": "it had",
                          " itll": "it will",
                          " its": "it is",
                          " lets": "let us",
                          " maam": "madam",
                          " mightve": "might have",
                          " mighta": "might have",
                          " mightnt": "might not",
                          " mustve": "must have",
                          " musta": "must have",
                          " mustnt": "must not",
                          " neednt": "need not",
                          " oclock": "of the clock",
                          " shes": "she is",
                          " shoulda": "should have",
                          " shouldve": "should have",
                          " shouldnt": "should not",
                          " so'd": "so did",
                          " thatd": "that would",
                          " thats": "that is",
                          " thered": "there had",
                          " theres": "there is",
                          " theyd": "they would",
                          " theyda": "they would have",
                          " theyll": "they will",
                          " theyre": "they are",
                          " theyve": "they have",
                          " wasnt": "was not",
                          " weve": "we have",
                          " werent": "were not",
                          " whatll": "what will",
                          " whatllve": "what will have",
                          " whatre": "what are",
                          " whats": "what is",
                          " whatve": "what have",
                          " whens": "when is",
                          " whenve": "when have",
                          " whered": "where did",
                          " whers": "where is",
                          " whereve": "where have",
                          " wholl": "who will",
                          " whollve": "who will have",
                          " whos": "who is",
                          " whove": "who have",
                          " whys": "why is",
                          " whyve": "why have",
                          " willve": "will have",
                          " wont": "will not",
                          " wontve": "will not have",
                          " wouldve": "would have",
                          " wouldnt": "would not",
                          " wouldntve": "would not have",
                          " yall": "you all",
                          " yalls": "you alls",
                          " yalld": "you all would",
                          " yalldve": "you all would have",
                          " yallre": "you all are",
                          " yallve": "you all have",
                          " youd": "you had",
                          " youda": "you would have",
                          " youdve": "you would have",
                          " youll": "you you will",
                          " youllve": "you you will have",
                          " youre": "you are",
                          " youve": "you have",
                          " ain't": "are not",
                          " aren't": "are not",
                          " can't": "can not",
                          " can't've": "can not have",
                          " 'cause": "because",
                          " bc": "because",
                          " b/c": "because",
                          " could've": "could have",
                          " couldn't": "could not",
                          " couldn't've": "could not have",
                          " didn't": "did not",
                          " doesn't": "does not",
                          " don't": "do not",
                          " hadn't": "had not",
                          " hadn't've": "had not have",
                          " hasn't": "has not",
                          " haven't": "have not",
                          " he'd": "he would",
                          " he'd've": "he would have",
                          " he'll": "he will",
                          " he'll've": "he will have",
                          " he's": "he is",
                          " how'd": "how did",
                          " how'd'y": "how do you",
                          " how'll": "how will",
                          " how's": "how is",
                          " i'd": "i would",
                          " i'd've": "i would have",
                          " i'll": "i will",
                          " i'll've": "i will have",
                          " i'm": "i am",
                          " i've": "i have",
                          " isn't": "is not",
                          " it'd": "it had",
                          " it'd've": "it would have",
                          " it'll": "it will",
                          " it'll've": "it will have",
                          " it's": "it is",
                          " let's": "let us",
                          " ma'am": "madam",
                          " mayn't": "may not",
                          " might've": "might have",
                          " mightn't": "might not",
                          " mightn't've": "might not have",
                          " must've": "must have",
                          " mustn't": "must not",
                          " mustn't've": "must not have",
                          " needn't": "need not",
                          " needn't've": "need not have",
                          " o'clock": "of the clock",
                          " oughtn't": "ought not",
                          " oughtn't've": "ought not have",
                          " shan't": "shall not",
                          " sha'n't": "shall not",
                          " shan't've": "shall not have",
                          " she'd": "she would",
                          " she'd've": "she would have",
                          " she'll": "she will",
                          " she'll've": "she will have",
                          " she's": "she is",
                          " should've": "should have",
                          " shouldn't": "should not",
                          " shouldn't've": "should not have",
                          " so've": "so have",
                          " so's": "so is",
                          " that'd": "that would",
                          " that'd've": "that would have",
                          " that's": "that is",
                          " there'd": "there had",
                          " there'd've": "there would have",
                          " there's": "there is",
                          " they'd": "they would",
                          " they'd've": "they would have",
                          " they'll": "they will",
                          " they'll've": "they will have",
                          " they're": "they are",
                          " they've": "they have",
                          " to've": "to have",
                          " wasn't": "was not",
                          " we'd": "we had",
                          " we'd've": "we would have",
                          " we'll": "we will",
                          " we'll've": "we will have",
                          " we're": "we are",
                          " we've": "we have",
                          " weren't": "were not",
                          " what'll": "what will",
                          " what'll've": "what will have",
                          " what're": "what are",
                          " what's": "what is",
                          " what've": "what have",
                          " when's": "when is",
                          " when've": "when have",
                          " where'd": "where did",
                          " where's": "where is",
                          " where've": "where have",
                          " who'll": "who will",
                          " who'll've": "who will have",
                          " who's": "who is",
                          " who've": "who have",
                          " why's": "why is",
                          " why've": "why have",
                          " will've": "will have",
                          " won't": "will not",
                          " won't've": "will not have",
                          " would've": "would have",
                          " wouldn't": "would not",
                          " wouldn't've": "would not have",
                          " y'all": "you all",
                          " y'alls": "you alls",
                          " y'all'd": "you all would",
                          " y'all'd've": "you all would have",
                          " y'all're": "you all are",
                          " y'all've": "you all have",
                          " you'd": "you had",
                          " you'da": "you would have",
                          " you'd've": "you would have",
                          " you'll": "you you will",
                          " you'll've": "you you will have",
                          " you're": "you are",
                          " you've": "you have",
                          " hwy": "highway",
                          " fvck": "fuck",
                          " im": "i am",
                          " rt": "retweet",
                          " fyi": "for your information",
                          " omw": "on my way",
                          " 1st": "first",
                          " 2nd": "second",
                          " 3rd": "third",
                          " 4th": "fourth",


}

    def expand_contractions(text, c_re=c_re):
        c_re = re.compile('|'.join('(%s)' % k for k in contractions_dict.keys()))
        def replace(match):
            expansion =  f" {contractions_dict[match.group(0)]}"
            return expansion
        text = c_re.sub(replace, text.lower())
        return text

    # function to expand contractions, remove urls and characters before tokenization processing
    def denoise_text(text):
        new_text = re.sub(r"\S*https?:\S*",  r"", text.lower())
        new_text_contractions = expand_contractions(new_text)
        new_text_symbols = re.sub(r"[^\w\s@#]",  r"", new_text_contractions)
        new_text_ascii = re.sub('[^\u0000-\u007f]', '',  new_text_symbols)
        text = new_text_ascii.strip()
        return text 
    
# tokenization & lemmatization function returns tokens    
    def lemmatize_text(text):
        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
        lemmatizer = WordNetLemmatizer() 
        return [lemmatizer.lemmatize(w, pos='v') for w in tokenizer.tokenize(text)]

# tokenization & stemmer function returns tokens
    def stem_text(text):
        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
        stemmer = PorterStemmer()
        return [stemmer.stem(w) for w in tokenizer.tokenize(text)]

    def replace_numbers(tokens):
# replace integers with string formatted words for numbers
        dig2word = inflect.engine()
        new_tokens = []
        for word in tokens:
            if word.isdigit():
                new_word = dig2word.number_to_words(word)
                new_tokens.append(new_word)
            else:
                new_tokens.append(word)
        return new_tokens
    
    def remove_non_ascii(tokens):
# remove non ascii characters from text
        new_tokens = []
        for word in tokens:
            new_token = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            new_tokens.append(new_token)
        return new_tokens
    
# remove stopwords   
    def remove_stopwords(tokens):
        stop_list = stopwords.words('english')  
        new_tokens = []
        for word in tokens:
            if word not in stop_list:
                new_tokens.append(word)
        return new_tokens
  
 
    def normalize_text(tokens):
# wrapper for processing tokenized text 
        words = replace_numbers(tokens)
        tokens = remove_stopwords(words)
        return tokens
    

    def text_prepare(text):
        clean_text = denoise_text(text)
        processed_text =  (clean_text)
        text = ' '.join([x for x in normalize_text(lem_text)])
        text = re.sub(r"-",  r" ", text)
        return text
    
    new_df = [text_prepare(x) for x in df]

    return new_df 

In [537]:
df['tweets'] = tweet_preprocess(df.text,text_process=lemmatize_text)
df['keyword'] = tweet_preprocess(df.keyword.astype(str).replace({r"%20" : r" "}, regex=True))
df['hashtags'] = df.tweets.str.findall(r'#.*?(?=\s|$)')

TypeError: text_prepare() missing 1 required positional argument: 'text_process'

In [530]:
df

Unnamed: 0,id,keyword,location,text,target,hashtags,tweets
0,1,,,Our Deeds are the Reason of this #earthquake ...,1,[#earthquake],deeds reason #earthquake may allah forgive us
1,4,,,Forest fire near La Ronge Sask. Canada,1,[],forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are...,1,[],residents ask shelter place notify officer eva...
3,6,,,"13,000 people receive #wildfires evacuation o...",1,[#wildfires],thirteen thousand people receive #wildfires ev...
4,7,,,Just got sent this photo from Ruby #Alaska as...,1,"[#alaska, #wildfires]",get send photo ruby #alaska smoke #wildfires p...
...,...,...,...,...,...,...,...
7604,10863,,,#WorldNews Fallen powerlines on G:link tram: ...,1,[#worldnews],#worldnews fall powerlines glink tram update f...
7605,10864,,,on the flip side I'm at Walmart and there is ...,1,[],flip side walmart bomb everyone evacuate stay ...
7606,10866,,,Suicide bomber kills 15 in Saudi security sit...,1,[],suicide bomber kill fifteen saudi security sit...
7608,10869,,,Two giant cranes holding a bridge collapse in...,1,[],two giant crane hold bridge collapse nearby home


In [522]:
hashtag_list = []
# splitting the text into words
for row in test:
    for x in row.split():
        if x.startswith('#') == True:
            hashtag_list.append(x)

In [None]:
pos_text_tags = df['tweets'].apply(lambda row: [nltk.pos_tag(row) for item in row])
pos_keyword_tags = df['keyword'].apply(lambda row: [nltk.pos_tag(row) for item in row])

In [524]:
df

Unnamed: 0,id,keyword,location,text,target,hashtags,tweets
0,1,,,Our Deeds are the Reason of this #earthquake ...,1,[#earthquake],deeds reason #earthquake may allah forgive us
1,4,,,Forest fire near La Ronge Sask. Canada,1,[],forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are...,1,[],residents ask shelter place notify officer eva...
3,6,,,"13,000 people receive #wildfires evacuation o...",1,[#wildfires],thirteen thousand people receive #wildfires ev...
4,7,,,Just got sent this photo from Ruby #Alaska as...,1,"[#alaska, #wildfires]",get send photo ruby #alaska smoke #wildfires p...
...,...,...,...,...,...,...,...
7604,10863,,,#WorldNews Fallen powerlines on G:link tram: ...,1,[#worldnews],#worldnews fall powerlines glink tram update f...
7605,10864,,,on the flip side I'm at Walmart and there is ...,1,[],flip side walmart bomb everyone evacuate stay ...
7606,10866,,,Suicide bomber kills 15 in Saudi security sit...,1,[],suicide bomber kill fifteen saudi security sit...
7608,10869,,,Two giant cranes holding a bridge collapse in...,1,[],two giant crane hold bridge collapse nearby home


In [228]:
df.location[df.location != 'nan']

31                       birmingham
32      est september 2012  bristol
33                           africa
34                  philadelphia pa
35                        london uk
                   ...             
7575                             tn
7577         #newcastleupontyne #uk
7579               vancouver canada
7580                        london 
7581                        lincoln
Name: location, Length: 5080, dtype: object

In [224]:
df.keyword.unique()

array(['nan', 'ablaze', 'accident', 'aftershock', 'airplane accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown up', 'body bag', 'body bagging', 'body bags',
       'bomb', 'bombed', 'bombing', 'bridge collapse',
       'buildings burning', 'buildings on fire', 'burned', 'burning',
       'burning buildings', 'bush fires', 'casualties', 'casualty',
       'catastrophe', 'catastrophic', 'chemical emergency', 'cliff fall',
       'collapse', 'collapsed', 'collide', 'collided', 'collision',
       'crash', 'crashed', 'crush', 'crushed', 'curfew', 'cyclone',
       'damage', 'danger', 'dead', 'death', 'deaths', 'debris', 'deluge',
       'deluged', 'demolish', 'demolished', 'demolition', 'derail',
       'derailed', 'derailment', 'des

In [None]:
plt.figure(figsize=(11,11))
colors = ['lightblue', 'red']
expl = (0, 0.1)
df.target.value_counts().plot(kind='pie', legend=True, startangle=45, shadow=True, 
                             colors=colors, autopct='%1.1f%%')
plt.title('target count', fontsize=20)

In [287]:
spell = SpellChecker()

In [526]:
misspelled = spell.unknown(df.tweets)

In [527]:
misspelled

{'',
 'heat wave #squad #revitup #pizzarev',
 'hate white people mo',
 'body bag bitch',
 'haha traumatise hell want job xxx',
 'band build fire play wednesday #rdg',
 'thirst curfew',
 'bag body smoke hot',
 'need friends booze darude sandstorm',
 'well dad survive drive',
 'egg desolate',
 'cafe run acid attack survivors #india',
 'peacetime time national emergency',
 'empire avenue crush soul via',
 'another white mass murderer #antioch',
 'nuclear bomb terrible weapon',
 'osp concern mount fatalities',
 'free hailstorm maxi',
 'people survive like',
 '6beyonce pick fan army #beyhive',
 '@so @sorry deluge hell vines',
 'get money tomorrow riot',
 'change password link kick hijacker',
 'let follow',
 'laugh talk junk everyone panic mode',
 'real shit damage bitch',
 'bhavanas mom crush everyones soul',
 'hear lighten see thunder',
 'like beautiful ass tragedy lol',
 'love love love remember first crush',
 'every bts song jimin scream',
 'didnt want hurt fear drive midnight',
 '#nowpl