In [25]:
import os
import re, string, unicodedata
import pandas as pd 
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
from jupyterthemes import jtplot
jtplot.style(theme='solarizedl')

In [26]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer, WhitespaceTokenizer
from nltk.corpus import stopwords
import inflect
from spellchecker import SpellChecker
import csv

In [62]:
def csv2dict(csv_name):
    with open(csv_name, mode='r') as input_file:
        reader = csv.reader(input_file)
        return {rows[0]:rows[1] for rows in reader}

In [63]:
location_expansion_dict = csv2dict('location_expansion.csv')
word_expansion_dict = csv2dict('word_expansion.csv')

In [29]:
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [65]:
df.location = df.location.astype(str)

In [432]:
def expand_hashtags(text):
    hashtag_list = []
    hashtag_exp_list = []


    def proper_nouns_lower(text): 
        text = re.sub('#', ' ', text)
        proper_noun = re.match(r"(?<=[A-Z])[a-z]", text)
        if text is proper_noun:
            new_string = re.sub(proper_noun, proper_noun.group(0).lower(), text)
        else:
            new_string = text
        return new_string

    def word_expansion(text):
        c_re = re.compile('|'.join('(\b%s\b)' % re.escape(s) for s in word_expansion_dict.keys()), re.IGNORECASE)
        def replace(match):
            expansion =  f" {word_expansion_dict[match.group(0)]}"
            return expansion
        text = c_re.sub(replace, text.lower())
        return text

    def camel_case_split(text):
        text = re.sub('#', ' ', text)
        exp_hashtags = re.sub(r'((?<!\A)(?<=[a-z])[A-Z]|(?<!\A)(?=[A-Z])[a-z+])', r' \1', text)
        # h2.append(re.sub(r'((?<!\A)(?<=[A-Z])[a-z])', r' \1', text))
        return exp_hashtags
        
        
    def denoise_text(text):
        new_text = re.sub(r"\S*https?:\S*",  r"", text)
        new_text_punct = re.sub(r"[^\w\s#]",  r"", new_text)
        new_text_chars = re.sub('[^\u0000-\u007f]', '',  new_text_punct)
        x = re.sub('#cafire', 'california fire', new_text_chars)
        x = re.sub('#calfires', 'california fires', x)
        x = re.sub('#calwildfires', 'california wildfires', x)
        x = re.sub('#cadrought', 'california drought', x)
        new_text_ = re.sub('_', '',  new_text_punct)
        return new_text_
    
    def replace_numbers(tokens):
# replace integers with string formatted words for numbers
        dig2word = inflect.engine()
        new_tokens = []
        for word in tokens:
            if word.isdigit():
                new_word = dig2word.number_to_words(word)
                new_tokens.append(new_word)
            else:
                new_tokens.append(word)
        return new_tokens
    
    for tweet in text:
        for x in tweet.split():
            if x.startswith('#') == True:
                clean_text = denoise_text(x)
                cleaner_text = replace_numbers(clean_text)
                hashtag_list.append(''.join(cleaner_text))
                
    for hashtag in hashtag_list: 
        exp_hashtag = camel_case_split(hashtag)
        hashtag_exp_list.append(exp_hashtag)
        
    return dict(zip(hashtag_list, hashtag_exp_list))

In [82]:
#### text preprocessing specifically formatted for tweets but will work on any text
def tweet_preprocess(df): 
    """combine regex and nltk processing for tweet text processing"""


    def word_expansion(text):

        c_re = re.compile('|'.join('(\b%s\b)' % re.escape(s) for s in word_expansion_dict.keys()), re.IGNORECASE)
        def replace(match):
            expansion =  f" {word_expansion_dict[match.group(0)]}"
            return expansion
        text = c_re.sub(replace, text.lower())
        return text
    
    def camel_case_split(text):
        if text.startswith('#'):
            exp_hashtags = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', text)
            return ' '.join([x for x in exp_hashtags])

    # function to expand contractions, remove urls and characters before tokenization processing
    def denoise_text(text):
        new_text = re.sub(r"\S*https?:\S*",  r"", text.lower())
        new_text_exp = word_expansion(new_text)
        x = re.sub('#cafire', 'california fire', new_text_exp)
        x = re.sub('#calfires', 'california fires', x)
        x = re.sub('#calwildfires', 'california wildfires', x)
        x = re.sub('#cadrought', 'california drought', x)
        hash_exp = camel_case_split(x)
        new_text_punct = re.sub(r"[^\w\s@]",  r"", hash_exp)
        new_text_chars = re.sub('[^\u0000-\u007f]', '',  new_text_punct)
        strip_text = new_text_chars.strip()
        #remove_hashtags_text = re.sub('#\w+', '',  strip_text)
        return strip_text 
    
# tokenization & lemmatization function returns tokens    
    def lemmatize_text(text):
        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
        lemmatizer = WordNetLemmatizer() 
        return [lemmatizer.lemmatize(w, pos='v') for w in tokenizer.tokenize(text)]

# tokenization & stemmer function returns tokens
    def stem_text(text):
        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
        stemmer = PorterStemmer()
        return [stemmer.stem(w) for w in tokenizer.tokenize(text)]

    def replace_numbers(tokens):
# replace integers with string formatted words for numbers
        dig2word = inflect.engine()
        new_tokens = []
        for word in tokens:
            if word.isdigit():
                new_word = dig2word.number_to_words(word)
                new_tokens.append(new_word)
            else:
                new_tokens.append(word)
        return new_tokens
    
    def remove_non_ascii(tokens):
# remove non ascii characters from text
        new_tokens = []
        for word in tokens:
            new_token = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            new_tokens.append(new_token)
        return new_tokens
    
# remove stopwords   
    def remove_stopwords(tokens):
        stop_list = stopwords.words('english')  
        new_tokens = []
        for word in tokens:
            if word not in stop_list:
                new_tokens.append(word)
        return new_tokens
  
 
    def norm_text(tokens):
        words = replace_numbers(tokens)
        #tokens = remove_stopwords(words)
        words = remove_non_ascii(words)
        return words
    

    def process_text(text):
        clean_text = denoise_text(text)
        lem_text = lemmatize_text(clean_text)
        text = ' '.join([x for x in norm_text(lem_text)])
        text = re.sub(r"-",  r" ", text)
        return text
    
    new_df = [process_text(x) for x in df]

    return new_df 

In [None]:
h_re = '([a-z]+)(?=[A-Z])|([A-Z][a-z]+)'
        exp_hashtags = filter(None, re.split(h_re, text))
        new_hashtag =  ' '.join([x for x in exp_hashtags])
        return new_hashtag

In [167]:
    def make_first_char_lower(text): 
        new_string = re.sub(r"#(?=[A-Z])",
                    lambda x :  x.group(0).lower(), text)
        return new_string
    make_first_char_lower("#ABnlkD")

'#ABnlkD'

In [None]:
if char[0] is upper and followed by char[1] lower then make char[0 ]lower

In [432]:

def expand_hashtags(text):
    hashtag_list = []
    hashtag_exp_list = []


    def proper_nouns_lower(text): 
        text = re.sub('#', ' ', text)
        proper_noun = re.match(r"(?<=[A-Z])[a-z]", text)
        if text is proper_noun:
            new_string = re.sub(proper_noun, proper_noun.group(0).lower(), text)
        else:
            new_string = text
        return new_string

    def word_expansion(text):

        c_re = re.compile('|'.join('(\b%s\b)' % re.escape(s) for s in word_expansion_dict.keys()), re.IGNORECASE)
        def replace(match):
            expansion =  f" {word_expansion_dict[match.group(0)]}"
            return expansion
        text = c_re.sub(replace, text.lower())
        return text
    


    def camel_case_split(text):
        text = re.sub('#', ' ', text)
        exp_hashtags = re.sub(r'((?<!\A)(?<=[a-z])[A-Z]|(?<!\A)(?=[A-Z])[a-z+])', r' \1', text)

       # h2.append(re.sub(r'((?<!\A)(?<=[A-Z])[a-z])', r' \1', text))
        return exp_hashtags
        
        
    def denoise_text(text):
        new_text = re.sub(r"\S*https?:\S*",  r"", text)
        new_text_punct = re.sub(r"[^\w\s#]",  r"", new_text)
        new_text_chars = re.sub('[^\u0000-\u007f]', '',  new_text_punct)
        x = re.sub('#cafire', 'california fire', new_text_chars)
        x = re.sub('#calfires', 'california fires', x)
        x = re.sub('#calwildfires', 'california wildfires', x)
        x = re.sub('#cadrought', 'california drought', x)
        new_text_ = re.sub('_', '',  new_text_punct)
        return new_text_
    
    def replace_numbers(tokens):
# replace integers with string formatted words for numbers
        dig2word = inflect.engine()
        new_tokens = []
        for word in tokens:
            if word.isdigit():
                new_word = dig2word.number_to_words(word)
                new_tokens.append(new_word)
            else:
                new_tokens.append(word)
        return new_tokens
    
    for tweet in text:
        for x in tweet.split():
            if x.startswith('#') == True:
                clean_text = denoise_text(x)
                cleaner_text = replace_numbers(clean_text)
                hashtag_list.append(''.join(cleaner_text))
                
    for hashtag in hashtag_list: 

        exp_hashtag = camel_case_split(hashtag)
        hashtag_exp_list.append(exp_hashtag)
    return dict(zip(hashtag_list, hashtag_exp_list))

In [96]:
def denoise_location(df):

    
    def word_expansion(text):
        # compile using word boundary so only complete work
        c_re = re.compile('|'.join('(\b%s\b)' % (re.escape(s) for s in location_expansion_dict.keys()), re.IGNORECASE))
        def replace(match):
            expansion =  f" {location_expansion_dict[match.group(0)]}"
            return expansion
        text = c_re.sub(replace, text.lower())
        return text
    # tokenization & lemmatization function returns tokens    
    def lemmatize_text(text):
        tokenizer = TweetTokenizer(strip_handles=True)
        lemmatizer = WordNetLemmatizer() 
        return [lemmatizer.lemmatize(w, pos='n') for w in tokenizer.tokenize(text)]

    
    def remove_non_ascii(tokens):
# remove non ascii characters from text
        new_tokens = []
        for word in tokens:
            new_token = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            new_tokens.append(new_token)
        return new_tokens
    
    def norm_text(tokens):
        words = remove_non_ascii(tokens)
        return words
    
    def denoise_location_text(text):
        text = str(text)
        new_text = re.sub(r"\S*https?:\S*",  r"", text.lower())
        new_string = re.sub(r"[^\w\s]",  r" ", new_text)
        new_string = re.sub(r"\d",  r"", new_string)
        unicode_string = re.sub('[^\u0000-\u007f]', '',  new_string)
        new_text_contractions = expand_abbreviations_contractions(unicode_string)
        clean_text = re.sub(r"est september   ",  r"", new_text_contractions)
        lem_text = lemmatize_text(clean_text)
        text = ' '.join([x for x in norm_text(lem_text)])
        text = re.sub(r"-",  r" ", text)
        return text

    
    new_df = [denoise_location_text(x) for x in df]
    return new_df 



In [433]:
hashtags = expand_hashtags(df.text)

hashtags

{'#earthquake': ' earthquake',
 '#wildfires': ' wildfires',
 '#Alaska': ' Alaska',
 '#RockyFire': ' Rocky Fire',
 '#CAfire': ' CAfire',
 '#flood': ' flood',
 '#disaster': ' disaster',
 '#flooding': ' flooding',
 '#raining': ' raining',
 '#Florida': ' Florida',
 '#TampaBay': ' Tampa Bay',
 '#Tampa': ' Tampa',
 '#Flood': ' Flood',
 '#We': ' We',
 '#BREAKING': ' BREAKING',
 '#metal': ' metal',
 '#RT': ' RT',
 '#AFRICANBAZE': ' AFRICANBAZE',
 '#mufc': ' mufc',
 '#Bridgetown': ' Bridgetown',
 '#nsfw': ' nsfw',
 '#Kurds': ' Kurds',
 '#Diyala': ' Diyala',
 '#California': ' California',
 '#climate': ' climate',
 '#energy': ' energy',
 '#NowPlaying': ' Now Playing',
 '#EDM': ' EDM',
 '#NashvilleTraffic': ' Nashville Traffic',
 '#SantaClara': ' Santa Clara',
 '#BayArea': ' Bay Area',
 '#Traffic': ' Traffic',
 '#personalinjury': ' personalinjury',
 '#solicitor': ' solicitor',
 '#OtleyHour': ' Otley Hour',
 '#stlouis': ' stlouis',
 '#caraccidentlawyer': ' caraccidentlawyer',
 '#TruckCrash': ' Truc

In [431]:
#hashtags = extract_hashtags(df.text)
df['tweets'] = tweet_preprocess((df.text.astype(str).replace({r"([#[A-Z][a-z])": r" \1"}, regex=True)))
df['keyword'] = tweet_preprocess(df.keyword.astype(str).replace({r"%20" : r" "}, regex=True))

TypeError: expected string or bytes-like object

In [39]:
df['clean_location'] = denoise_location(df.location)
df['clean_location'] = clean_loc

In [24]:
train_X = df.filter(['tweets','clean_location','keyword'], axis=1)
train_y = df.filter(['target'], axis=1)

In [43]:
all_text = df.tweets + " " + df.clean_location + " " + df.keyword
all_text

0       our deeds be the reason of this earthquake may...
1           forest fire near la ronge sask canada nan nan
2       all residents ask to shelter in place be be no...
3       thirteen thousand people receive wildfires eva...
4       just get send this photo from ruby alaska as s...
                              ...                        
7604    world news fall powerlines on glink tram you p...
7605    on the flip side i be at walmart and there be ...
7606    suicide bomber kill fifteen in saudi security ...
7608    two giant crane hold a bridge collapse into ne...
7612    the latest more home raze by northern californ...
Length: 7503, dtype: object

In [44]:
df['all_text'] = all_text
df

Unnamed: 0,id,keyword,location,text,target,tweets,clean_location,all_text
0,1,,,Our Deeds are the Reason of this #earthqua...,1,our deeds be the reason of this earthquake may...,,our deeds be the reason of this earthquake may...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,,forest fire near la ronge sask canada nan nan
2,5,,,All residents asked to 'shelter in place' ar...,1,all residents ask to shelter in place be be no...,,all residents ask to shelter in place be be no...
3,6,,,"13,000 people receive #wildfires evacuation o...",1,thirteen thousand people receive wildfires eva...,,thirteen thousand people receive wildfires eva...
4,7,,,Just got sent this photo from Ruby # Alaska...,1,just get send this photo from ruby alaska as s...,,just get send this photo from ruby alaska as s...
...,...,...,...,...,...,...,...,...
7604,10863,,,# World News Fallen powerlines on G:link tr...,1,world news fall powerlines on glink tram you p...,,world news fall powerlines on glink tram you p...
7605,10864,,,on the flip side I'm at Walmart and there i...,1,on the flip side i be at walmart and there be ...,,on the flip side i be at walmart and there be ...
7606,10866,,,Suicide bomber kills 15 in Saudi security s...,1,suicide bomber kill fifteen in saudi security ...,,suicide bomber kill fifteen in saudi security ...
7608,10869,,,Two giant cranes holding a bridge collapse i...,1,two giant crane hold a bridge collapse into ne...,,two giant crane hold a bridge collapse into ne...


In [None]:
pos_text_tags = df['tweets'].apply(lambda row: [nltk.pos_tag(row) for item in row])
pos_keyword_tags = df['keyword'].apply(lambda row: [nltk.pos_tag(row) for item in row])

In [None]:
df.location[df.location != 'nan']

In [None]:
df.keyword.unique()

In [None]:
plt.figure(figsize=(11,11))
colors = ['lightblue', 'red']
expl = (0, 0.1)
df.target.value_counts().plot(kind='pie', legend=True, startangle=45, shadow=True, 
                             colors=colors, autopct='%1.1f%%')
plt.title('target count', fontsize=20)

In [None]:
spell = SpellChecker()

In [None]:
misspelled = spell.unknown(df.tweets)

In [None]:
misspelled

In [None]:
pip install gensim

In [None]:
import gensim
from gensim import corpora

In [None]:
corpus = df.tweets
nlp = Word2Vec(corpus, size=200,   
            window=6, min_count=1, sg=1, iter=40)
len(nlp.wv.vocab) # number of words in a dictionary

In [None]:
import pkg_resources
from symspellpy.symspellpy import SymSpell

In [None]:
hashtags[99:]

In [None]:
df

In [None]:
new_hash=[]
sym_spell = SymSpell(max_dictionary_edit_distance=1)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
for term in hashtags:
    if len(term)>1:
        result = sym_spell.word_segmentation(term)
        r = result.corrected_string
    else:
        r = ''
    new_hash.append(r)

In [None]:
df[10:50]

In [None]:
new_hash

In [None]:
# Creating the dicti
word_dict = {} 

for row in df.tweets: 
    words = tokenizer.tokenize(row) 
    for word in words: 
        if word not in word_dict.keys(): 
            word_dict[word] = 1
        else: 
            word_dict[word] += 1
print(len(word_dict))
max(word_dict, key=word_dict.get)

In [None]:
word_dict

In [None]:
hlist=[]
for x in hashtags:
    x = x.str.replace('[^a-zA-Z]', '')
    hlist.append(x)