## Install Dependencies

In [1]:
# based on kaggle environment

# work-around to install broken dependency of pycontractions
%pip install git+https://github.com/MCFreddie777/language-check.git
%pip install pycontractions
%pip install symspellpy # there is a faster alternative, but it has bad documentation

Collecting git+https://github.com/MCFreddie777/language-check.git
  Cloning https://github.com/MCFreddie777/language-check.git to /tmp/pip-req-build-veo2cbuu
  Running command git clone -q https://github.com/MCFreddie777/language-check.git /tmp/pip-req-build-veo2cbuu
Building wheels for collected packages: language-check
  Building wheel for language-check (setup.py) ... [?25ldone
[?25h  Created wheel for language-check: filename=language_check-1.1-py3-none-any.whl size=90190985 sha256=9407d4567a7d6387eaf5e53991611c0b18521b76245934fd98fd951033b59107
  Stored in directory: /tmp/pip-ephem-wheel-cache-_c_3_nia/wheels/c6/a8/a2/992f0138466a1425526af353678322a252aa8cbd0a6d93e2e7
Successfully built language-check
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Setting Up Dependencies

In [2]:
import re
import string
import pkg_resources

import pandas as pd
from symspellpy import SymSpell, Verbosity
from pycontractions import Contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import word_tokenize

In [3]:
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")

symspell_segment = SymSpell(max_dictionary_edit_distance=2, prefix_length=8)
symspell_segment.load_dictionary(dictionary_path, term_index=0, count_index=1)

symspell_typo = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
symspell_typo.load_dictionary(dictionary_path, term_index=0, count_index=1)
symspell_typo.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

True

In [4]:
cont = Contractions(api_key="glove-twitter-100")
cont.load_models()

In [5]:
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

## Data Preprocessing

In [6]:
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
train.insert(3, 'cleaned_text', train['text'])
test.insert(3, 'cleaned_text', test['text'])

In [8]:
# a few functions to clean the text
def to_lower(text):
    return text.lower()

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_urls(text):
    urls = re.compile(r'https?://\S+|www\.\S+|pic.twitter.com\S+')
    return re.sub(urls, ' ', text)

def handle_unicode(text):
    return text.encode('ascii', 'replace').decode('utf-8')

def words_with_numbers(text):
    return re.sub(r'\w*\d\w*', ' ', text)

def remove_numbers(text):
    return re.sub(r'\d+', ' ', text)

def replace_numbers(text):
    return re.sub(r'\d+', 'number', text)

def remove_mentions(text):
    return re.sub(r'@\w*', ' ', text)

def remove_newlines(text):
    return text.replace('\n', ' ')

def expand_hashtags(text):
    hashtags = re.findall(r'#\w*', text)
    for hashtag in hashtags:
        hashtag = hashtag.replace('#', '').replace('_', '')
        if len(hashtag) == 0 or hashtag.isnumeric():
            continue
        expanded = symspell_segment.word_segmentation(hashtag).corrected_string
        text = text.replace(hashtag, expanded)
    return text.replace('#', ' ')

def remove_repeated_characters(text):
    ret = ""
    for c in text:
        if len(ret) == 0 or c != ret[-1]:
            ret += c
    return ret

def correct_typos(text):
    suggestions = symspell_typo.lookup_compound(text, max_edit_distance=2)
    return suggestions[0].term

def expand_contractions(text):
    gen = cont.expand_texts([text], precise=True)
    return next(gen)

def remove_stopwords(text):
    words = word_tokenize(text)
    ret = [word for word in words if word not in stop_words]
    return ' '.join(ret)

def stem(text):
    words = word_tokenize(text)
    ret = [stemmer.stem(word) for word in words]
    return ' '.join(ret)

def lemmatize(text):
    words = word_tokenize(text)
    ret = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(ret)

In [9]:
pipline = [
    handle_unicode,
    remove_newlines,
    remove_urls,
    expand_hashtags,
    remove_mentions,
    words_with_numbers,
    remove_numbers,
    expand_contractions,
    remove_punctuation,
    to_lower,
    remove_repeated_characters,
    correct_typos,
    remove_stopwords,
    stem,
    lemmatize,
]

def apply_pipline(pipeline, text):
    for func in pipline:
        text = func(text)
    return text


train["cleaned_text"] = train["cleaned_text"].apply(lambda x: apply_pipline(pipline, x))
test["cleaned_text"] = test["cleaned_text"].apply(lambda x: apply_pipline(pipline, x))

train.head()



Unnamed: 0,id,keyword,location,cleaned_text,text,target
0,1,,,bed reason earthquak may alan forgiv u al,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,forest fire near la rang sask canada,Forest fire near La Ronge Sask. Canada,1
2,5,,,al resid ask shelter place notifi offic evacu ...,All residents asked to 'shelter in place' are ...,1
3,6,,,peopl receiv wildfir evacu order california,"13,000 people receive #wildfires evacuation or...",1
4,7,,,got sent photo rubi alaska smoke wildfir pour ...,Just got sent this photo from Ruby #Alaska as ...,1


## Using the FastText Library

In [16]:
# write to fasttext format
with open("./data/cleaned_text.train", "w") as f:
    f.writelines("__label__" + train["target"].astype(str) + " " + train["cleaned_text"] + "\n")

In [17]:
import fasttext

In [24]:
model = fasttext.train_supervised(input="./data/cleaned_text.train", epoch=50, lr=0.5, wordNgrams=2, verbose=2, minCount=1)
model.save_model("./models/cleaned_text.bin")

Read 0M words
Number of words:  7826
Number of labels: 2
Progress: 100.0% words/sec/thread: 1330425 lr:  0.000000 avg.loss:  0.099717 ETA:   0h 0m 0s 0.099717 ETA:   0h 0m 0s


In [25]:
test["predictions"] = test["cleaned_text"].apply(lambda x: model.predict(x)[0][0].replace('__label__', ''))

with open("./submissions/fasttext.csv", "w") as f:
    f.write("id,target\n")
    for i, pred in zip(test["id"], test["predictions"]):
        f.write(f"{i},{pred}\n")