In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from sklearn import feature_extraction, linear_model, model_selection, preprocessing


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the training and test splits of the competition data.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/nlp-getting-started/train.csv",
        "../input/nlp-getting-started/test.csv",
    )
)

# Preview the first rows of the training frame.
train.head()

In [3]:
# Frequency of each distinct keyword in the training data (missing values are not counted).
train["keyword"].value_counts()

In [4]:
# Frequency of each distinct location in the training data (missing values are not counted).
train["location"].value_counts()

### Preprocessing

#### Lowercasing

In [5]:
# Lowercase every non-missing value of the three free-text columns, first in the
# training split and then in the test split. Missing values are passed through.
for frame in (train, test):
    for column in ("keyword", "text", "location"):
        frame[column] = frame[column].apply(
            lambda value: value if pd.isna(value) else str.lower(value)
        )

In [6]:
# Show the first five rows to check the lowercasing.
train.head(5)

#### Entities, URL Links and Punctuation Removal

In [7]:
import re,string

def remove_entities(tweet):
    """Strip punctuation, @mentions and #hashtags from a tweet.

    Every character of ``string.punctuation`` except ``@`` and ``#`` is replaced
    by a space, the text is split on whitespace, and every word that starts with
    ``@`` or ``#`` is dropped.

    Args:
        tweet: The tweet text (a ``str``).

    Returns:
        The remaining words joined by single spaces.
    """
    entity_prefixes = ("@", "#")
    # One translation pass instead of one str.replace() per punctuation symbol.
    punctuation_to_space = str.maketrans({
        symbol: " " for symbol in string.punctuation if symbol not in entity_prefixes
    })
    # split() with no argument breaks on any run of whitespace, so an entity that
    # follows a newline or tab ("fire\n#wildfire") becomes its own word and is
    # removed; splitting on " " alone left it glued to the preceding word.
    words = tweet.translate(punctuation_to_space).split()
    return " ".join(word for word in words if not word.startswith(entity_prefixes))

# Strip links BEFORE removing entities: remove_entities() turns ":", "/" and "."
# into spaces, so a link such as "http://t.co/x" would otherwise be broken into
# ordinary-looking tokens ("http t co x") that no later URL pattern can match.
_LINK_OR_MENTION = re.compile(r"(?:\@|http?\://|https?\://|www)\S+")


def _strip_links_and_entities(value):
    """Return `value` without links, @mentions, #hashtags and punctuation.

    Missing values (NaN/None) are returned unchanged.
    """
    if pd.isna(value):
        return value
    return remove_entities(_LINK_OR_MENTION.sub(" ", value))


for _frame in (train, test):
    for _column in ("keyword", "text", "location"):
        _frame[_column] = _frame[_column].apply(_strip_links_and_entities)

In [8]:
#Remove url links
import re

# Replace @mentions and http(s)/www links with a single space in every
# non-missing value of the free-text columns, for both splits.
for frame in (train, test):
    for column in ("keyword", "text", "location"):
        frame[column] = frame[column].apply(
            lambda value: value
            if pd.isna(value)
            else re.sub(r"(?:\@|http?\://|https?\://|www)\S+", ' ', value)
        )

In [9]:
#remove punctuation

# Replace every character that is neither a word character nor whitespace with
# a space, in every non-missing value of the free-text columns of both splits.
for frame in (train, test):
    for column in ("keyword", "text", "location"):
        frame[column] = frame[column].apply(
            lambda value: value if pd.isna(value) else re.sub(r"[^\w\s]", " ", value)
        )

In [10]:
pip install symspellpy

In [11]:
#spelling correction

from symspellpy import SymSpell, Verbosity

import symspellpy

sym_spell = SymSpell()

dictionary_path = "./frequency_dictionary_en_82_765.txt"
if not os.path.isfile(dictionary_path):
    # No copy in the working directory: fall back to the frequency dictionary
    # that is distributed inside the installed symspellpy package.
    dictionary_path = os.path.join(
        os.path.dirname(symspellpy.__file__), "frequency_dictionary_en_82_765.txt"
    )

# Column 0 of the file holds the term, column 1 its frequency count.
# load_dictionary() signals a missing file by returning False instead of raising;
# ignoring that would leave the dictionary empty and make every later
# "correction" silently return its input unchanged.
if not sym_spell.load_dictionary(dictionary_path, 0, 1):
    raise FileNotFoundError("SymSpell frequency dictionary not found: " + dictionary_path)

def spelling_correction(sent):
    """Spell-correct a sentence word by word with the module-level ``sym_spell``.

    Args:
        sent: Text to correct (a ``str``).

    Returns:
        The first (closest) suggestion for each word, joined by single spaces.
        ``include_unknown=True`` makes the lookup return the word itself when no
        dictionary entry lies within the edit distance of 2.
    """
    corrected = []
    # split() without an argument skips the empty strings that split(" ") yields
    # for leading, trailing or repeated spaces. An empty string is not a word and
    # must not be sent through the spell checker, where it could be matched to a
    # short dictionary word.
    for tok in sent.split():
        suggestions = sym_spell.lookup(
            tok, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True
        )
        # Read the suggested term directly rather than parsing it back out of
        # the suggestion's string representation.
        corrected.append(suggestions[0].term)

    return " ".join(corrected)

# Spell-correct every non-missing value of the free-text columns in both splits.
for frame in (train, test):
    for column in ("keyword", "text", "location"):
        frame[column] = frame[column].apply(
            lambda value: value if pd.isna(value) else spelling_correction(value)
        )

#### Filling missing data

In [12]:
# Print the column summary (dtypes and non-null counts) of each split;
# both sets contain missing data.
for frame in (train, test):
    frame.info()

In [13]:
pip install sentence_transformers

In [14]:
#Keyword extraction to fill the missing data in the keyword column

import spacy #advanced NLP library
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import os
import importlib
import subprocess
import sys

# Download exactly the model that is loaded. The previous command fetched the
# "en" shortcut while the code loads "en_core_web_sm", and its exit status was
# ignored, so a failed download only surfaced later as a confusing load error.
SPACY_MODEL = 'en_core_web_sm'
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # Model not installed yet: install it with the interpreter running this
    # kernel, and fail loudly (check=True) if the download does not succeed.
    subprocess.run([sys.executable, '-m', 'spacy', 'download', SPACY_MODEL], check=True)
    importlib.invalidate_caches()  # make the freshly installed package importable
    nlp = spacy.load(SPACY_MODEL)

In [15]:
import tensorflow_hub as hub

# `model` is only the TF-Hub handle (a URL string); `embed` is the module loaded from it.
model = "https://tfhub.dev/google/nnlm-en-dim50/2"
embed = hub.load(handle=model)

def extract_keywords(nlp=nlp, doc="", no_of_keywords=5, model=model):
    """Return the words of `doc` whose embeddings are closest to the whole text.

    The text is lowercased and stripped of links/@mentions, punctuation and
    space-delimited numbers, then tagged with `nlp`. Verbs, nouns, adjectives and
    proper nouns become candidate keywords and are ranked by the cosine
    similarity between their embedding and the embedding of the cleaned text.

    Args:
        nlp: spaCy pipeline used for part-of-speech tagging.
        doc: Raw text to extract keywords from.
        no_of_keywords: Maximum number of keywords to return.
        model: Encoder exposing ``encode(list_of_str)``. When the given object
            has no ``encode`` method (this notebook passes the TF-Hub URL
            string), the already loaded module-level ``embed`` is used instead.

    Returns:
        Up to `no_of_keywords` candidate words ordered from least to most
        similar (best match last); an empty list when the text contains no
        candidate words.
    """
    doc = doc.lower()
    doc = re.sub(r'(?:\@|http?\://|https?\://|www)\S+', ' ', doc)
    doc = re.sub(r'[^\w\s]', ' ', doc)
    doc = re.sub(r' \d+ ', ' ', doc)  # raw string: '\d' is not a valid str escape

    # Word categories (part-of-speech tags) that may serve as keywords.
    pos_tag = ('VERB', 'NOUN', 'ADJ', 'PROPN')
    candidates = [token.text for token in nlp(doc) if token.pos_ in pos_tag]
    if not candidates:
        # Nothing to rank; cosine_similarity would reject an empty array.
        return []

    # A plain string has no encode(); fall back to the loaded hub module, which
    # is called directly on a list of strings.
    encode = model.encode if hasattr(model, "encode") else embed
    doc_embedding = np.asarray(encode([doc]))
    candidate_embeddings = np.asarray(encode(candidates))

    # Similarity between the document and each candidate word.
    distances = cosine_similarity(doc_embedding, candidate_embeddings)

    # argsort is ascending, so the last entries are the most similar candidates.
    return [candidates[index] for index in distances.argsort()[0][-no_of_keywords:]]

In [16]:
def _keyword_or_placeholder(text):
    """Return the best single keyword for `text`, or the string "NaN" on failure."""
    try:
        return extract_keywords(nlp=nlp, doc=text, no_of_keywords=1, model=model)[0]
    except Exception:
        # Best effort: a text that yields no keyword (or cannot be processed)
        # gets a string placeholder so the column ends up free of nulls.
        # `except Exception` still lets KeyboardInterrupt stop the cell.
        return "NaN"


# Fill the empty entries of the keyword column in both splits. A single
# .loc assignment is used instead of `frame["keyword"].iloc[i] = ...`, which is
# chained assignment and may write to a temporary copy rather than the frame.
for _frame in (train, test):
    _missing = _frame["keyword"].isna()
    _frame.loc[_missing, "keyword"] = _frame.loc[_missing, "text"].apply(_keyword_or_placeholder)

In [17]:
#Entity recognition

def get_location(nlp=nlp, doc=""):
    """Collect the place-like named entities mentioned in a text.

    Args:
        nlp: spaCy pipeline with a named-entity recognizer.
        doc: Text to scan.

    Returns:
        The texts of all entities labelled ``GPE`` or ``ORG``, in order of
        appearance and joined by single spaces; an empty string when there are
        none.
    """
    location_labels = ("GPE", "ORG")
    # `label_` is the string name of the entity type; `label` (no underscore) is
    # its integer id and therefore never equals "GPE" or "ORG".
    return " ".join(ent.text for ent in nlp(doc).ents if ent.label_ in location_labels)
def _location_or_placeholder(text):
    """Return the entities found by get_location for `text`, or "NaN" on failure."""
    try:
        return get_location(nlp=nlp, doc=text)
    except Exception:
        # Best effort: keep the column free of nulls even if a text cannot be
        # processed. `except Exception` still lets KeyboardInterrupt through.
        return "NaN"


# Fill the empty entries of the location column in both splits, always from the
# row's own `text` and always writing back into the frame being processed.
# (Previously the training loop's fallback wrote into `test`, and the test loop
# passed the whole row instead of its text, so every lookup failed.)
# A single .loc assignment replaces the chained `frame["location"].iloc[i] = ...`.
for _frame in (train, test):
    _missing = _frame["location"].isna()
    _frame.loc[_missing, "location"] = _frame.loc[_missing, "text"].apply(_location_or_placeholder)

#### Lemmatization

The goal of lemmatization is to reduce each word to its dictionary base form (its lemma), for example "running" → "run".

In [18]:
def lemmatize(sentence):
    """Return `sentence` with every token replaced by its lemma, joined by spaces."""
    return " ".join(tok.lemma_ for tok in nlp(sentence))

# Lemmatize every non-missing value of the free-text columns in both splits.
for frame in (train, test):
    for column in ("keyword", "text", "location"):
        frame[column] = frame[column].apply(
            lambda value: value if pd.isna(value) else lemmatize(value)
        )

In [19]:
#Stop words removal

def remove_stopwords(sentence):
    """Return `sentence` without tokens found in the pipeline's default stop-word set.

    The text is tokenized with the module-level `nlp`; the surviving token texts
    are joined by single spaces.
    """
    stop_words = nlp.Defaults.stop_words
    kept = [tok.text for tok in nlp(sentence) if tok.text not in stop_words]
    return " ".join(kept)

# Drop stop words from every non-missing value of the free-text columns in both splits.
for frame in (train, test):
    for column in ("keyword", "text", "location"):
        frame[column] = frame[column].apply(
            lambda value: value if pd.isna(value) else remove_stopwords(value)
        )

#### Model

In [20]:
from sklearn.utils import shuffle

# Shuffle the training rows reproducibly (fixed seed) and renumber the index from 0.
train = shuffle(train, random_state=42).reset_index(drop=True)

# Labels as a NumPy array, taken after the shuffle so they stay aligned with the rows.
y = np.asarray(train["target"].tolist())

# Embed each text column with the loaded hub module, in keyword/location/text order.
key_embed, loc_embed, text_embed = (
    embed(train[column].to_list()) for column in ("keyword", "location", "text")
)

In [22]:
from tensorflow.keras.layers import Input, Dense, Flatten, concatenate, Dropout
from tensorflow.keras import Model

def _dense_branch(branch_input):
    """Apply Flatten -> Dense(1024, relu) -> Dropout(0.5) to one input tensor."""
    hidden = Flatten()(branch_input)
    hidden = Dense(1024, activation='relu')(hidden)
    return Dropout(0.5)(hidden)


# One input per embedded column; the width is taken from the embedding's second dimension.
keyword_input = Input(shape=(key_embed.shape[1],))
location_input = Input(shape=(loc_embed.shape[1],))
text_input = Input(shape=(text_embed.shape[1],))

# Three identical branches, built in keyword, location, text order.
key_model = _dense_branch(keyword_input)
loc_model = _dense_branch(location_input)
text_model = _dense_branch(text_input)

# Merge the branches along the feature axis and finish with a single sigmoid unit.
merged = concatenate([key_model, loc_model, text_model], axis=1)
merged = Dense(1024, activation='relu')(merged)
merged = Dropout(0.5)(merged)

final = Model(
    inputs=[keyword_input, location_input, text_input],
    outputs=Dense(1, activation='sigmoid')(merged),
)

In [26]:
lr = 0.1 #learning rate
epochs = 100 #number of epochs
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
# NOTE(review): `decay` is a legacy optimizer argument - confirm that the
# installed TensorFlow version still accepts it.
opt = tf.keras.optimizers.SGD(learning_rate = lr, momentum = 0.8, decay = lr/epochs) #optimizer

final.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

# Early stopping on validation loss is configured here but is currently NOT
# passed to fit() (see the commented-out `callbacks` argument below).
earlystop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', mode = 'min', patience=10, verbose = 1
)

# Train on the three embedded inputs; the last 10% of the rows are held out
# for validation.
history = final.fit(
            x = [key_embed, loc_embed, text_embed],
            y = y,
            batch_size = 32,
            epochs = epochs,
#             callbacks = [earlystop],
            validation_split = 0.1
)

### Submission

In [27]:
# Load the submission template that the predictions will be written into.
submission_template_path = "/kaggle/input/nlp-getting-started/sample_submission.csv"
sample_submission = pd.read_csv(submission_template_path)

In [46]:
# Keep the test rows in their original order. The predictions are later written
# positionally into `sample_submission`, so shuffling `test` here would attach
# each prediction to the wrong row. Shuffling is only useful for training data.
# (Assumes sample_submission lists its rows in the same order as test.csv.)
key_embed_test = embed(test.keyword.to_list())
loc_embed_test = embed(test.location.to_list())
text_embed_test = embed(test.text.to_list())

In [75]:
# Round the model's sigmoid outputs to 0/1 class labels.
probabilities = final.predict([key_embed_test, loc_embed_test, text_embed_test])
prediction = np.round(probabilities)

# The assignment is positional: row i of the predictions goes to row i of the template.
sample_submission["target"] = prediction.astype('int64')

In [82]:
# Distribution of the predicted classes. DataFrame.value_counts() counts unique
# whole rows, which says nothing about how many rows were predicted as each
# class, so the count is taken on the `target` column instead.
sample_submission["target"].value_counts()

In [77]:
# Write the submission file to the working directory, without the DataFrame index.
submission_path = "submission.csv"
sample_submission.to_csv(submission_path, index=False)