## Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [3]:
pd.set_option('display.max_colwidth', 100)

## Data Collection

In [4]:
train_path = "data/train.csv"

In [5]:
df = pd.read_csv(train_path)

In [6]:
df.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1


## Preprocessing

In [7]:
import re

In [8]:
def lowercase_first_word(text):
    """Return ``text`` with its leading word converted to lowercase.

    The leading word is the run of regex word characters (letters, digits,
    underscore) found at the very beginning of the string. Only that leading
    run is changed; the rest of the text keeps its original casing.

    Text that does not start with a word character (for example it begins
    with ``#``, ``@``, whitespace, or is empty) is returned unchanged.
    """
    leading = re.match(r'^\w+', text)

    # Guard clause: nothing to lowercase when the text does not open with a word.
    if leading is None:
        return text

    head = leading.group(0)
    # The match is anchored at index 0, so swapping the prefix is all that is needed.
    return head.lower() + text[len(head):]

In [9]:
df['basic_text'] = df['text'].apply(lowercase_first_word)

In [10]:
df.head(3)

Unnamed: 0,id,keyword,location,text,target,basic_text
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1,all residents asked to 'shelter in place' are being notified by officers. No other evacuation or...


## Data Split

In [11]:
X = df['basic_text']
y = df['target']

In [12]:
y.value_counts(normalize=True)

target
0    0.57034
1    0.42966
Name: proportion, dtype: float64

In [13]:
# Decision cut-off applied to the predicted probability taken from the second
# column of `predict_proba`: a row is labelled 1 when that probability is
# >= threshold, otherwise 0.
# NOTE(review): the rationale for 0.45 instead of the usual 0.5 is not stated
# in the notebook — confirm how this value was chosen.
threshold = 0.45

In [14]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): no `stratify` argument is passed, so class proportions in the two
# parts are whatever the shuffle produces.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

---

# Basic Bag of Words
- Baseline: bag-of-words features built from the text after a single, simple preprocessing step (lowercasing the first word).

In [15]:
# Bag of Words (CountVectorizer) with all default settings.
# NOTE(review): per the scikit-learn documentation CountVectorizer lowercases its
# input by default, so the first-word lowercasing done for `basic_text` should not
# change the features produced here — confirm whether that step is still wanted.
vectorizer = CountVectorizer()

In [16]:
# Learn the vocabulary from the training texts only, then encode the test texts
# with that same vocabulary so no information from the test set leaks into the features.
# The count matrices are cast to float64 — presumably for the LightGBM model
# trained below; TODO confirm the cast is required.
X_train_bow = vectorizer.fit_transform(X_train).astype('float64')
X_test_bow = vectorizer.transform(X_test).astype('float64')

In [17]:
from lightgbm import LGBMClassifier

In [18]:
model = LGBMClassifier(random_state=42)

In [19]:
model.fit(X_train_bow, y_train)

[LightGBM] [Info] Number of positive: 2622, number of negative: 3468
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1860
[LightGBM] [Info] Number of data points in the train set: 6090, number of used features: 705
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.430542 -> initscore=-0.279641
[LightGBM] [Info] Start training from score -0.279641


In [20]:
probs = model.predict_proba(X_test_bow)[:, 1]

In [21]:
predictions = np.where(probs >= threshold, 1, 0)

In [22]:
f1_score(y_test, predictions)

0.7460317460317459

In [23]:
recall_score(y_test, predictions)

0.724191063174114

In [24]:
precision_score(y_test, predictions)

0.7692307692307693

- Before continuing, I'll create a function that runs the BOW vectorization, trains the model, and reports the metrics.

In [25]:
from sklearn.metrics import roc_auc_score

In [26]:
def train_bow(text_column: str, target: str, threshold: float, dataframe: pd.DataFrame) -> None:
    """Train a LightGBM classifier on bag-of-words counts and print hold-out metrics.

    The rows of ``dataframe`` are split 80/20 with a fixed seed, a default
    ``CountVectorizer`` is fitted on the training texts only, and an
    ``LGBMClassifier(random_state=42)`` is trained on the resulting counts.
    Test predictions are obtained by thresholding the probability found in
    the second column of ``predict_proba``.

    Parameters
    ----------
    text_column : str
        Name of the column in ``dataframe`` holding the text to vectorize.
    target : str
        Name of the column in ``dataframe`` holding the labels.
    threshold : float
        A test row is predicted as 1 when its probability is ``>= threshold``,
        otherwise 0.
    dataframe : pd.DataFrame
        Source data. It is only read, never modified.

    Returns
    -------
    None
        The AUC (computed from the raw probabilities, so it does not depend on
        ``threshold``), F1, recall and precision on the test split are printed
        with four decimals.
    """
    # Column selection and train_test_split only read from `dataframe`, so the
    # previous deep copy of the whole frame was unnecessary work and memory.
    texts = dataframe[text_column]
    labels = dataframe[target]

    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # Vocabulary is learned on the training texts only and reused for the test texts.
    vectorizer = CountVectorizer()
    X_train_bow = vectorizer.fit_transform(X_train).astype('float64')
    X_test_bow = vectorizer.transform(X_test).astype('float64')

    model = LGBMClassifier(random_state=42).fit(X_train_bow, y_train)

    # Second column of predict_proba, thresholded into hard 0/1 predictions.
    probs = model.predict_proba(X_test_bow)[:, 1]
    y_test_pred = np.where(probs >= threshold, 1, 0)

    f1_score_value = f1_score(y_test, y_test_pred)
    recall_value = recall_score(y_test, y_test_pred)
    precision_value = precision_score(y_test, y_test_pred)
    auc_score = roc_auc_score(y_test, probs)

    print(f"AUC Score: {auc_score:.4f}")
    print(f"F1 Score: {f1_score_value:.4f}")
    print(f"Recall: {recall_value:.4f}")
    print(f"Precision: {precision_value:.4f}")

---

# Lemmatization and Stemming

In [27]:
import nltk
from nltk.stem import PorterStemmer

In [28]:
stemmer = PorterStemmer()

In [29]:
def stem_text(text):
    """Stem each whitespace-separated token of ``text`` and re-join with single spaces.

    Tokens are produced by ``str.split()`` (any run of whitespace is a
    separator), each one is passed to the module-level ``stemmer.stem``, and
    the results are joined back with one space between them.
    """
    return ' '.join(stemmer.stem(token) for token in text.split())

In [30]:
df['stemmed_text'] = df['basic_text'].apply(stem_text)

In [31]:
import spacy

In [32]:
nlp = spacy.load('en_core_web_sm')

In [33]:
def lemmatize_text(text):
    """Return the lemmas of every token in ``text`` joined by single spaces.

    The text is run through the module-level ``nlp`` pipeline and the
    ``lemma_`` of each resulting token is kept, in order. No tokens are
    filtered out.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

In [34]:
df['lemmatized_text'] = df['basic_text'].apply(lemmatize_text)

In [35]:
df.head(3)

Unnamed: 0,id,keyword,location,text,target,basic_text,stemmed_text,lemmatized_text
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,our deed are the reason of thi #earthquak may allah forgiv us all,our deed be the Reason of this # earthquake may ALLAH forgive we all
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near La Ronge Sask. Canada,forest fire near La rong sask. canada,forest fire near La Ronge Sask . Canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1,all residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,all resid ask to 'shelter in place' are be notifi by officers. No other evacu or shelter in plac...,all resident ask to ' shelter in place ' be be notify by officer . no other evacuation or shelte...


In [36]:
train_bow(
    text_column="lemmatized_text",
    target="target",
    threshold=threshold,
    dataframe=df
)

[LightGBM] [Info] Number of positive: 2622, number of negative: 3468
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027354 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1819
[LightGBM] [Info] Number of data points in the train set: 6090, number of used features: 673
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.430542 -> initscore=-0.279641
[LightGBM] [Info] Start training from score -0.279641
AUC Score: 0.8506
F1 Score: 0.7447
Recall: 0.7304
Precision: 0.7596


In [37]:
train_bow(
    text_column="stemmed_text",
    target="target",
    threshold=threshold,
    dataframe=df
)

[LightGBM] [Info] Number of positive: 2622, number of negative: 3468
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1881
[LightGBM] [Info] Number of data points in the train set: 6090, number of used features: 700
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.430542 -> initscore=-0.279641
[LightGBM] [Info] Start training from score -0.279641
AUC Score: 0.8486
F1 Score: 0.7434
Recall: 0.7211
Precision: 0.7672


In [38]:
train_bow(
    text_column="basic_text",
    target="target",
    threshold=threshold,
    dataframe=df
)

[LightGBM] [Info] Number of positive: 2622, number of negative: 3468
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028427 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1860
[LightGBM] [Info] Number of data points in the train set: 6090, number of used features: 705
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.430542 -> initscore=-0.279641
[LightGBM] [Info] Start training from score -0.279641
AUC Score: 0.8417
F1 Score: 0.7460
Recall: 0.7242
Precision: 0.7692


---

# POS tagging and Stop Words removal

In [39]:
# Rebind the global `nlp` to a pipeline loaded with the parser, NER and textcat
# components disabled. The POS-based preprocessing below only reads pos_, lemma_,
# is_stop and is_punct from each token — presumably the other components are
# switched off to speed processing up; TODO confirm.
# Any function that reads the global `nlp` (e.g. lemmatize_text) uses this
# pipeline if it is called after this cell runs.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'textcat'])

In [40]:
def preprocess_text_with_pos(text: str, pos_to_keep: "list | tuple" = ('NOUN', 'VERB', 'ADJ')) -> str:
    """
    Keep only the lemmas of tokens whose Part-of-Speech (POS) tag is allowed.

    The text is processed with the module-level ``nlp`` pipeline. A token is
    kept when its coarse POS tag (``token.pos_``) is in ``pos_to_keep`` and it
    is neither a stop word nor punctuation. The lemmas of the kept tokens are
    joined with single spaces.

    Parameters
    ----------
    text : str
        The input text to preprocess.

    pos_to_keep : list or tuple of str, optional
        POS tags to keep. The default is ('NOUN', 'VERB', 'ADJ'). The default
        is an immutable tuple so it cannot be altered between calls. A single
        tag given as a plain string (e.g. 'NOUN') is treated as a one-element
        collection.

        Coarse-grained (Universal POS) tags used by SpaCy include:
        - 'NOUN' : Noun
        - 'VERB' : Verb
        - 'AUX'  : Auxiliary verb
        - 'ADJ'  : Adjective
        - 'ADV'  : Adverb
        - 'PRON' : Pronoun
        - 'PROPN': Proper Noun
        - 'DET'  : Determiner
        - 'ADP'  : Adposition (prepositions like "in", "on", etc.)
        - 'CCONJ': Coordinating conjunction
        - 'SCONJ': Subordinating conjunction
        - 'NUM'  : Numeral
        - 'PART' : Particle
        - 'PUNCT': Punctuation (tokens flagged as punctuation are always
                   dropped, whatever this list contains)

    Returns
    -------
    str
        The space-separated lemmas of the kept tokens. Empty string when no
        token qualifies.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(pos_to_keep, str):
        pos_to_keep = (pos_to_keep,)

    # Build the lookup once instead of scanning a list for every token.
    allowed_pos = frozenset(pos_to_keep)

    # Process the text using SpaCy's NLP pipeline
    doc = nlp(text)

    # Keep the lemma of each token that has an allowed POS and is neither a
    # stop word nor punctuation.
    return ' '.join(
        token.lemma_
        for token in doc
        if token.pos_ in allowed_pos and not token.is_stop and not token.is_punct
    )

In [41]:
# Built from the raw `text` column, not `basic_text`, so the first word keeps
# its original casing when it reaches the SpaCy pipeline.
# NOTE(review): confirm that skipping the `basic_text` step here is intentional.
df['pos_tagging_text'] = df['text'].apply(preprocess_text_with_pos)

In [42]:
train_bow(
    text_column="pos_tagging_text",
    target="target",
    threshold=threshold,
    dataframe=df
)

[LightGBM] [Info] Number of positive: 2622, number of negative: 3468
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014073 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 935
[LightGBM] [Info] Number of data points in the train set: 6090, number of used features: 370
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.430542 -> initscore=-0.279641
[LightGBM] [Info] Start training from score -0.279641
AUC Score: 0.7937
F1 Score: 0.6913
Recall: 0.6641
Precision: 0.7207


# NER and Stop Words removal

In [43]:
# Rebind the global `nlp` again, this time leaving NER enabled (only the parser
# and textcat are disabled): the preprocessing below reads `token.ent_type_`.
# Any function that reads the global `nlp` uses this pipeline from here on.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'textcat'])

In [44]:
def preprocess_text_with_ner_and_stopwords(text: str, keep_entities: bool = True) -> str:
    """
    Drop stop words and punctuation, lemmatize the rest, optionally sparing named entities.

    The text is processed with the module-level ``nlp`` pipeline. Tokens
    flagged as stop words or punctuation are discarded. Each remaining token
    contributes either its original text or its lemma to the output.

    Parameters
    ----------
    text : str
        The input text to preprocess.

    keep_entities : bool, optional
        If True (default), tokens that belong to a named entity (non-empty
        ``ent_type_``) are emitted with their original text. If False, every
        remaining token is emitted as its lemma.

    Returns
    -------
    str
        The kept tokens joined with single spaces.

    """
    kept = []

    for token in nlp(text):
        # Guard: stop words and punctuation never reach the output.
        if token.is_stop or token.is_punct:
            continue

        # Entity tokens keep their surface form when requested; all others are lemmatized.
        kept.append(token.text if (token.ent_type_ and keep_entities) else token.lemma_)

    return ' '.join(kept)

In [45]:
# Built from the raw `text` column, not `basic_text`, so the first word keeps
# its original casing when it reaches the SpaCy pipeline.
# NOTE(review): confirm that skipping the `basic_text` step here is intentional.
df['ner_text'] = df['text'].apply(preprocess_text_with_ner_and_stopwords)

In [46]:
train_bow(
    text_column="ner_text",
    target="target",
    threshold=threshold,
    dataframe=df
)

[LightGBM] [Info] Number of positive: 2622, number of negative: 3468
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020317 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1402
[LightGBM] [Info] Number of data points in the train set: 6090, number of used features: 539
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.430542 -> initscore=-0.279641
[LightGBM] [Info] Start training from score -0.279641
AUC Score: 0.8445
F1 Score: 0.7360
Recall: 0.7088
Precision: 0.7654
