In [4]:
import re
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# remove URLs, mentions, hashtags, and special characters
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#', '', text)     # Remove hashtags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

def tokenize_text(text):
    return word_tokenize(text)

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

def preprocess_text(text):
    text = clean_text(text)
    tokens = tokenize_text(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize_tokens(tokens)
    return ' '.join(tokens)

def preprocess_dataframe(df, text_column='text'):
    """Preprocess an entire dataframe column containing text."""
    df[text_column] = df[text_column].apply(preprocess_text)
    return df

print("preprocess.py has been loaded.")
print("available functions:", dir())

preprocess.py has been loaded.
available functions: ['In', 'Out', 'WordNetLemmatizer', '_', '__', '___', '__builtin__', '__builtins__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '__vsc_ipynb_file__', '_dh', '_i', '_i1', '_i2', '_i3', '_i4', '_ih', '_ii', '_iii', '_oh', 'clean_text', 'exit', 'f', 'file', 'get_ipython', 'lemmatize_tokens', 'lemmatizer', 'nltk', 'open', 'pd', 'preprocess_dataframe', 'preprocess_text', 'quit', 're', 'remove_stopwords', 'stop_words', 'stopwords', 'string', 'tokenize_text', 'word_tokenize']


[nltk_data] Downloading package punkt to C:\Users\Florian
[nltk_data]     Horwege\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Florian
[nltk_data]     Horwege\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Florian
[nltk_data]     Horwege\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
df = pd.read_csv('../data/external/train.csv')
sample = df.sample(10)
print(sample)

         id            keyword                      location  \
4109   5840          hailstorm  Newton Centre, Massachusetts   
2737   3935         devastated                     Banbridge   
5840   8346               ruin                           NaN   
6458   9239  suicide%20bombing                           NaN   
2669   3831           detonate                           NaN   
7131  10214            volcano                         Earth   
2679   3841           detonate                 Morioh, Japan   
6610   9466          terrorism                           NaN   
5741   8195               riot                      Belgrade   
2481   3560           desolate                           NaN   

                                                   text  target  
4109  Freak #Boston #hailstorm produces a hailstorm ...       1  
2737  'Er indoors will be devastated. RIP Arfur. #Ge...       1  
5840                             I ruin everything ????       0  
6458  Remembering Marlene Menah

In [6]:
clean_sample = preprocess_dataframe(sample)
print(clean_sample)

         id            keyword                      location  \
4109   5840          hailstorm  Newton Centre, Massachusetts   
2737   3935         devastated                     Banbridge   
5840   8346               ruin                           NaN   
6458   9239  suicide%20bombing                           NaN   
2669   3831           detonate                           NaN   
7131  10214            volcano                         Earth   
2679   3841           detonate                 Morioh, Japan   
6610   9466          terrorism                           NaN   
5741   8195               riot                      Belgrade   
2481   3560           desolate                           NaN   

                                                   text  target  
4109  freak produce hailstorm business autobody repa...       1  
2737                    er indoors devastated rip arfur       1  
5840                                    ruin everything       0  
6458  remembering marlene menah