# Preprocessing

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import kagglehub
# import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import re

## Download NLTK data

In [2]:
# Fetch every NLTK resource used further down in this notebook.
nltk.download("stopwords") # for stopword removal
nltk.download('wordnet') # for lemmatization
nltk.download('averaged_perceptron_tagger_eng') # for part-of-speech tagging
# word_tokenize needs the Punkt models; without them it raises LookupError
# for every row of the tokenization step.
nltk.download('punkt_tab') # for tokenization

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nixosuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/nixosuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/nixosuser/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

## Load dataset

In [3]:
# Download the Kaggle dataset; the returned value is used below as the
# directory that holds train.csv and test.csv.
path = kagglehub.dataset_download("abhi8923shriv/sentiment-analysis-dataset")

# Build both CSV locations and echo them so the reader can see where the data lives.
train_dataset, test_dataset = (path + suffix for suffix in ("/train.csv", "/test.csv"))
for csv_location in (train_dataset, test_dataset):
    print(csv_location)

# Both files are read with the ISO-8859-1 encoding.
train_df, test_df = (
    pd.read_csv(csv_location, encoding="ISO-8859-1")
    for csv_location in (train_dataset, test_dataset)
)
train_df.head()

/home/nixosuser/.cache/kagglehub/datasets/abhi8923shriv/sentiment-analysis-dataset/versions/9/train.csv
/home/nixosuser/.cache/kagglehub/datasets/abhi8923shriv/sentiment-analysis-dataset/versions/9/test.csv


Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


## Remove rows with missing values

In [4]:
# Drop every row containing at least one missing value, modifying both
# frames in place so the same objects are used by the following cells.
for frame in (train_df, test_df):
    frame.dropna(inplace=True)

## Normalisation

In [5]:
def normalize_text(text):
    """Lowercase a text and strip markup, punctuation and surplus whitespace.

    Steps, in order:
      1. convert to lowercase;
      2. replace HTML tags (both literal ``<...>`` and entity-escaped
         ``&lt;...&gt;``) with a space;
      3. delete every character that is not an ASCII letter, a digit or
         whitespace (digits are kept);
      4. collapse runs of whitespace and trim both ends.

    Args:
        text (str): the text to be normalized

    Returns:
        str: the normalized text. A non-string input is not normalized; it
        is only converted with ``str()``.
    """
    if not isinstance(text, str):
        return str(text)

    text = text.lower()
    # Tags have to be removed while their delimiters still exist: once the
    # special characters are stripped, "<b>hello</b>" would collapse into
    # "bhellob" and an "&lt;&gt;" placeholder into the junk token "ltgt".
    # A space is substituted so the words on either side of a tag stay apart.
    text = re.sub(r'&lt;/?.*?&gt;', ' ', text)
    text = re.sub(r'<.*?>', ' ', text)
    # Keep only ASCII letters, digits and whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse repeated whitespace left behind by the removals above.
    return re.sub(r'\s+', ' ', text).strip()

# Store the normalized tweet text in a new column on both splits.
for frame in (train_df, test_df):
    frame['clean_text'] = frame['text'].apply(normalize_text)

train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60,id have responded if i were going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105,sooo sad i will miss you here in san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18,my boss is bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164,what interview leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26,sons of why couldnt they put them on the relea...


## Remove stopwords

In [6]:
# Stopword sets that have already been loaded, keyed by language name.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def remove_stopwords(text, stop_words=None):
    """Remove stopwords from a whitespace-separated text.

    Args:
        text (str): the text to filter; it is split on whitespace.
        stop_words (container of str, optional): the words to drop. Any
            object supporting ``in`` works; a set gives the fastest lookup.
            Defaults to the NLTK English stopword list.

    Returns:
        str: the remaining words joined by single spaces, or an empty
        string when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        # Loaded once and reused: calling stopwords.words() for every word
        # rebuilds the list each time and turns each lookup into a list scan.
        stop_words = _english_stopwords()
    return ' '.join(word for word in text.split() if word not in stop_words)
    
# Filter stopwords out of the normalized text for both splits.
for frame in (train_df, test_df):
    frame['text_without_stopwords'] = frame['clean_text'].apply(remove_stopwords)

train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_text,text_without_stopwords
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60,id have responded if i were going,id responded going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105,sooo sad i will miss you here in san diego,sooo sad miss san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18,my boss is bullying me,boss bullying
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164,what interview leave me alone,interview leave alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26,sons of why couldnt they put them on the relea...,sons couldnt put releases already bought


## Tokenization

In [9]:
def tokenize_text(text, tokenizer=None):
    """Split a text into a list of tokens.

    Args:
        text: the value to tokenize; it is converted with ``str()`` first.
        tokenizer (callable, optional): function taking a string and
            returning its tokens. Defaults to NLTK's ``word_tokenize``.

    Returns:
        list: the tokens, or an empty list when the tokenizer fails on this
        particular text (the error is printed).

    Raises:
        LookupError: when the tokenizer reports a missing resource. This is
            a setup problem affecting every row, so it is not swallowed.
    """
    if tokenizer is None:
        tokenizer = word_tokenize
    try:
        return tokenizer(str(text))
    except Exception as e:
        # A plain LookupError is what NLTK raises for a resource that was
        # never downloaded. Returning [] for it would silently empty the
        # whole column, so let it propagate.
        if type(e) is LookupError:
            raise
        print(f"Error tokenizing text: {e}")
        return []

# Tokenize the stopword-free text of both splits.
for frame in (train_df, test_df):
    frame['tokens'] = frame['text_without_stopwords'].apply(tokenize_text)

train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_text,text_without_stopwords,tokens,lemmatized_tokens
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60,id have responded if i were going,id responded going,"[id, responded, going]","[id, have, respond, if, i, be, go]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105,sooo sad i will miss you here in san diego,sooo sad miss san diego,"[sooo, sad, miss, san, diego]","[sooo, sad, i, will, miss, you, here, in, san,..."
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18,my boss is bullying me,boss bullying,"[boss, bullying]","[my, bos, be, bully, me]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164,what interview leave me alone,interview leave alone,"[interview, leave, alone]","[what, interview, leave, me, alone]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26,sons of why couldnt they put them on the relea...,sons couldnt put releases already bought,"[sons, couldnt, put, releases, already, bought]","[son, of, why, couldnt, they, put, them, on, t..."


## Lemmatization with POS tagging

In [10]:
# Single WordNet lemmatizer instance, used by lemmatize_text for every token.
nltk_lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Map a POS tag to the matching WordNet part-of-speech constant.

    Only the first letter of the tag decides the result.

    Args:
        treebank_tag (str): a POS tag such as those returned by nltk.pos_tag.

    Returns:
        The WordNet constant for adjective (J), verb (V), noun (N) or
        adverb (R); any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def lemmatize_text(tokens):
    """Lemmatize a list of tokens using their part-of-speech tags.

    The tokens are POS-tagged together, each tag is translated with
    get_wordnet_pos, and each token is lemmatized with that part of speech.

    Args:
        tokens (list): the tokens to lemmatize.

    Returns:
        list: one lemma per input token, in the same order.
    """
    return [
        nltk_lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in nltk.pos_tag(tokens)
    ]

# Lemmatize the token lists of both splits.
for frame in (train_df, test_df):
    frame["lemmatized_tokens"] = frame["tokens"].apply(lemmatize_text)

train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_text,text_without_stopwords,tokens,lemmatized_tokens
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60,id have responded if i were going,id responded going,"[id, responded, going]","[id, respond, go]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105,sooo sad i will miss you here in san diego,sooo sad miss san diego,"[sooo, sad, miss, san, diego]","[sooo, sad, miss, san, diego]"
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18,my boss is bullying me,boss bullying,"[boss, bullying]","[bos, bully]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164,what interview leave me alone,interview leave alone,"[interview, leave, alone]","[interview, leave, alone]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26,sons of why couldnt they put them on the relea...,sons couldnt put releases already bought,"[sons, couldnt, put, releases, already, bought]","[son, couldnt, put, release, already, buy]"


## Save preprocessed data

In [None]:
# Write both preprocessed splits to the working directory, without the index.
outputs = (
    (train_df, "train_preprocessed.csv"),
    (test_df, "test_preprocessed.csv"),
)
for frame, destination in outputs:
    frame.to_csv(destination, index=False)