# Dataset cleaning

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import kagglehub
# import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import re

## Get NLTK and Spacy data

In [2]:
nltk.download("stopwords")
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nixosuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/nixosuser/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/nixosuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Load dataset

In [4]:
path = kagglehub.dataset_download("abhi8923shriv/sentiment-analysis-dataset")
train_dataset = path+"/train.csv"
test_dataset = path+"/test.csv"
print(train_dataset)
print(test_dataset)
train_df = pd.read_csv(train_dataset, encoding="ISO-8859-1")
test_df = pd.read_csv(test_dataset, encoding="ISO-8859-1")
train_df.head()

/home/nixosuser/.cache/kagglehub/datasets/abhi8923shriv/sentiment-analysis-dataset/versions/9/train.csv
/home/nixosuser/.cache/kagglehub/datasets/abhi8923shriv/sentiment-analysis-dataset/versions/9/test.csv


Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


## Remove missing values

In [5]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

## Remove unnecessary characters

In [6]:
def remove_unnecessary_characters(text):
    text = re.sub(r'<.*?>', '', str(text))
    text = re.sub(r'[^a-zA-Z0-9\s]', '', str(text))
    text = re.sub(r'\s+', ' ', str(text)).strip()
    return text

train_df['clean_text'] = train_df['text'].apply(remove_unnecessary_characters)
test_df['clean_text'] = test_df['text'].apply(remove_unnecessary_characters)

train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60,Id have responded if I were going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105,Sooo SAD I will miss you here in San Diego
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18,my boss is bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164,what interview leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26,Sons of why couldnt they put them on the relea...


## Remove stopwords

In [7]:
filtered_words = []
def remove_stopwords(text):
    if isinstance(text, str):
        words = text.split()
        filtered_words = [word for word in words if word not in stopwords.words('english')]
        filtered_text = ' '.join(filtered_words)
    else:
        filtered_text = ''
    return filtered_text
    
train_df['text_without_stopwords'] = train_df['text'].apply(remove_stopwords)
test_df['text_without_stopwords'] = test_df['text'].apply(remove_stopwords)

train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_text,text_without_stopwords
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60,Id have responded if I were going,"I`d responded, I going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105,Sooo SAD I will miss you here in San Diego,Sooo SAD I miss San Diego!!!
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18,my boss is bullying me,boss bullying me...
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164,what interview leave me alone,interview! leave alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26,Sons of why couldnt they put them on the relea...,"Sons ****, couldn`t put releases already bought"


## Normalisation

In [8]:
def normalize_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
    else:
        text = str(text)
    return text

train_df['normalized_text'] = train_df['text'].apply(normalize_text)
test_df['normalized_text'] = test_df['text'].apply(normalize_text)

train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_text,text_without_stopwords,normalized_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60,Id have responded if I were going,"I`d responded, I going",id have responded if i were going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105,Sooo SAD I will miss you here in San Diego,Sooo SAD I miss San Diego!!!,sooo sad i will miss you here in san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18,my boss is bullying me,boss bullying me...,my boss is bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164,what interview leave me alone,interview! leave alone,what interview leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26,Sons of why couldnt they put them on the relea...,"Sons ****, couldn`t put releases already bought",sons of why couldnt they put them on the relea...


## Tokenization

In [9]:
def tokenize_text(text):
    try:
        text = str(text)
        tokens = word_tokenize(text)
        return tokens
    except Exception as e:
        print(f"Error tokenizing text: {e}")
        return []

train_df['tokens'] = train_df['text'].apply(tokenize_text)
test_df['tokens'] = test_df['text'].apply(tokenize_text)

train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_text,text_without_stopwords,normalized_text,tokens
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60,Id have responded if I were going,"I`d responded, I going",id have responded if i were going,"[I, `, d, have, responded, ,, if, I, were, going]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105,Sooo SAD I will miss you here in San Diego,Sooo SAD I miss San Diego!!!,sooo sad i will miss you here in san diego,"[Sooo, SAD, I, will, miss, you, here, in, San,..."
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18,my boss is bullying me,boss bullying me...,my boss is bullying me,"[my, boss, is, bullying, me, ...]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164,what interview leave me alone,interview! leave alone,what interview leave me alone,"[what, interview, !, leave, me, alone]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26,Sons of why couldnt they put them on the relea...,"Sons ****, couldn`t put releases already bought",sons of why couldnt they put them on the relea...,"[Sons, of, *, *, *, *, ,, why, couldn, `, t, t..."


## Lemmatization

In [10]:
nltk_lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    lemmatized_text = []
    for word in text:
        lemmatized_text.append(nltk_lemmatizer.lemmatize(word))
    return lemmatized_text

train_df['lemmatized_text'] = train_df['tokens'].apply(lemmatize_text)
test_df['lemmatized_text'] = test_df['tokens'].apply(lemmatize_text)

train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_text,text_without_stopwords,normalized_text,tokens,lemmatized_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60,Id have responded if I were going,"I`d responded, I going",id have responded if i were going,"[I, `, d, have, responded, ,, if, I, were, going]","[I, `, d, have, responded, ,, if, I, were, going]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105,Sooo SAD I will miss you here in San Diego,Sooo SAD I miss San Diego!!!,sooo sad i will miss you here in san diego,"[Sooo, SAD, I, will, miss, you, here, in, San,...","[Sooo, SAD, I, will, miss, you, here, in, San,..."
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18,my boss is bullying me,boss bullying me...,my boss is bullying me,"[my, boss, is, bullying, me, ...]","[my, bos, is, bullying, me, ...]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164,what interview leave me alone,interview! leave alone,what interview leave me alone,"[what, interview, !, leave, me, alone]","[what, interview, !, leave, me, alone]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26,Sons of why couldnt they put them on the relea...,"Sons ****, couldn`t put releases already bought",sons of why couldnt they put them on the relea...,"[Sons, of, *, *, *, *, ,, why, couldn, `, t, t...","[Sons, of, *, *, *, *, ,, why, couldn, `, t, t..."


## Stemming

In [11]:
nltk_stemmer = LancasterStemmer()

def stem_text(text):
    stemmed_text = []
    for word in text:
        stemmed_text.append(nltk_stemmer.stem(word))
    return stemmed_text

train_df['stemmed_text'] = train_df['tokens'].apply(stem_text)
test_df['stemmed_text'] = test_df['tokens'].apply(stem_text)

train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_text,text_without_stopwords,normalized_text,tokens,lemmatized_text,stemmed_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60,Id have responded if I were going,"I`d responded, I going",id have responded if i were going,"[I, `, d, have, responded, ,, if, I, were, going]","[I, `, d, have, responded, ,, if, I, were, going]","[i, `, d, hav, respond, ,, if, i, wer, going]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105,Sooo SAD I will miss you here in San Diego,Sooo SAD I miss San Diego!!!,sooo sad i will miss you here in san diego,"[Sooo, SAD, I, will, miss, you, here, in, San,...","[Sooo, SAD, I, will, miss, you, here, in, San,...","[sooo, sad, i, wil, miss, you, her, in, san, d..."
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18,my boss is bullying me,boss bullying me...,my boss is bullying me,"[my, boss, is, bullying, me, ...]","[my, bos, is, bullying, me, ...]","[my, boss, is, bul, me, ...]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164,what interview leave me alone,interview! leave alone,what interview leave me alone,"[what, interview, !, leave, me, alone]","[what, interview, !, leave, me, alone]","[what, interview, !, leav, me, alon]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26,Sons of why couldnt they put them on the relea...,"Sons ****, couldn`t put releases already bought",sons of why couldnt they put them on the relea...,"[Sons, of, *, *, *, *, ,, why, couldn, `, t, t...","[Sons, of, *, *, *, *, ,, why, couldn, `, t, t...","[son, of, *, *, *, *, ,, why, couldn, `, t, th..."


## Credits

[😊| Sentiment Analysis by NLP🤨😑](https://www.kaggle.com/code/mohsinsial/sentiment-analysis-by-nlp)

[Sentiment Analysis: Machine Learning Approach](https://www.kaggle.com/code/poojag718/sentiment-analysis-machine-learning-approach)

[Text classification with BERT embeddings and LLM f](https://www.kaggle.com/code/adamgassem/text-classification-with-bert-embeddings-and-llm-f)