## Data Cleaning: Cleaning, tokenizing, removing stopwords and lemmatizing the data

In [1]:
# Importing necessary libraries
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import treebank

# Download the nlkt tools
nltk.download('opinion_lexicon')
nltk.download('sentiwordnet')

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\danij\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\danij\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

### Read in data

In [2]:
train_data = pd.read_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\train.csv")
dev_data = pd.read_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\dev.csv")

# Concatenate the train and dev data
# data = pd.concat([train_data, dev_data])
data = train_data

# Test data
# test_data = pd.read_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\test.csv")
test_data = dev_data

# Set the display options to show the full content of each row
pd.set_option('display.max_colwidth', -1)

data.head()

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,tweet_id,therapy,text,label
0,1454224517895688192,adderall,wait until i get an adderall prescription. imma be on time for Everything,neutral
1,1426258820376842243,oxycodone,"@Sassychickie @kelly_rdc Fentanyl, OxyContin and Oxycodone! I’ve had 2 back surgeries. Never again!!!",negative
2,1473007602170798082,cbd,"a fun juggling act of mine is taking adderall and drinking coffee, then needing CBD in the afternoon to soothe my anxiety",neutral
3,1561156143405502466,percocet,percocet roxycodone with some xanax that i had crushed up in some dust\nelevated to another dimension so i got a limp in my strut,neutral
4,1559923718578741248,adderall,first day of adderall and i feel 😵‍💫😵‍💫😵‍💫😵‍💫,negative


In [3]:
print(len(data))
print(len(test_data))

3009
753


In [4]:
# Downloading the stopwords corpus from NLTK 
# words like "the", "is", "and" that are commonly used and can be ignored
stopwords = nltk.corpus.stopwords.words('english')

# Creating a WordNet lemmatizer object from NLTK (used for lemmatizing words to their base form based on context)
wn = nltk.WordNetLemmatizer()

In [7]:
# Function to clean the text by removing punctuation, converting to lowercase, and lemmatizing words
def clean_text(text):
    # Removing punctuation characters from the text and converting it to lowercase
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    # Splitting the text into tokens (words) using regular expressions that match just words
    tokens = re.split('\W+', text)
    # Lemmatizing each word in the tokens list using the WordNet lemmatizer
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    # Returning the cleaned text
    return text

In [8]:
## Applying the function to the train dataset and convert the 'cleaned_text' column from list to string
data['cleaned_text'] = data['text'].apply(lambda x: clean_text(x)).apply(' '.join)

## Applying the function to the test dataset and convert the 'cleaned_text' column from list to string
test_data['cleaned_text'] = test_data['text'].apply(lambda x: clean_text(x)).apply(' '.join)

# Show the dataset
data.head()

Unnamed: 0,tweet_id,therapy,text,label,cleaned_text
0,1454224517895688192,adderall,wait until i get an adderall prescription. imma be on time for Everything,neutral,wait get adderall prescription imma time everything
1,1426258820376842243,oxycodone,"@Sassychickie @kelly_rdc Fentanyl, OxyContin and Oxycodone! I’ve had 2 back surgeries. Never again!!!",negative,sassychickie kellyrdc fentanyl oxycontin oxycodone 2 back surgery never
2,1473007602170798082,cbd,"a fun juggling act of mine is taking adderall and drinking coffee, then needing CBD in the afternoon to soothe my anxiety",neutral,fun juggling act mine taking adderall drinking coffee needing cbd afternoon soothe anxiety
3,1561156143405502466,percocet,percocet roxycodone with some xanax that i had crushed up in some dust\nelevated to another dimension so i got a limp in my strut,neutral,percocet roxycodone xanax crushed dust elevated another dimension got limp strut
4,1559923718578741248,adderall,first day of adderall and i feel 😵‍💫😵‍💫😵‍💫😵‍💫,negative,first day adderall feel


### Save cleaned dataset

In [9]:
data.to_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\cleaned_text.csv", mode='w',index=False)

test_data.to_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\test_cleaned_text.csv", mode='w',index=False)