## Multi-class classification of sentiment associated with therapies in English tweets

### Read in & clean text

In [1]:
import pandas as pd

train_data = pd.read_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\train.csv")
dev_data = pd.read_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\dev.csv")

# Concatenate the train and dev data
data = pd.concat([train_data, dev_data])

data.head()

Unnamed: 0,tweet_id,therapy,text,label
0,1550591923047600131,cannabis,@chuckschumer YES. Please. Cannabis is legal i...,neutral
1,1496301299691839491,adderall,"@youdoingtoomuch I’m a busy girl, adderall kee...",positive
2,1460587790966657024,adderall,adderall adderall caffeine caffeine caffeine k...,neutral
3,1393586192625528832,alprazolam,@justky1018 See if you can get your doctor to ...,neutral
4,1561452418285547520,diazepam,@feytaline Reminds me of the time I had a roug...,positive


In [2]:
# Importing necessary libraries
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# Downloading the stopwords corpus from NLTK (words like "the", "is", "and" that are 
# commonly used and can be ignored)
stopwords = nltk.corpus.stopwords.words('english')

# Creating a Porter stemmer object from NLTK (used for stemming words to their base form)
ps = nltk.PorterStemmer()

# Reading the data from the "train.csv" file into a Pandas DataFrame
data = pd.read_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\train.csv")

# Function to count the percentage of punctuation characters in a given text
def count_punct(text):
    # Counting the number of punctuation characters in the text
    count = sum([1 for char in text if char in string.punctuation])
    # Calculating the percentage of punctuation characters (excluding spaces) in the text
    return round(count/(len(text) - text.count(" ")), 3) * 100

# Applying the 'count_punct' function to the 'text' column and storing the result in 
# a new 'body_len' column
data['body_len'] = data['text'].apply(lambda x: len(x) - x.count(" "))

# Applying the 'count_punct' function to the 'body_text' column and storing the result in 
# a new 'punct%' column
data['punct%'] = data['text'].apply(lambda x: count_punct(x))

# Function to clean the text by removing punctuation, converting to lowercase, and stemming words
def clean_text(text):
    # Removing punctuation characters from the text and converting it to lowercase
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    # Splitting the text into tokens (words) using regular expressions
    tokens = re.split('\W+', text)
    # Stemming each word in the tokens list using the Porter stemmer
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    # Returning the cleaned text
    return text

### Add word sentiment function

In [3]:
import nltk
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\danij\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

Define una función que calcule el sentimiento de cada palabra en un texto, utilizando el léxico de sentimientos apropiado. Aquí tienes un ejemplo utilizando SentiWordNet:

In [4]:
from nltk.corpus import sentiwordnet as swn

def get_word_sentiment(word):
    synsets = list(swn.senti_synsets(word))
    if synsets:
        sentiment = synsets[0].pos_score() - synsets[0].neg_score()
        return sentiment
    return 0.0

Itera sobre cada texto en tu dataset y para cada palabra en el texto, utiliza la función get_word_sentiment para obtener el sentimiento de esa palabra. Puedes almacenar los sentimientos en una nueva lista o como una columna adicional en tu dataset.

In [6]:
import numpy as np

data['sentiments'] = data['text'].apply(lambda text: np.mean([get_word_sentiment(word) for word in nltk.word_tokenize(text) if get_word_sentiment(word) != 0]))

data['sentiments'] = data['sentiments'].fillna(np.mean(data['sentiments']))

  return _methods._mean(a, axis=axis, dtype=dtype,


### Add more sentiment related features

In [None]:
from nltk.corpus import opinion_lexicon
from nltk.tokenize import treebank
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the opinion lexicon
nltk.download('opinion_lexicon')

# This tokenizer uses the Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank
tokenizer = treebank.TreebankWordTokenizer()

# Create a Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Function to count positive and negative words
def count_pos_neg_words(text):
    pos_words = 0
    neg_words = 0

    tokenized_text = [word.lower() for word in tokenizer.tokenize(text)]

    for word in tokenized_text:
        if word in opinion_lexicon.positive():
            pos_words += 1
        elif word in opinion_lexicon.negative():
            neg_words += 1

    return pos_words, neg_words

# Apply the function to the text column
data['pos_word_count'], data['neg_word_count'] = zip(*data['text'].map(count_pos_neg_words))

# Function to get sentiment intensity
def get_sentiment_intensity(text):
    sentiment = sia.polarity_scores(text)
    return sentiment['compound']

# Apply the function to the text column
data['sentiment_intensity'] = data['text'].apply(get_sentiment_intensity)

### Split into train/test

In [7]:
# Importing the train_test_split function from the sklearn.model_selection module
from sklearn.model_selection import train_test_split

# Splitting the data into training and testing sets
# The 'body_text', 'body_len', and 'punct%' columns are used as the features (X)
# The 'label' column is used as the target variable (y)
# The test_size parameter is set to 0.2, which means 20% of the data will be used for testing
X_train, X_test, y_train, y_test = train_test_split(data[['text', 'body_len', 'punct%', 'sentiments']], data['label'], test_size=0.2)

In [8]:
# Show data before vectorizing
data.head()

Unnamed: 0,tweet_id,therapy,text,label,body_len,punct%,sentiments
0,1550591923047600131,cannabis,@chuckschumer YES. Please. Cannabis is legal i...,neutral,239,6.3,-0.075
1,1496301299691839491,adderall,"@youdoingtoomuch I’m a busy girl, adderall kee...",positive,67,6.0,0.4375
2,1460587790966657024,adderall,adderall adderall caffeine caffeine caffeine k...,neutral,166,3.6,0.15
3,1393586192625528832,alprazolam,@justky1018 See if you can get your doctor to ...,neutral,129,2.3,0.024545
4,1561452418285547520,diazepam,@feytaline Reminds me of the time I had a roug...,positive,236,4.7,0.0875


### Vectorize text

In [9]:
# Creating a TfidfVectorizer object with the analyzer parameter set to the clean_text function
tfidf_vect = TfidfVectorizer(analyzer=clean_text)

# Fitting the TfidfVectorizer on the 'text' column of the training set
tfidf_vect_fit = tfidf_vect.fit(X_train['text'])

# Transforming the 'text' column of the training and testing sets into TF-IDF features
tfidf_train = tfidf_vect_fit.transform(X_train['text'])
tfidf_test = tfidf_vect_fit.transform(X_test['text'])

# Concatenating the 'body_len' and 'punct%' and 'sentiments' columns with the TF-IDF features of the training set
X_train_vect = pd.concat([X_train[['body_len', 'punct%', 'sentiments']].reset_index(drop=True), 
                          pd.DataFrame(tfidf_train.toarray())], axis=1)

# Concatenating the 'body_len' and 'punct%'  and 'sentiments'columns with the TF-IDF features of the testing set
X_test_vect = pd.concat([X_test[['body_len', 'punct%', 'sentiments']].reset_index(drop=True), 
                         pd.DataFrame(tfidf_test.toarray())], axis=1)

# Displaying the head (first few rows) of the X_train_vect DataFrame
X_train_vect.head()

Unnamed: 0,body_len,punct%,sentiments,0,1,2,3,4,5,6,...,7694,7695,7696,7697,7698,7699,7700,7701,7702,7703
0,301,5.3,-0.278846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,47,2.1,0.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,125,1.6,-0.229167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,112,1.8,-0.175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,171,3.5,-0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Final evaluation of models

In [10]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [12]:
# Importing the RandomForestClassifier from the sklearn.ensemble module
from sklearn.ensemble import RandomForestClassifier
import time
from sklearn.metrics import precision_recall_fscore_support as score

# Creating a RandomForestClassifier object with specified parameters
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

# Measuring the time taken to fit (train) the RandomForestClassifier on the training data
start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

# Measuring the time taken to make predictions using the trained RandomForestClassifier 
# on the testing data
start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

# Computing precision, recall, fscore, and support values for the predicted results
precision, recall, fscore, support = score(y_test, y_pred, average='macro')

# Printing the precision, recall, and F1-score
print('Macro Average Precision:', precision)
print('Macro Average Recall:', recall)
print('Macro Average F1-score:', fscore)
print('Support:', support)



Macro Average Precision: 0.8657654650645306
Macro Average Recall: 0.5748310044095516
Macro Average F1-score: 0.6346955869918399
Support: None




In [13]:
# Importing the GradientBoostingClassifier from the sklearn.ensemble module
from sklearn.ensemble import GradientBoostingClassifier
import time
from sklearn.metrics import precision_recall_fscore_support as score

# Creating a GradientBoostingClassifier object with specified parameters
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

# Measuring the time taken to fit (train) the GradientBoostingClassifier on the training data
start = time.time()
gb_model = gb.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

# Measuring the time taken to make predictions using the trained GradientBoostingClassifier on 
# the testing data
start = time.time()
y_pred = gb_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

# Computing precision, recall, fscore, and support values for the predicted results
precision, recall, fscore, support = score(y_test, y_pred, average='macro')

# Printing the precision, recall, and F1-score
print('Macro Average Precision:', precision)
print('Macro Average Recall:', recall)
print('Macro Average F1-score:', fscore)
print('Support:', support)



Macro Average Precision: 0.6618217772834013
Macro Average Recall: 0.6022616824435296
Macro Average F1-score: 0.6258148392294735
Support: None


