# Feature Engineering


We will create new features to train the model, gaining more insights from the data

In [1]:
# Importing necessary libraries
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import opinion_lexicon
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Download the nlkt tools
nltk.download('opinion_lexicon')
nltk.download('sentiwordnet')
nltk.download('vader_lexicon')
nltk.download('punkt')

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\danij\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\danij\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\danij\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danij\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Read in data

In [2]:
data = pd.read_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\cleaned_text.csv")

test_data = pd.read_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\test_cleaned_text.csv")

test_data.head()

Unnamed: 0,tweet_id,therapy,text,label,cleaned_text
0,1526565065549352974,adderall,@danno6_ @LunaManokit I was able to quit adder...,neutral,danno6 lunamanokit able quit adderall without ...
1,1494046188257087493,adderall,@samfuchsie me when i do adderall,neutral,samfuchsie adderall
2,1563293301930807298,adderall,"@caslernoel Well' you didn't miss much,you alr...",neutral,caslernoel well didnt miss muchyou already kne...
3,1500878265543704585,tramadol,"Dolor neuropático, corrientazos musculares, tr...",neutral,dolor neuropático corrientazos musculares tram...
4,1577193665705160705,cbd,My Medicine \n#MentalHealthMatters #THC #CBD #...,positive,medicine mentalhealthmatters thc cbd ptsd ment...


### 3.1 Body length

In [3]:
# Applying the 'count' function to the 'text' column and storing the result in 
# a new 'body_len' column. Not counting whitespaces
data['body_len'] = data['text'].apply(lambda x: len(x) - x.count(" "))

test_data['body_len'] = test_data['text'].apply(lambda x: len(x) - x.count(" "))

### 3.2 Count punctuation signs (% of punctuation in the text)

In [4]:
# Function to count the percentage of punctuation characters in a given text
def count_punct(text):
    # Counting the number of punctuation characters in the text
    count = sum([1 for char in text if char in string.punctuation])
    # Calculating the percentage of punctuation characters (excluding spaces) in the text
    return round(count/(len(text) - text.count(" ")), 3) * 100


# Applying the 'count_punct' function to the 'body_text' column and storing the result in 
# a new 'punct%' column
data['punct%'] = data['text'].apply(lambda x: count_punct(x))

test_data['punct%'] = test_data['text'].apply(lambda x: count_punct(x))

### 3.3 Word with associated sentiment weight function

In [5]:
# Create a Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Function to get sentiment intensity
def get_sentiment_intensity(text):
    sentiment = sia.polarity_scores(text)
    return sentiment['compound']

# Apply the function to the text column
data['sentiment_intensity'] = data['cleaned_text'].apply(get_sentiment_intensity)

test_data['sentiment_intensity'] = test_data['cleaned_text'].apply(get_sentiment_intensity)

### 3.4 Words features

In [6]:
# Calculate word count
data['word_count'] = data['text'].apply(lambda x: len(str(x).split()))
test_data['word_count'] = test_data['text'].apply(lambda x: len(str(x).split()))


# Calculate character count
data['char_count'] = data['text'].apply(lambda x: len(str(x)))
test_data['char_count'] = test_data['text'].apply(lambda x: len(str(x)))


# Calculate average word length
data['avg_word_length'] = data['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_data['avg_word_length'] = test_data['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))


# Calculate punctuation count
data['punctuation_count'] = data['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
test_data['punctuation_count'] = test_data['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))


# Calculate hashtag count
data['hashtag_count'] = data['text'].apply(lambda x: len([c for c in str(x) if c == '#']))
test_data['hashtag_count'] = test_data['text'].apply(lambda x: len([c for c in str(x) if c == '#']))


# Stopword Count
stopwords = nltk.corpus.stopwords.words('english')
data['stopword_count'] = data['text'].apply(lambda x: len([word for word in x if word in stopwords]))
test_data['stopword_count'] = test_data['text'].apply(lambda x: len([word for word in x if word in stopwords]))

### 3.5 Count positive and negative sentiment words

In [7]:
# Count of Positive Words
positive_words = [word for word, score in sia.lexicon.items() if score > 0]
data['sia_positive_word_count'] = data['cleaned_text'].apply(lambda x: len([word for word in x if word in positive_words]))
test_data['sia_positive_word_count'] = test_data['cleaned_text'].apply(lambda x: len([word for word in x if word in positive_words]))


# Count of Negative Words
negative_words = [word for word, score in sia.lexicon.items() if score < 0]
data['sia_negative_word_count'] = data['cleaned_text'].apply(lambda x: len([word for word in x if word in negative_words]))
test_data['sia_negative_word_count'] = test_data['cleaned_text'].apply(lambda x: len([word for word in x if word in negative_words]))


# Positive Word Rate
data['sia_positive_word_rate'] = data['sia_positive_word_count'] / data['word_count']
test_data['sia_positive_word_rate'] = test_data['sia_positive_word_count'] / test_data['word_count']


# Negative Word Rate
data['sia_negative_word_rate'] = data['sia_negative_word_count'] / data['word_count']
test_data['sia_negative_word_rate'] = test_data['sia_negative_word_count'] / test_data['word_count']

### 3.6 More sentiment score features

In [8]:
# Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Sentiment Scores
sentiment_scores = data['cleaned_text'].apply(lambda x: sia.polarity_scores(x))
test_sentiment_scores = test_data['cleaned_text'].apply(lambda x: sia.polarity_scores(x))


# Positive Sentiment Score
data['positive_score'] = sentiment_scores.apply(lambda x: x['pos'])
test_data['positive_score'] = test_sentiment_scores.apply(lambda x: x['pos'])


# Negative Sentiment Score
data['negative_score'] = sentiment_scores.apply(lambda x: x['neg'])
test_data['negative_score'] = test_sentiment_scores.apply(lambda x: x['neg'])


# Neutral Sentiment Score
data['neutral_score'] = sentiment_scores.apply(lambda x: x['neu'])
test_data['neutral_score'] = test_sentiment_scores.apply(lambda x: x['neu'])


# Compound Sentiment Score
data['compound_score'] = sentiment_scores.apply(lambda x: x['compound'])
test_data['compound_score'] = test_sentiment_scores.apply(lambda x: x['compound'])

### 3.7 Count of sentiment expressions

In [9]:
# Count of Laughing Expressions
laugh_expressions = ['haha', 'hehe', 'lol']
data['laugh_count'] = data['text'].apply(lambda x: sum([x.lower().count(expr) for expr in laugh_expressions]))


# Count of Sad Expressions
sad_expressions = [':(', ':-(', ';(', ';-(']
data['sad_count'] = data['text'].apply(lambda x: sum([x.count(expr) for expr in sad_expressions]))

### 3.8 Vader sentiment features

In [10]:
# Initialize the VADER sentiment intensity analyzer
sid = SentimentIntensityAnalyzer()

# Define a function to calculate VADER sentiment scores
def get_vader_scores(text):
    scores = sid.polarity_scores(text)
    return scores

# Calculate VADER sentiment scores for each tweet
vader_scores = data['cleaned_text'].apply(get_vader_scores)

# Extract compound score for each tweet
data['compound_Vscore'] = vader_scores.apply(lambda x: x['compound'])

# Extract negative score for each tweet
data['negative_Vscore'] = vader_scores.apply(lambda x: x['neg'])

# Extract neutral score for each tweet
data['neutral_Vscore'] = vader_scores.apply(lambda x: x['neu'])

# Extract positive score for each tweet
data['positive_Vscore'] = vader_scores.apply(lambda x: x['pos'])

### Showing new features

In [12]:
# Show all columns
pd.set_option('display.max_columns', None)

# Assuming 'column_to_drop' is the name of the column you want to drop
# data = data.drop('sentiment_scores', axis=1)

data.head()

Unnamed: 0,tweet_id,therapy,text,label,cleaned_text,body_len,punct%,sentiment_intensity,word_count,char_count,avg_word_length,punctuation_count,hashtag_count,stopword_count,sia_positive_word_count,sia_negative_word_count,sia_positive_word_rate,sia_negative_word_rate,positive_score,negative_score,neutral_score,compound_score,laugh_count,sad_count,compound_Vscore,negative_Vscore,neutral_Vscore,positive_Vscore
0,1454224517895688192,adderall,wait until i get an adderall prescription. im...,neutral,wait get adderall prescription imma time every...,61,1.6,0.0,13,74,4.692308,1,0,29,2,0,0.153846,0.0,0.0,0.0,1.0,0.0,0,0,0.0,0.0,1.0,0.0
1,1426258820376842243,oxycodone,"@Sassychickie @kelly_rdc Fentanyl, OxyContin a...",negative,sassychickie kellyrdc fentanyl oxycontin oxyco...,89,10.1,0.0,13,101,6.846154,9,0,30,3,0,0.230769,0.0,0.0,0.0,1.0,0.0,0,0,0.0,0.0,1.0,0.0
2,1473007602170798082,cbd,a fun juggling act of mine is taking adderall ...,neutral,fun juggling act mine taking adderall drinking...,100,1.0,0.6249,22,121,4.545455,1,0,43,3,0,0.136364,0.0,0.331,0.097,0.571,0.6249,0,0,0.6249,0.097,0.571,0.331
3,1561156143405502466,percocet,percocet roxycodone with some xanax that i had...,neutral,percocet roxycodone xanax crushed dust elevate...,105,0.0,-0.4215,25,128,4.16,0,0,57,2,0,0.08,0.0,0.0,0.219,0.781,-0.4215,0,0,-0.4215,0.219,0.781,0.0
4,1559923718578741248,adderall,first day of adderall and i feel 😵‍💫😵‍💫😵‍💫😵‍💫,negative,first day adderall feel,38,0.0,0.0,8,45,4.75,0,0,14,3,0,0.375,0.0,0.0,0.0,1.0,0.0,0,0,0.0,0.0,1.0,0.0


In [13]:
# Store this dataframe with all of the features
data.to_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\all_features.csv", index=False)
test_data.to_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\test_all_features.csv", index=False)

### Check feature importance to select the best features

We will use a RandomForestClassifier in order to evaluate all of the features created and keep the most relevant ones

In [14]:
# Define the feature columns and the target column
feature_columns = ['body_len', 'punct%', 'sentiment_intensity', 'word_count', 'char_count', 'avg_word_length',  
                   'punctuation_count', 'hashtag_count', 'stopword_count', 'sia_positive_word_count', 
                   'sia_negative_word_count', 'sia_positive_word_rate', 'sia_negative_word_rate', 
                   'positive_score', 'negative_score', 'neutral_score', 'compound_score', 'laugh_count', 'sad_count', 
                   'compound_Vscore', 'negative_Vscore', 'neutral_Vscore', 'positive_Vscore']


target_column = 'label'

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(data[feature_columns], data[target_column], test_size=0.2, random_state=42)

# Initialize a Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Predict the sentiment labels on the test data
y_pred = rf.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

# Extract the feature importance
feature_importance = rf.feature_importances_
feature_importance

              precision    recall  f1-score   support

    negative       0.11      0.02      0.03        62
     neutral       0.70      0.96      0.81       412
    positive       0.60      0.14      0.23       128

    accuracy                           0.69       602
   macro avg       0.47      0.37      0.36       602
weighted avg       0.62      0.69      0.61       602



array([6.32644978e-02, 5.71024926e-02, 4.98817390e-02, 5.23324498e-02,
       6.26885927e-02, 7.41319807e-02, 4.68043458e-02, 1.26514158e-02,
       6.36520698e-02, 4.27255146e-02, 0.00000000e+00, 6.77157455e-02,
       0.00000000e+00, 4.66330638e-02, 4.33230458e-02, 6.35204675e-02,
       4.83512901e-02, 2.74105979e-03, 7.57537030e-05, 4.84537447e-02,
       4.29952935e-02, 6.51348532e-02, 4.58205837e-02])

In [15]:
# Create a DataFrame with the feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': feature_columns, 'Importance': feature_importance})

# Sort the DataFrame by importance score in descending order
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

feature_importance_df

Unnamed: 0,Feature,Importance
5,avg_word_length,0.074132
11,sia_positive_word_rate,0.067716
21,neutral_Vscore,0.065135
8,stopword_count,0.063652
15,neutral_score,0.06352
0,body_len,0.063264
4,char_count,0.062689
1,punct%,0.057102
3,word_count,0.052332
2,sentiment_intensity,0.049882


### Different way of evaluating features

To get a more stable estimate of feature importance, you can train the Random Forest model multiple times with different random states, and then average the feature importances over all iterations.

This will give you a more stable estimate of feature importance that is less likely to be affected by the randomness of the Random Forest model.

In [16]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Initialize an array to store the feature importances
feature_importances = np.zeros(len(feature_columns))

# Number of iterations
n_iterations = 200

# Train the model multiple times with different random states
for i in range(n_iterations):
    rf = RandomForestClassifier(n_estimators=100, random_state=i)
    rf.fit(X_train, y_train)
    feature_importances += rf.feature_importances_

# Average the feature importances
feature_importances /= n_iterations

# Create a DataFrame with the feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': feature_columns, 'Importance': feature_importances})

# Sort the DataFrame by importance score in descending order
feature_importance_df_2 = feature_importance_df.sort_values('Importance', ascending=False)

feature_importance_df_2

Unnamed: 0,Feature,Importance
5,avg_word_length,0.075084
11,sia_positive_word_rate,0.066511
15,neutral_score,0.063762
21,neutral_Vscore,0.063691
4,char_count,0.063267
8,stopword_count,0.063107
0,body_len,0.062992
1,punct%,0.05789
3,word_count,0.052507
16,compound_score,0.048799


As for taking into account the correlation between features, one method you can use is permutation importance. Permutation importance works by randomly shuffling a single feature and measuring how much the performance of the model decreases. This gives an estimate of how important the feature is.

This will give you an estimate of feature importance that takes into account the correlation between features. However, please note that permutation importance can be more computationally intensive than the default feature importance from Random Forest, especially for large datasets.

In [17]:
from sklearn.inspection import permutation_importance

# Train the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Compute permutation importance
result = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42)

# Create a DataFrame with the feature names and their importance scores
permutation_importance_df = pd.DataFrame({'Feature': feature_columns, 'Importance': result.importances_mean})

# Sort the DataFrame by importance score in descending order
permutation_importance_df = permutation_importance_df.sort_values('Importance', ascending=False)

permutation_importance_df

Unnamed: 0,Feature,Importance
11,sia_positive_word_rate,0.005814
8,stopword_count,0.004153
9,sia_positive_word_count,0.003488
6,punctuation_count,0.003322
17,laugh_count,0.002824
7,hashtag_count,0.001329
0,body_len,0.000664
10,sia_negative_word_count,0.0
12,sia_negative_word_rate,0.0
18,sad_count,0.0


### What features will we keep?

Based on the results, it seems that the features `sia_negative_word_rate`, `sia_negative_word_count`, `laugh_count`, `sad_count` consistently have zero or near-zero importance across all iterations. This suggests that these features do not contribute much to the model's predictions and could potentially be dropped.

On the other hand, features like `avg_word_length`, `sia_positive_word_rate`, `stopword_count`, `char_count`, `body_len`, `neutral_score`, `neutral_Vscore`, `punct%`, `word_count`, `sentiment_intensity`, `compound_score`, `compound_Vscore`, `positive_Vscore`, `positive_score`, `sia_positive_word_count`, `punctuation_count`, `negative_Vscore`, `negative_score` consistently have higher importance values, suggesting that they are important for the model's predictions.

However, it's important to note that feature importance doesn't tell the whole story. Even features with low importance could potentially be useful when combined with other features. Also, correlation between features can affect the importance values. 

A good next step could be to perform a more systematic feature selection process, such as recursive feature elimination or forward selection, to find the optimal set of features. 

Also, remember that these importance values are specific to the model (Random Forest in this case) and the specific data you're working with. If you plan to use a different model or if your data changes, the feature importance could change as well. 

### New dataframe just with selected features

In [18]:
data.columns

Index(['tweet_id', 'therapy', 'text', 'label', 'cleaned_text', 'body_len',
       'punct%', 'sentiment_intensity', 'word_count', 'char_count',
       'avg_word_length', 'punctuation_count', 'hashtag_count',
       'stopword_count', 'sia_positive_word_count', 'sia_negative_word_count',
       'sia_positive_word_rate', 'sia_negative_word_rate', 'positive_score',
       'negative_score', 'neutral_score', 'compound_score', 'laugh_count',
       'sad_count', 'compound_Vscore', 'negative_Vscore', 'neutral_Vscore',
       'positive_Vscore'],
      dtype='object')

In [19]:
# Dataframe of the columns you want to keep
data_cleaned_features = data[
    ['tweet_id', 'therapy', 'label', 'cleaned_text', 
     'avg_word_length', 'sia_positive_word_rate', 'sia_negative_word_rate', 'neutral_score', 'stopword_count', 
     'body_len', 'compound_score', 'punct%', 'positive_score', 'negative_score', 'neutral_score']]


test_data_cleaned_features = test_data[
    ['tweet_id', 'therapy', 'label', 'cleaned_text', 
     'avg_word_length', 'sia_positive_word_rate', 'sia_negative_word_rate', 'neutral_score', 'stopword_count', 
     'body_len', 'compound_score', 'punct%', 'positive_score', 'negative_score', 'neutral_score']]

In [20]:
print(len(data_cleaned_features))
print(len(test_data_cleaned_features))

3009
753


In [21]:
test_data_cleaned_features.head()

Unnamed: 0,tweet_id,therapy,label,cleaned_text,avg_word_length,sia_positive_word_rate,sia_negative_word_rate,neutral_score,stopword_count,body_len,compound_score,punct%,positive_score,negative_score,neutral_score.1
0,1526565065549352974,adderall,neutral,danno6 lunamanokit able quit adderall without ...,4.463415,0.195122,0.0,0.81,88,185,0.5719,4.3,0.19,0.0,0.81
1,1494046188257087493,adderall,neutral,samfuchsie adderall,4.666667,0.333333,0.0,1.0,13,28,0.0,3.6,0.0,0.0,1.0
2,1563293301930807298,adderall,neutral,caslernoel well didnt miss muchyou already kne...,5.348837,0.348837,0.0,0.699,100,231,-0.6435,6.5,0.103,0.198,0.699
3,1500878265543704585,tramadol,neutral,dolor neuropático corrientazos musculares tram...,6.611111,0.388889,0.0,1.0,112,239,0.0,7.1,0.0,0.0,1.0
4,1577193665705160705,cbd,positive,medicine mentalhealthmatters thc cbd ptsd ment...,11.086957,0.478261,0.0,1.0,80,259,0.0,9.7,0.0,0.0,1.0


### Write out the new dataframe with all features

In [22]:
# Write it to our data files
data_cleaned_features.to_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\data_cleaned_features.csv", mode='w', index=False)

test_data_cleaned_features.to_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\test_data_cleaned_features.csv", mode='w', index=False)