# Vectorizing - TD-IDF

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
data = pd.read_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\data_cleaned_features.csv")

test_data = pd.read_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\test_data_cleaned_features.csv")

test_data.head()

Unnamed: 0,tweet_id,therapy,label,cleaned_text,avg_word_length,sia_positive_word_rate,sia_negative_word_rate,neutral_score,stopword_count,body_len,compound_score,punct%,positive_score,negative_score,neutral_score.1
0,1526565065549352974,adderall,neutral,danno6 lunamanokit able quit adderall without ...,4.463415,0.195122,0.0,0.81,88,185,0.5719,4.3,0.19,0.0,0.81
1,1494046188257087493,adderall,neutral,samfuchsie adderall,4.666667,0.333333,0.0,1.0,13,28,0.0,3.6,0.0,0.0,1.0
2,1563293301930807298,adderall,neutral,caslernoel well didnt miss muchyou already kne...,5.348837,0.348837,0.0,0.699,100,231,-0.6435,6.5,0.103,0.198,0.699
3,1500878265543704585,tramadol,neutral,dolor neuropático corrientazos musculares tram...,6.611111,0.388889,0.0,1.0,112,239,0.0,7.1,0.0,0.0,1.0
4,1577193665705160705,cbd,positive,medicine mentalhealthmatters thc cbd ptsd ment...,11.086957,0.478261,0.0,1.0,80,259,0.0,9.7,0.0,0.0,1.0


In [3]:
print(len(data))
print(len(test_data))

3009
753


### Split into train/test

In [None]:
# Splitting the data into training and testing sets

# The 'label' column is used as the target variable (y)
# The rest of the columns except 'tweet_id' & 'therapy' are used as the features (X)
# The test_size parameter is set to 0.2, which means 20% of the data will be used for testing

X_train, X_test, y_train, y_test = train_test_split(data[
    ['tweet_id', 'therapy', 'cleaned_text', 
     'avg_word_length', 'sia_positive_word_rate', 'sia_negative_word_rate', 'neutral_score', 'stopword_count', 
     'body_len', 'compound_score', 'punct%', 'positive_score', 'negative_score', 'neutral_score']
], data['label'], test_size=0.2)

In [4]:
X_train = data[['tweet_id', 'therapy', 'cleaned_text',
               'avg_word_length', 'sia_positive_word_rate', 'sia_negative_word_rate',
               'neutral_score', 'stopword_count', 'body_len', 'compound_score',
               'punct%', 'positive_score', 'negative_score', 'neutral_score']]

X_test = test_data[['tweet_id', 'therapy', 'cleaned_text',
                    'avg_word_length', 'sia_positive_word_rate', 'sia_negative_word_rate',
                    'neutral_score', 'stopword_count', 'body_len', 'compound_score',
                    'punct%', 'positive_score', 'negative_score', 'neutral_score']]

In [5]:
y_train = data['label']
y_test = test_data['label']

### Write out data

In [6]:
X_train.to_csv('C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\train_features.csv', mode='w', index=False)
# X_val.to_csv('C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\val_features.csv', mode='w', index=False)
X_test.to_csv('C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\test_features.csv', mode='w', index=False)

y_train.to_csv('C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\train_labels.csv', mode='w', index=False)
# y_val.to_csv('C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\val_labels.csv', mode='w', index=False)
y_test.to_csv('C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\test_labels.csv', mode='w', index=False)

### TD-IDF Vectorizer

In [7]:
# Creating a TfidfVectorizer object with the analyzer parameter set to the clean_text function
tfidf_vect = TfidfVectorizer()

# Fitting the TfidfVectorizer on the 'text' column of the training set
tfidf_vect_fit = tfidf_vect.fit(X_train['cleaned_text'])

# Transforming the 'text' column of the training and testing sets into TF-IDF features
tfidf_train = tfidf_vect_fit.transform(X_train['cleaned_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['cleaned_text'])

# Concatenating the features columns with the TF-IDF features of the training set
X_train_vect = pd.concat([X_train[
    ['tweet_id', 'avg_word_length', 'sia_positive_word_rate', 'sia_negative_word_rate', 'neutral_score', 
     'stopword_count', 'body_len', 'compound_score', 'punct%', 'positive_score', 'negative_score', 'neutral_score']
].reset_index(drop=True), pd.DataFrame(tfidf_train.toarray())], axis=1)

# Concatenating the features columns with the TF-IDF features of the testing set
X_test_vect = pd.concat([X_test[
    ['tweet_id', 'avg_word_length', 'sia_positive_word_rate', 'sia_negative_word_rate', 'neutral_score', 
     'stopword_count', 'body_len', 'compound_score', 'punct%', 'positive_score', 'negative_score', 'neutral_score']
].reset_index(drop=True), pd.DataFrame(tfidf_test.toarray())], axis=1)

# Displaying the head (first few rows) of the X_train_vect DataFrame
X_train_vect.head()

Unnamed: 0,tweet_id,avg_word_length,sia_positive_word_rate,sia_negative_word_rate,neutral_score,neutral_score.1,stopword_count,body_len,compound_score,punct%,...,11361,11362,11363,11364,11365,11366,11367,11368,11369,11370
0,1454224517895688192,4.692308,0.153846,0.0,1.0,1.0,29,61,0.0,1.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1426258820376842243,6.846154,0.230769,0.0,1.0,1.0,30,89,0.0,10.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1473007602170798082,4.545455,0.136364,0.0,0.571,0.571,43,100,0.6249,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1561156143405502466,4.16,0.08,0.0,0.781,0.781,57,105,-0.4215,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1559923718578741248,4.75,0.375,0.0,1.0,1.0,14,38,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Displaying the head (first few rows) of the X_train_vect DataFrame
X_test_vect.head()

Unnamed: 0,tweet_id,avg_word_length,sia_positive_word_rate,sia_negative_word_rate,neutral_score,neutral_score.1,stopword_count,body_len,compound_score,punct%,...,11361,11362,11363,11364,11365,11366,11367,11368,11369,11370
0,1526565065549352974,4.463415,0.195122,0.0,0.81,0.81,88,185,0.5719,4.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1494046188257087493,4.666667,0.333333,0.0,1.0,1.0,13,28,0.0,3.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1563293301930807298,5.348837,0.348837,0.0,0.699,0.699,100,231,-0.6435,6.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1500878265543704585,6.611111,0.388889,0.0,1.0,1.0,112,239,0.0,7.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1577193665705160705,11.086957,0.478261,0.0,1.0,1.0,80,259,0.0,9.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save vectorized dataframe

In [9]:
X_train_vect.to_csv('C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\X_train_vect.csv', mode='w', index=False)

X_test_vect.to_csv('C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\X_test_vect.csv', mode='w', index=False)

### N-Grams

In [None]:
# Initialize a CountVectorizer object with ngram_range=(2,2) to create bigrams
vectorizer = CountVectorizer(ngram_range=(2, 2))

# Fit and transform the text column of the dataframe
bigrams = vectorizer.fit_transform(data_cleaned_features['cleaned_text'])

# Convert the result to a dataframe
df_bigrams = pd.DataFrame(bigrams.toarray(), columns=vectorizer.get_feature_names_out())

# Display the first few rows of the dataframe
df_bigrams.head()

###  Final features Dataframe

In [None]:
# First, reset the index of all dataframes to ensure they align correctly
X_train_vect.reset_index(drop=True, inplace=True)
df_bigrams.reset_index(drop=True, inplace=True)

# Then, concatenate all the dataframes along axis=1 (i.e., columns)
df_all_features = pd.concat([X_train_vect, df_bigrams], axis=1)

### Save final features dataset

In [None]:
data.to_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\tdidf_ngrams_features.csv", mode='w', index=False)