# Preprocessing

**GOAL**: Clean, transform and extract (new) representative features to improve the sentiment classification of each text (issue).

## Possible update:
1. Use prototypes

In [9]:
import sys
import os
import import_ipynb

# Add the folder containing "files.ipynb" to the Python path
sys.path.append(os.path.abspath("Utils"))
# Import the files notebook
import Utils.file_manager as fm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Random seed. NOTE(review): not referenced in the cells shown here — confirm it is used downstream.
SEED = 48
# Width given to the tok2vec model and length of every sentence-embedding row
DIM_PRE_EMBEDDING = 200
# NOTE(review): not referenced in the cells shown here; presumably the folder used by Utils.file_manager — confirm.
ARRAY_DIRECTORY_PATH = "./Array_After_Preprocessing"

## Dataset Description

This dataset contains text comments from technical discussions on platforms like Stack Overflow and GitHub. Each entry includes an identifier, a sentiment polarity label, the comment text, and its source.

In [10]:
# Load the labelled training set
df = pd.read_csv("./Datasets/train.csv")

In [11]:
# Summary statistics for every column of the training set
df.describe()

Unnamed: 0,id,polarity,text,source
count,8082,8082,8082,8082
unique,8082,3,8030,2
top,t1,neutral,"Excellent, thanks!",github
freq,1,3301,7,4985


In [12]:
# Preview the first training rows
df.head()

Unnamed: 0,id,polarity,text,source
0,t1,negative,"Vineet, what you are trying to do is a terribl...",stackoverflow
1,t2,positive,"'Course I do, corrected.",stackoverflow
2,t3,positive,"Excellent, happy to help! If you don't mind, c...",stackoverflow
3,t6,negative,@talnicolas I'm using it a few dozen times in ...,stackoverflow
4,t7,neutral,I didn't select an answer because even though ...,stackoverflow


In [13]:
# Load the public test set
df_test_public = pd.read_csv("./Datasets/test_public.csv")

In [14]:
# Summary statistics for every column of the public test set
df_test_public.describe()

Unnamed: 0,id,text,source
count,3463,3463,3463
unique,3463,3454,2
top,t4,what about 3rd question?( When is it appropria...,github
freq,1,2,2137


In [15]:
# Preview the first test rows
df_test_public.head()

Unnamed: 0,id,text,source
0,t4,@DrabJay: excellent suggestion! Code changed. :-),stackoverflow
1,t5,Any decent browser should protect against mali...,stackoverflow
2,t8,I swear - I don't put pseudo code I get told o...,stackoverflow
3,t9,I have attached below,stackoverflow
4,t13,When I refactor the following line: using Resh...,stackoverflow


## Preprocessing 1

In [16]:
import re

from sklearn.preprocessing import OneHotEncoder

### Replace URL & USER

In [17]:
# Regex pattern to match URLs: an http:// or https:// link, or a bare "www." host,
# extended up to the next whitespace character
url_pattern = re.compile(r'https?://\S+|www\.\S+')

# Regex pattern to match @username: '@' followed by one or more word characters
username_pattern = re.compile(r'@\w+')

In [18]:
# Replace URLs first and @usernames second with placeholder tokens.
# This is done on the whole column at once: the previous row-by-row loop wrote
# through df.loc[i, "text"] using a positional counter as a label, which only
# hits the right rows when the index is the default 0..n-1 range.
df["text"] = (
    df["text"]
    .str.replace(url_pattern, "_URL_", regex=True)
    .str.replace(username_pattern, "_USER_", regex=True)
)

In [19]:
# Replace URLs first and @usernames second with placeholder tokens.
# This is done on the whole column at once: the previous row-by-row loop wrote
# through df_test_public.loc[i, "text"] using a positional counter as a label,
# which only hits the right rows when the index is the default 0..n-1 range.
df_test_public["text"] = (
    df_test_public["text"]
    .str.replace(url_pattern, "_URL_", regex=True)
    .str.replace(username_pattern, "_USER_", regex=True)
)

### Dataset '**train**'

In [20]:
# One-hot encode the 'source' column of the training set as a dense array
X_source = OneHotEncoder().fit_transform(df[["source"]]).toarray()

### Dataset '**test_public**'

In [21]:
# Encode the test 'source' column with the categories learned from the TRAINING
# set, so that the one-hot columns have the same meaning and order as in the
# training matrix. Fitting a separate encoder on the test set would silently
# change the column layout whenever the two sets do not contain exactly the
# same sources. Sources unseen during training are encoded as all zeros.
X_TP_source = (
    OneHotEncoder(handle_unknown="ignore")
    .fit(df[["source"]])
    .transform(df_test_public[["source"]])
    .toarray()
)

## Preprocessing 2 - extract EMBEDDINGs

#### Training an embedding

In [22]:
import spacy
from spacy.training import Example

import re
from nltk.corpus import stopwords

In [23]:
# 1. Initialize a blank English pipeline (components are added to it in the next cells)
nlp = spacy.blank("en")

WARNING!!!!!!!!!!!!!!!!!!!!  
The block below takes time to execute (7 min), so you can run it the first time and then import the trained model with the next block

In [24]:

# 2. Add the 'tok2vec' component, with output width DIM_PRE_EMBEDDING
# NOTE(review): re-running this cell on the same `nlp` object presumably fails,
# because a component named "tok2vec" is already in the pipeline — confirm.
tok2vec = nlp.add_pipe("tok2vec", config={'model': {'width': DIM_PRE_EMBEDDING}})

# Convert every training text to spaCy's Example format.
# The annotation dict is empty: the examples carry no labels.
examples = []
for entry in df.loc[:, "text"]:
    doc = nlp.make_doc(entry)
    example = Example.from_dict(doc, {})
    examples.append(example)

# 3. Start the training loop
# NOTE(review): no other component consumes the tok2vec output and the examples
# have no annotations, so it is unclear which loss drives these updates —
# confirm that the weights actually change (the `losses` dict is never inspected).
# NOTE(review): `begin_training` looks like the spaCy v2 name of `initialize` —
# confirm against the installed spaCy version.
with nlp.select_pipes(enable=["tok2vec"]):  # Train only 'tok2vec'
    optimizer = nlp.begin_training()
    for epoch in range(10):  # Train for 10 epochs
        losses = {}
        # One update call per example (batch size 1)
        for example in examples:
            nlp.update([example], sgd=optimizer, losses=losses)
        print(f"Epoch {epoch + 1}")

# Save the trained component on disk
tok2vec.to_disk("./Embedding_model_trained")

Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10


In [None]:
# AFTER THE FIRST RUN (when the training cell is skipped) uncomment the next line,
# so that the `tok2vec` component exists before its weights are loaded
# tok2vec = nlp.add_pipe("tok2vec", config={'model': {'width': DIM_PRE_EMBEDDING}})

# Import the trained component from disk
tok2vec.from_disk("./Embedding_model_trained")

<spacy.pipeline.tok2vec.Tok2Vec at 0x24af7182db0>

In [None]:

# Import the trained component from disk
# NOTE(review): this repeats the load done in the previous cell — one of the two is probably redundant.
tok2vec.from_disk("./Embedding_model_trained")

In [27]:
# Build one unit-length sentence embedding per text from the pipeline's token vectors

def extract_embedding(texts_feature, len_embedding = DIM_PRE_EMBEDDING):
    """Return an L2-normalised sentence embedding for every text.

    Each text is run through the global ``nlp`` pipeline, the vectors of its
    tokens are summed, and the sum is scaled to unit Euclidean length.

    Parameters
    ----------
    texts_feature : sequence of str
        Texts to embed (e.g. the ``text`` column of a DataFrame).
    len_embedding : int
        Length of one token vector; it must match the vectors produced by ``nlp``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(texts_feature), len_embedding)``. A row whose
        summed vector has zero norm (e.g. an empty text) is left as all zeros
        instead of being turned into NaN by a division by zero.
    """
    X_embedded = np.zeros((len(texts_feature), len_embedding))

    for i, sentence in enumerate(texts_feature):

        if i%500 == 0:
            print(f"row {i} in execution")

        doc = nlp(sentence)

        # TODO: handle stop words, negations, etc.
        # Sum the token embeddings to obtain the sentence embedding
        for token in doc:
            X_embedded[i, :] += token.vector

        # Scale to unit length; skip rows with zero norm to avoid 0/0 -> NaN
        norm = np.linalg.norm(X_embedded[i, :])
        if norm > 0:
            X_embedded[i, :] /= norm

    return X_embedded

### Dataset '**train**'
#### Get embedding for 'text' column

WARNING!!!!!!!!!!!!!!!!!!!!  
After the first run, you can **skip** the **next 2 blocks**.

In [28]:
# Sentence embeddings of the training texts (one row per text)
X_text_embedded = extract_embedding(df.loc[:, "text"])

row 0 in execution
row 500 in execution
row 1000 in execution
row 1500 in execution
row 2000 in execution
row 2500 in execution
row 3000 in execution
row 3500 in execution
row 4000 in execution
row 4500 in execution
row 5000 in execution
row 5500 in execution
row 6000 in execution
row 6500 in execution
row 7000 in execution
row 7500 in execution
row 8000 in execution


In [54]:
# Merge all the columns of the dataset after the transformations:
# text embedding followed by the one-hot encoded source
X_embedded = np.concatenate([X_text_embedded, X_source], axis=1)

# Save the array on disk
fm.save_array(X_embedded, "X_embedded")

In [30]:
# AFTER FIRST RUN: import the embedded data from disk instead of recomputing it
X_embedded = fm.import_array("X_embedded")

### Dataset '**test_public**'
#### Get embedding for 'text' column

WARNING!!!!!!!!!!!!!!!!!!!!  
After the first run, you can **skip** the **next 2 blocks**.

In [None]:
# Sentence embeddings of the public test texts (one row per text)
X_TP_text_embedded = extract_embedding(df_test_public.loc[:, "text"])

row 0 in execution
row 500 in execution
row 1000 in execution
row 1500 in execution
row 2000 in execution
row 2500 in execution
row 3000 in execution


In [None]:
# Merge all the columns of the dataset after the transformations:
# text embedding followed by the one-hot encoded source
X_TP_embedded = np.concatenate([X_TP_text_embedded, X_TP_source], axis=1)

# Save the array on disk
fm.save_array(X_TP_embedded, "X_TP_embedded")

In [None]:
# AFTER FIRST RUN: import the embedded data from disk instead of recomputing it
X_TP_embedded = fm.import_array("X_TP_embedded")

## Preprocessing 3 - extract NEW FEATUREs

In [34]:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from nltk import bigrams

# Ensure NLTK stopwords are downloaded
import nltk

In [35]:
def extract_feature(df):
    """Compute hand-crafted text features for every text in ``df``.

    Parameters
    ----------
    df : iterable of str
        The texts to analyse (despite the name, this is the text column,
        not a whole DataFrame).

    Returns
    -------
    pandas.DataFrame
        One row per text with the columns ``word_count``, ``char_count``,
        ``punct_count``, ``uppercase_word_count``, ``avg_word_length``,
        ``stopword_ratio``, ``noun_ratio``, ``verb_ratio``,
        ``adjective_ratio``, ``negation_count``, ``sentiment_score``,
        ``polarity_word_ratio``, ``positive_emoticon_count``,
        ``neutral_emoticon_count`` and ``negative_emoticon_count``.

    Notes
    -----
    Relies on the global ``nlp`` pipeline. A text that yields no tokens gets
    0 for every ratio and for the average word length instead of raising a
    ZeroDivisionError.
    """
    # Emoticon patterns.
    # The positive pattern previously ended with an empty alternative
    # ("...|P|)"), which made every bare ':', ';' or '=' count as a positive
    # emoticon; the mouth character is now mandatory.
    positive_emoticon_pattern = re.compile(r"""(:|;|=)(-)?(\)|D|P)|<3""", re.VERBOSE)
    neutral_emoticon_pattern = re.compile(r"""(:)(-)?(O|/)""", re.VERBOSE)
    negative_emoticon_pattern = re.compile(r"""(:|=)(-|')?(\()""", re.VERBOSE)

    features = []
    stop_words = set(stopwords.words("english"))

    for i, text in enumerate(df):

        if i%500 == 0:
            print(f"row {i} in execution") 

        # Basic Text Features
        doc = nlp(text)

        word_count = len(doc)
        # Denominator for the ratios below; avoids dividing by zero on empty texts
        ratio_denominator = max(word_count, 1)
        char_count = len(text)
        punct_count = sum(1 for char in text if char in "!?.")
        # NOTE(review): the "_URL_" and "_USER_" placeholders satisfy str.isupper()
        # and are therefore counted here — confirm this is intended.
        uppercase_word_count = sum(1 for word in text.split() if word.isupper())

        # Lexical Features
        avg_word_length = np.mean([len(token) for token in doc]) if word_count else 0.0
        # Compare the lower-cased token TEXT with the stop-word strings: testing
        # the Token object itself against a set of str never matched, so the
        # ratio was always 0. Reusing `doc` also avoids a second pipeline run.
        stopword_ratio = sum(1 for token in doc if token.lower_ in stop_words) / ratio_denominator

        # Syntactic Features
        # NOTE(review): these rely on POS tags and dependency labels; if `nlp` is
        # the blank pipeline with only 'tok2vec', no component sets them and all
        # four features are presumably constant 0 — confirm, or use a pipeline
        # with a tagger and a parser.
        pos_counts = doc.count_by(spacy.attrs.POS)
        noun_ratio = pos_counts.get(nlp.vocab.strings["NOUN"], 0) / ratio_denominator
        verb_ratio = pos_counts.get(nlp.vocab.strings["VERB"], 0) / ratio_denominator
        adjective_ratio = pos_counts.get(nlp.vocab.strings["ADJ"], 0) / ratio_denominator
        negation_count = sum(1 for token in doc if token.dep_ == "neg")

        # Sentiment-Specific Features
        blob = TextBlob(text)
        sentiment_score = blob.sentiment.polarity
        # Share of words with a non-zero polarity (positive or negative);
        # each word is scored once instead of twice
        polarity_word_ratio = sum(
            1 for word in blob.words if TextBlob(word).sentiment.polarity != 0
        ) / ratio_denominator

        # Emoticon Features
        positive_emoticon_count = len( positive_emoticon_pattern.findall(text) )
        neutral_emoticon_count = len( neutral_emoticon_pattern.findall(text) )
        negative_emoticon_count = len( negative_emoticon_pattern.findall(text) )

        features.append([
            word_count, char_count, punct_count, uppercase_word_count,
            avg_word_length, stopword_ratio,
            noun_ratio, verb_ratio, adjective_ratio, negation_count,
            sentiment_score, polarity_word_ratio,
            positive_emoticon_count, neutral_emoticon_count, negative_emoticon_count
        ])

    # Column names, in the same order as the values appended above
    feature_columns = [
        "word_count", "char_count", "punct_count", "uppercase_word_count",
        "avg_word_length", "stopword_ratio",
        "noun_ratio", "verb_ratio", "adjective_ratio", "negation_count",
        "sentiment_score", "polarity_word_ratio",
        "positive_emoticon_count", "neutral_emoticon_count", "negative_emoticon_count"
        ]
    return pd.DataFrame(features, columns=feature_columns)
    
        

### Dataset '**train**'
#### Get new features from 'text' column

WARNING!!!!!!!!!!!!!!!!!!!!  
After the first run, you can **skip** the **next block**.

In [36]:
# Compute the hand-crafted features of the training texts and save them on disk as an array
X_new_features =  extract_feature(df.loc[:, "text"])
fm.save_array(np.asarray(X_new_features), "X_new_features")

row 0 in execution
row 500 in execution
row 1000 in execution
row 1500 in execution
row 2000 in execution
row 2500 in execution
row 3000 in execution
row 3500 in execution
row 4000 in execution
row 4500 in execution
row 5000 in execution
row 5500 in execution
row 6000 in execution
row 6500 in execution
row 7000 in execution
row 7500 in execution
row 8000 in execution


In [37]:
# AFTER FIRST RUN: import the new features from disk instead of recomputing them
X_new_features = fm.import_array("X_new_features")

### Dataset '**test_public**'
#### Get new features from 'text' column

WARNING!!!!!!!!!!!!!!!!!!!!  
After the first run, you can **skip** the **next block**.

In [38]:
# Compute the hand-crafted features of the test texts and save them on disk as an array
X_TP_new_feature =  extract_feature(df_test_public.loc[:, "text"])
fm.save_array(np.asarray(X_TP_new_feature), "X_TP_new_feature")

row 0 in execution
row 500 in execution
row 1000 in execution
row 1500 in execution
row 2000 in execution
row 2500 in execution
row 3000 in execution


In [39]:
# AFTER FIRST RUN: import the new features from disk instead of recomputing them
X_TP_new_feature = fm.import_array("X_TP_new_feature")

## Preprocessing 4 - extract BIGRAMs

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

### Dataset '**train**'
#### Get the most frequent bigrams from 'text' column

In [41]:
def extract_bigram_most_frequent(df, min_tfidf=1):
    """Return the TF-IDF columns of the bigrams with the highest total weight.

    A bigram ``TfidfVectorizer`` (English stop words removed) is fitted on the
    given texts; a bigram is kept when the sum of its TF-IDF values over all
    texts is at least ``min_tfidf``.

    Parameters
    ----------
    df : iterable of str
        The texts to vectorise (despite the name, this is the text column).
    min_tfidf : float
        Minimum column total required to keep a bigram.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Dense matrix of shape ``(n_texts, n_kept_bigrams)`` and the array of
        the kept bigram names, in the same column order.
    """
    # Initialize TfidfVectorizer with ngram_range for bigrams
    vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words='english')

    # Fit the vectorizer on the documents and transform them into a sparse TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(df)

    # Bigram names, aligned with the matrix columns
    name_bigram = vectorizer.get_feature_names_out()

    # Column totals are computed on the sparse matrix, and only the selected
    # columns are converted to dense: densifying the full texts-by-bigrams
    # matrix first needs memory for every bigram in the corpus.
    total_tfidf = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    mask = total_tfidf >= min_tfidf

    X_bigram = tfidf_matrix[:, np.flatnonzero(mask)].toarray()

    return X_bigram, name_bigram[mask]

WARNING!!!!!!!!!!!!!!!!!!!!  
After the first run, you can **skip** the **next 3 blocks**.

In [42]:
# Keep the training bigrams whose TF-IDF values sum to at least 3 over all texts
X_bigram, name_bigram = extract_bigram_most_frequent(df.text, min_tfidf=3)

In [43]:
# WARNING: the bigrams 'ah sorry' and 'holy crap' are NOT in the test dataset
# (see the train/test comparison further down), so their columns are removed.
df_X_bigram = (
    pd.DataFrame(X_bigram, columns=name_bigram)
    .drop(columns=['ah sorry', 'holy crap'])
)

df_X_bigram.describe()

Unnamed: 0,_user_ _url_,_user_ _user_,_user_ excellent,_user_ thanks,_user_ think,_user_ true,_user_ worries,accepted answer,asp net,awesome thanks,...,ve got,ve seen,ve tried,ve using,visual studio,want use,won work,works fine,works great,wow great
count,8082.0,8082.0,8082.0,8082.0,8082.0,8082.0,8082.0,8082.0,8082.0,8082.0,...,8082.0,8082.0,8082.0,8082.0,8082.0,8082.0,8082.0,8082.0,8082.0,8082.0
mean,0.000409,0.000675,0.000795,0.000716,0.000445,0.000402,0.001427,0.000403,0.000573,0.000788,...,0.000535,0.000488,0.000456,0.000428,0.000671,0.000509,0.000384,0.00081,0.000748,0.000408
std,0.014743,0.015178,0.018016,0.019176,0.013019,0.017263,0.036536,0.012188,0.012561,0.024865,...,0.010926,0.010571,0.010523,0.010957,0.012532,0.011506,0.011639,0.019648,0.020546,0.015522
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,0.680705,0.658924,1.0,0.732557,1.0,1.0,0.508321,0.596027,1.0,...,0.504354,0.35216,0.521632,0.420555,0.459744,0.376869,0.628213,1.0,1.0,0.734789


In [44]:
# Save the training bigram matrix (after the column drop above) on disk
fm.save_array(np.asarray(df_X_bigram), "X_bigram")

In [45]:
# AFTER FIRST RUN: import the frequent bigrams from disk.
# This rebinds X_bigram to the saved array, i.e. the version without the dropped columns.
X_bigram = fm.import_array("X_bigram")

### Dataset '**test_public**'
#### Get the most frequent bigrams from 'text' column

In [47]:
def find_bigram_most_frequent(df, name_highest_bigram):
    """Return the TF-IDF columns of ``df`` for the bigrams in ``name_highest_bigram``.

    A new bigram ``TfidfVectorizer`` (English stop words removed) is fitted on
    the given texts, and only the columns whose bigram also appears in
    ``name_highest_bigram`` are returned.

    Parameters
    ----------
    df : iterable of str
        The texts to vectorise (despite the name, this is the text column).
    name_highest_bigram : iterable of str
        Bigram names to look for (e.g. those selected on the training set).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Dense matrix of shape ``(n_texts, n_found_bigrams)`` and the array of
        the found bigram names, in the vectorizer's column order. Requested
        bigrams that never occur in ``df`` are absent from both.

    Notes
    -----
    NOTE(review): because the vectorizer is re-fitted here, the IDF weights and
    the row normalisation come from ``df`` itself, not from the data on which
    ``name_highest_bigram`` was selected — confirm that values computed this
    way are comparable with the training matrix.
    """
    # Initialize TfidfVectorizer with ngram_range for bigrams
    vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words='english')

    # Fit the vectorizer on the documents and transform them into a sparse TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(df)

    # Bigram names, aligned with the matrix columns
    name_bigram = vectorizer.get_feature_names_out()

    # Set membership is O(1) per bigram; testing `b in <numpy array>` for every
    # bigram scanned the whole array each time.
    wanted_bigrams = set(name_highest_bigram)
    mask = np.fromiter(
        (b in wanted_bigrams for b in name_bigram),
        dtype=bool,
        count=len(name_bigram),
    )

    # Select the columns while still sparse, then densify only those
    X_bigram = tfidf_matrix[:, np.flatnonzero(mask)].toarray()

    return X_bigram, name_bigram[mask]

WARNING!!!!!!!!!!!!!!!!!!!!  
After the first run, you can **skip** the **next block**.

In [48]:
# TF-IDF columns of the test texts for the bigrams selected on the training set, saved on disk
X_TP_bigram, name_bigram_TP = find_bigram_most_frequent(df_test_public.text, name_bigram)
fm.save_array(np.asarray(X_TP_bigram), "X_TP_bigram")

In [49]:
# AFTER FIRST RUN: import the frequent bigrams from disk instead of recomputing them
X_TP_bigram = fm.import_array("X_TP_bigram")

--- THIS AREA IS FOR FINDING THE TRAIN BIGRAMS THAT ARE MISSING FROM THE TEST DATASET

In [50]:
# True for every training bigram that is absent from the test bigrams
mask = [b not in name_bigram_TP for b in name_bigram]

In [51]:
# Training bigrams that were not found among the test bigrams
name_bigram[mask]

array(['ah sorry', 'holy crap'], dtype=object)

---END

In [52]:
# Show summary statistics of the test bigram columns
df_X_TP_bigram = pd.DataFrame(X_TP_bigram, columns=name_bigram_TP)
df_X_TP_bigram.describe()

Unnamed: 0,_user_ _url_,_user_ _user_,_user_ excellent,_user_ thanks,_user_ think,_user_ true,_user_ worries,accepted answer,asp net,awesome thanks,...,ve got,ve seen,ve tried,ve using,visual studio,want use,won work,works fine,works great,wow great
count,3463.0,3463.0,3463.0,3463.0,3463.0,3463.0,3463.0,3463.0,3463.0,3463.0,...,3463.0,3463.0,3463.0,3463.0,3463.0,3463.0,3463.0,3463.0,3463.0,3463.0
mean,0.000643,0.000677,0.00098,0.000471,0.00047,6.4e-05,0.000991,9e-05,0.000985,0.000635,...,0.000301,0.000245,0.000318,0.000163,0.000793,0.000501,5.3e-05,0.000544,0.000164,0.000272
std,0.014064,0.016539,0.025259,0.013984,0.011659,0.003749,0.030325,0.005297,0.01508,0.020099,...,0.008258,0.010695,0.007147,0.00554,0.013638,0.010933,0.003137,0.013993,0.005648,0.011345
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.50259,0.692871,1.0,0.656406,0.395807,0.220638,1.0,0.311688,0.358855,1.0,...,0.284566,0.55845,0.200799,0.202859,0.359254,0.380205,0.184618,0.641426,0.230646,0.503013
