In [11]:
pip show pyabsa


Name: pyabsa
Version: 2.4.1.post1
Summary: This tool provides the state-of-the-art models for aspect term extraction (ATE), aspect polarity classification (APC), and text classification (TC).
Home-page: https://github.com/yangheng95/PyABSA
Author: Yang, Heng
Author-email: hy345@exeter.ac.uk
License: MIT
Location: /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages
Requires: autocuda, boostaug, findfile, gitpython, metric-visualizer, networkx, pandas, protobuf, pytorch-warmup, sentencepiece, seqeval, spacy, termcolor, torch, tqdm, transformers, typing-extensions, update-checker
Required-by: boostaug
Note: you may need to restart the kernel to use updated packages.


In [44]:
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter
from pyabsa import AspectTermExtraction as ATEPC, available_checkpoints
from sqlalchemy import create_engine, text
import spacy


In [45]:
nlp = spacy.load("en_core_web_sm")
print("SpaCy model loaded successfully!")

SpaCy model loaded successfully!


### Helper functions

In [19]:
# Pre processing functions
def rid_numbers_punct(df):
    def clean_text(text):
        # Ensure the input is a string
        text = str(text) if text is not None else ""
        # Remove numbers and punctuation, except essential ones like periods for sentence structure
        return re.sub(r'[^\w\s]', '', text)  # Keep only alphanumeric and spaces
    
    # Apply the cleaning function
    df['review_text'] = df['review_text'].astype(str).apply(clean_text)
    return df


# Function to transform all sentences to lowercase
def lowercase(df):
    df['review_text'] = df['review_text'].str.lower()
    return df

def blank_lines_reduction(df):
    df['review_text'].replace('', np.nan, inplace=True)
    return df

# Function to remove stopwords
def exclude_stopwords(df):
    stop_words = set(stopwords.words('english'))
    def remove_stopwords(text):
        tokens = word_tokenize(text)
        return ' '.join([word for word in tokens if word not in stop_words])
    df['review_text'] = df['review_text'].apply(remove_stopwords)
    return df

# Function to remove words with fewer than three characters
def remove_under_three_characters(df):
    def remove_short_words(text):
        return ' '.join([word for word in text if len(word) >= 3])
    df['review_text'] = df['review_text'].apply(remove_short_words)
    return df

# Function to tokenize the text
def tokenize(df):
    df['review_text'] = df['review_text'].apply(word_tokenize)
    return df

# Stemming function
def stemming(df):
    stemmer = PorterStemmer()
    def stem_text(tokens):
        return [stemmer.stem(word) for word in tokens]
    df['review_text'] = df['review_text'].apply(lambda tokens: stem_text(tokens))
    return df

# Lemmatization function for words appearing in <1% of the corpus
def lemmatize_rare_words(df):
    lemmatizer = WordNetLemmatizer()
    
    # Flatten list of tokens to calculate word frequencies
    all_tokens = [word for tokens in df['review_text'] for word in tokens]
    word_freq = Counter(all_tokens)
    total_words = sum(word_freq.values())
    threshold = total_words * 0.01  # 1% threshold
    
    # Identify rare words
    rare_words = {word for word, freq in word_freq.items() if freq < threshold}
    
    def lemmatize_text(tokens):
        return [
            lemmatizer.lemmatize(word) if word in rare_words else word
            for word in tokens
        ]
    
    df['review_text'] = df['review_text'].apply(lambda tokens: lemmatize_text(tokens))
    return df


In [20]:
# Minimal preprocessing function
def minimal_preprocessing(df):
    def clean_text(text):
        # Ensure the input is a string
        text = str(text) if text is not None else ""
        # Remove URLs
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
        # Remove special characters, keeping basic punctuation
        text = re.sub(r"[^a-zA-Z0-9\s.,!?']+", "", text)
        # Normalize whitespace
        text = re.sub(r"\s+", " ", text).strip()
        return text

    # Apply minimal cleaning
    df['review_text'] = df['review_text'].apply(clean_text)
    return df

In [46]:
# Load absa model and test it out
def load_and_apply_absa_model(df):
    aspect_extractor = ATEPC.AspectExtractor('multilingual',
                                        auto_device=True,  # False means load model on CPU
                                        cal_perplexity=True,
                                        )
    
    # Apply model
    def apply_model(text):
        processed_text = aspect_extractor.predict(text, ignore_error = True)
        return processed_text

    df['predicted_text'] = df['review_text'].apply(apply_model)
    return df
    

In [47]:
# Example pipeline for preprocessing what choices actually to apply
def preprocess_pipeline(df):
    df = minimal_preprocessing(df)
    return df

In [80]:
def extract_aspects_sentiments(row):
    """
    Extracts aspect terms and sentiments from PyABSA predictions.

    Args:
        row (dict): A dictionary containing the predicted text output from PyABSA.

    Returns:
        tuple: (aspect_terms, sentiment_terms) as comma-separated strings.
    """
    if isinstance(row, dict) and 'aspect' in row and 'sentiment' in row:
        aspect_terms = ", ".join(row['aspect']) if row['aspect'] else ""
        sentiment_terms = ", ".join(row['sentiment']) if row['sentiment'] else ""
        return aspect_terms, sentiment_terms
    return "", ""

### End of helper functions

In [69]:
### Connection to the database
engine = create_engine('postgresql+psycopg2://patrickstewart:password@localhost:5432/patrickstewart')

In [70]:
try:
    # Establish a connection
    with engine.connect() as connection:
        # Query to extract the 'product_info' table
        product_info_query = "SELECT * FROM product_info"
        product_info_df = pd.read_sql_query(text(product_info_query), connection)
        print("Product Info Table extracted successfully.")

        # Query to extract the 'product_reviews' table
        product_reviews_query = "SELECT * FROM product_reviews"
        product_reviews_df = pd.read_sql_query(text(product_reviews_query), connection)
        print("Product Reviews Table extracted successfully.")

    # Display the first few rows of each DataFrame to confirm
    print("\nProduct Info DataFrame:\n", product_info_df.head())
    print("\nProduct Reviews DataFrame:\n", product_reviews_df.head())
except Exception as e:
    print("Error:", e)


Error: 'str' object is not callable


In [71]:
### Let's just select the data most important and keep that as the demo - just the table product_reviews is fine

In [72]:
### Apply processing function
# Rid blank lines seen in the function
product_reviews_df_process =  preprocess_pipeline(product_reviews_df)

In [73]:
product_reviews_df_process =  load_and_apply_absa_model(product_reviews_df.head(100))

[2025-01-31 16:21:14] (2.4.1.post1) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2025-01-31 16:21:14] (2.4.1.post1) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2025-01-31 16:21:14] (2.4.1.post1) [32mDownloading checkpoint:multilingual [0m
[2025-01-31 16:21:14] (2.4.1.post1) [31mNotice: The pretrained model are used for testing, it is recommended to train the model on your own custom datasets[0m
[2025-01-31 16:21:14] (2.4.1.post1) Checkpoint already downloaded, skip
[2025-01-31 16:21:14] (2.4.1.post1) Load aspect extractor from checkpoints/ATEPC_MULTILINGUAL_CHECKPOINT
[2025-01-31 16:21:14] (2.4.1.post1) config: checkpoints/ATEPC_MULTILINGUAL_CHECKPOINT/fast_lcf_atepc.config
[2025-01-31 16:21:14] (2.4.1.post1) state_dict: checkpoints/ATEPC_MULTILINGUAL_CHECKPOINT/fast_lcf_atepc.state_dict
[2025-01-31 16:21:14] (2.4.1.post1) model: None
[2025-01-31 16:21:14] (2.4.1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  torch.load(


[2025-01-31 16:21:33] (2.4.1.post1) The results of aspect term extraction have been saved in /Users/patrickstewart/Documents/Customer review platform/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json
[2025-01-31 16:21:33] (2.4.1.post1) Example 0: I use this with the Nudestix Citrus [32m<Clean Balm:Positive Confidence:0.9715>[0m MakeUp [32m<Melt:Positive Confidence:0.974>[0m to double cleanse and it has completely changed my skin for the better . The [32m<makeup melt:Positive Confidence:0.8979>[0m is oil based and removes all of your makeup super easily . I followup with this water based cleanser , and I also use this just by itself when Im not wearing makeup . It leaves the skin gently cleansed , but without stripping the skin . 1010 recommend combining with the makeup melt . Its perfection !
[2025-01-31 16:21:34] (2.4.1.post1) The results of aspect term extraction have been saved in /Users/patrickstewart/Documents/Customer review platform/Aspect Term 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted_text'] = df['review_text'].apply(apply_model)


In [81]:
# Apply extraction to DataFrame
product_reviews_df_process[['aspect_terms', 'sentiments']] = product_reviews_df_process['predicted_text'].apply(
    lambda x: pd.Series(extract_aspects_sentiments(x))
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_reviews_df_process[['aspect_terms', 'sentiments']] = product_reviews_df_process['predicted_text'].apply(


In [82]:
product_reviews_df_process[['aspect_terms', 'sentiments']]

Unnamed: 0,aspect_terms,sentiments
0,"Clean Balm, Melt, makeup melt","Positive, Positive, Positive"
1,"lip, jelly","Negative, Positive"
2,lip mask,Positive
3,"formula, Grapefruit","Positive, Positive"
4,price,Positive
...,...,...
95,,
96,"use, lips","Positive, Positive"
97,"laneige, lips","Positive, Positive"
98,applicator,Negative


In [91]:
product_reviews_df_process.to_csv('product_processed.csv')