In [None]:
import numpy as np
import pandas as pd

from apyori import apriori
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap, Normalize
from matplotlib.font_manager import FontProperties

from datetime import datetime
import en_nlp_utils

In [None]:
# Base directory that holds the input data files
SRC_PATH = "src/"

# Remember when the run began so the elapsed time can be reported later
start_time = datetime.now()
print(f"Text processing started at {start_time}")

Text processing started at 2024-06-29 17:37:58.250003


In [None]:
# Read the hotel review dataset from the source directory
df_review = pd.read_csv(f"{SRC_PATH}en_hotel_review.csv")

# Run the project's NULL-value check on the freshly loaded dataframe
en_nlp_utils.check_null(df_review, "df_review")

In [None]:
# Split the reviews by their "sentiment" label into two dataframes,
# each re-indexed from zero
df_pos = df_review.loc[df_review["sentiment"].eq("positive")].reset_index(drop=True)
df_neg = df_review.loc[df_review["sentiment"].eq("negative")].reset_index(drop=True)

## 1) Association Rule Mining

### a) Helper functions

In [None]:
def process_all_words(value):
    """Split a string into a list of whitespace-separated words.

    Args:
        value: The value to tokenise; only ``str`` instances are split.

    Returns:
        list: The words of ``value``, or an empty list when ``value`` is not
        a string (for example ``None`` or ``NaN``).
    """
    return value.split() if isinstance(value, str) else []


def _collect_rules(records, min_length=2):
    """Flatten apyori relation records into one dictionary per rule."""
    rules = []
    # Each record unpacks positionally as (items, support, ordered_statistics)
    for items, support, ordered_statistics in records:
        # apyori does not read a ``min_length`` keyword, so the minimum
        # itemset size has to be enforced here
        if len(items) < min_length:
            continue
        # One itemset can yield several rules; keep every one of them, not
        # only the first ordered statistic
        for items_base, items_add, confidence, lift in ordered_statistics:
            # A rule with an empty antecedent cannot be drawn as an edge
            if not items_base:
                continue
            rules.append({
                # Sorted so that the printed rule is stable across runs
                "antecedent": sorted(items_base),
                "consequent": sorted(items_add),
                "support": round(support, 4),
                "confidence": round(confidence, 4),
                "lift": round(lift, 4),
            })
    return rules


def _build_rule_graph(rules):
    """Build a directed graph with one edge per antecedent/consequent word pair."""
    graph = nx.DiGraph()
    for rule in rules:
        for ant in rule["antecedent"]:
            for con in rule["consequent"]:
                # Several rules can map onto the same word pair; keep the
                # strongest lift instead of whichever rule came last
                current = graph.get_edge_data(ant, con)
                if current is None or rule["lift"] > current["weight"]:
                    graph.add_edge(ant, con, weight=rule["lift"],
                                   support=rule["support"], confidence=rule["confidence"])
    return graph


def _shorten_edge(pos, src, dst, shrink_factor=0.1):
    """Return the edge end points with the destination end pulled back towards the source."""
    x1, y1 = pos[src]
    x2, y2 = pos[dst]
    new_x2 = x1 + (x2 - x1) * (1 - shrink_factor)
    new_y2 = y1 + (y2 - y1) * (1 - shrink_factor)
    return (x1, y1), (new_x2, new_y2)


def _draw_rule_graph(graph, seed=None):
    """Draw the rule graph with arrows coloured and labelled by lift."""
    edges = list(graph.edges(data=True))
    lifts = [data["weight"] for _, _, data in edges]

    # Use only the upper half of "Blues" so the weakest edge stays visible
    palette = plt.cm.Blues(np.linspace(0.5, 1, 256))
    new_cmap = LinearSegmentedColormap.from_list("new_blues", palette)
    # Build the lift -> colour scale once instead of once per edge
    norm = Normalize(vmin=min(lifts), vmax=max(lifts))

    # Seeded layout so the same rules always give the same picture
    pos = nx.spring_layout(graph, k=1, iterations=15, seed=seed)
    fig, ax = plt.subplots(figsize=(12, 12))

    nx.draw_networkx_nodes(graph, pos, node_size=3000, node_color="skyblue", ax=ax)

    for src, dst, data in edges:
        (x1, y1), (x2, y2) = _shorten_edge(pos, src, dst)
        ax.arrow(x1, y1, x2 - x1, y2 - y1, color=new_cmap(norm(data["weight"])),
                 alpha=0.8, head_width=0.05, head_length=0.1,
                 length_includes_head=True, width=0.01)
        # Show lift value at the midpoint of the (shortened) edge
        ax.text((x1 + x2) / 2, (y1 + y2) / 2, f"{data['weight']:.2f}", fontsize=10,
                horizontalalignment="center", verticalalignment="center")

    # Node labels, centred on the nodes
    for node, (x, y) in pos.items():
        ax.text(x, y, node, fontsize=12, horizontalalignment="center", verticalalignment="center")

    # Colour bar that explains the lift scale used for the arrows
    sm = plt.cm.ScalarMappable(cmap=new_cmap, norm=norm)
    sm.set_array([])
    fig.colorbar(sm, ax=ax)

    ax.set_title("Association Rules Network")
    plt.show()


def find_association_rules(target_text, min_support=0.01, min_confidence=0.3, min_lift=3, min_length=2,
                           seed=42):
    """Mine association rules between words and plot them as a network.

    Each entry of ``target_text`` is split into words and treated as one
    transaction. The rules found by the Apriori algorithm are printed and
    then drawn as a directed graph whose edges go from every antecedent word
    to every consequent word.

    Args:
        target_text: pandas Series of whitespace-separated strings; entries
            that are not strings or contain no words are ignored.
        min_support: Minimum support passed to ``apriori``.
        min_confidence: Minimum confidence passed to ``apriori``.
        min_lift: Minimum lift passed to ``apriori``.
        min_length: Minimum number of items an itemset must contain for its
            rules to be kept. Enforced in this function, since ``apriori``
            does not read this keyword.
        seed: Seed for the spring layout; pass ``None`` for a different
            layout on every call.

    Returns:
        None. The rules are printed and the figure is shown.
    """
    # Process target_text: split each string into a list of words and drop
    # transactions without any word
    transactions = [words for words in target_text.apply(process_all_words) if words]

    # Find association rules using Apriori algorithm
    association_results = list(apriori(transactions, min_support=min_support,
                                       min_confidence=min_confidence, min_lift=min_lift))
    rules_summary = _collect_rules(association_results, min_length)

    print(f"Success. {len(rules_summary)} rules found.")
    print("========================================")

    for rule in rules_summary:
        print(f"Rule: {rule['antecedent']} => {rule['consequent']}")
        print("Support: " + str(rule["support"]))
        print("Confidence: " + str(rule["confidence"]))
        print("Lift: " + str(rule["lift"]))
        print("----------------------------------------")

    graph = _build_rule_graph(rules_summary)

    # Nothing to draw when no rule produced an edge
    if graph.number_of_edges() == 0:
        print("No edges to plot. Exiting visualization.")
        return

    _draw_rule_graph(graph, seed=seed)

### b) Find associations using the whole cleaned reviews

#### For Positive Reviews

In [None]:
# Name of the column that is mined in this section
target_text = "review_cleaned_v3"

# Keep only the positive reviews whose target column is not NULL
df_target_text = df_pos[df_pos[target_text].notna()].reset_index(drop=True)
print(f"Length of df_pos: {len(df_pos)}")
print(f"Length of df_target_text: {len(df_target_text)}")

# Mine and plot the association rules for the target column
find_association_rules(
    df_target_text[target_text],
    min_support=0.01,
    min_confidence=0.3,
    min_lift=10,
)

#### For Negative Reviews

In [None]:
# Keep only the negative reviews whose target column is not NULL
df_target_text = df_neg[df_neg[target_text].notna()].reset_index(drop=True)
print(f"Length of df_neg: {len(df_neg)}")
print(f"Length of df_target_text: {len(df_target_text)}")

# Mine and plot the association rules with the function's default thresholds
find_association_rules(df_target_text[target_text])

### c) Find associations using words grouped by POS tag

#### For Positive Reviews

In [None]:
# Keep only the positive reviews with a non-NULL "ADJ" column
df_tag = df_pos[df_pos["ADJ"].notna()].reset_index(drop=True)
print(f"Length of ADJ: {len(df_tag)}")

# Mine and plot the association rules among the "ADJ" words
find_association_rules(df_tag["ADJ"], min_support=0.0005, min_confidence=0.5, min_lift=10)

In [None]:
# Keep only the positive reviews with a non-NULL "ADV" column
df_tag = df_pos[df_pos["ADV"].notna()].reset_index(drop=True)
print(f"Length of ADV: {len(df_tag)}")

# Mine and plot the association rules among the "ADV" words
find_association_rules(df_tag["ADV"], min_support=0.0005, min_confidence=0.3, min_lift=10)

In [None]:
# Keep only the positive reviews with a non-NULL "NOUN" column
df_tag = df_pos[df_pos["NOUN"].notna()].reset_index(drop=True)
print(f"Length of NOUN: {len(df_tag)}")

# Mine and plot the association rules among the "NOUN" words
find_association_rules(df_tag["NOUN"], min_support=0.001, min_confidence=0.6, min_lift=30)

In [None]:
# Keep only the positive reviews with a non-NULL "VERB" column
df_tag = df_pos[df_pos["VERB"].notna()].reset_index(drop=True)
print(f"Length of VERB: {len(df_tag)}")

# Mine and plot the association rules among the "VERB" words
find_association_rules(df_tag["VERB"], min_support=0.0005, min_confidence=0.3, min_lift=10)

In [None]:
# Keep only the positive reviews with a non-NULL "PROPN" column
df_tag = df_pos[df_pos["PROPN"].notna()].reset_index(drop=True)
print(f"Length of PROPN: {len(df_tag)}")

# Mine and plot the association rules among the "PROPN" words
find_association_rules(df_tag["PROPN"], min_support=0.001, min_confidence=0.8, min_lift=30)

In [None]:
# Keep only the positive reviews with a non-NULL "DET" column
df_tag = df_pos[df_pos["DET"].notna()].reset_index(drop=True)
print(f"Length of DET: {len(df_tag)}")

# Mine and plot the association rules among the "DET" words
find_association_rules(df_tag["DET"], min_support=0.0005, min_confidence=0.3, min_lift=10)

In [None]:
# Keep only the positive reviews with a non-NULL "NUM" column
df_tag = df_pos[df_pos["NUM"].notna()].reset_index(drop=True)
print(f"Length of NUM: {len(df_tag)}")

# Mine and plot the association rules among the "NUM" words
find_association_rules(df_tag["NUM"], min_support=0.0005, min_confidence=0.3, min_lift=10)

#### For Negative Reviews

In [None]:
# Keep only the negative reviews with a non-NULL "ADJ" column
df_tag = df_neg[df_neg["ADJ"].notna()].reset_index(drop=True)
print(f"Length of ADJ: {len(df_tag)}")

# Mine and plot the association rules among the "ADJ" words
find_association_rules(df_tag["ADJ"], min_support=0.0008, min_confidence=0.5, min_lift=10)

In [None]:
# Keep only the negative reviews with a non-NULL "ADV" column
df_tag = df_neg[df_neg["ADV"].notna()].reset_index(drop=True)
print(f"Length of ADV: {len(df_tag)}")

# Mine and plot the association rules among the "ADV" words
find_association_rules(df_tag["ADV"], min_support=0.0008, min_confidence=0.8, min_lift=50)

In [None]:
# Keep only the negative reviews with a non-NULL "NOUN" column
df_tag = df_neg[df_neg["NOUN"].notna()].reset_index(drop=True)
print(f"Length of NOUN: {len(df_tag)}")

# Mine and plot the association rules among the "NOUN" words
find_association_rules(df_tag["NOUN"], min_support=0.001, min_confidence=0.8, min_lift=30)

In [None]:
# Keep only the negative reviews with a non-NULL "VERB" column
df_tag = df_neg[df_neg["VERB"].notna()].reset_index(drop=True)
print(f"Length of VERB: {len(df_tag)}")

# Mine and plot the association rules among the "VERB" words
find_association_rules(df_tag["VERB"], min_support=0.001, min_confidence=0.3, min_lift=10)

In [None]:
# Keep only the negative reviews with a non-NULL "PROPN" column
df_tag = df_neg[df_neg["PROPN"].notna()].reset_index(drop=True)
print(f"Length of PROPN: {len(df_tag)}")

# Mine and plot the association rules among the "PROPN" words
find_association_rules(df_tag["PROPN"], min_support=0.002, min_confidence=0.3, min_lift=10)

In [None]:
# Keep only the negative reviews with a non-NULL "DET" column
df_tag = df_neg[df_neg["DET"].notna()].reset_index(drop=True)
print(f"Length of DET: {len(df_tag)}")

# Mine and plot the association rules among the "DET" words
find_association_rules(df_tag["DET"], min_support=0.001, min_confidence=0.3, min_lift=10)

In [None]:
# Keep only the negative reviews with a non-NULL "NUM" column
df_tag = df_neg[df_neg["NUM"].notna()].reset_index(drop=True)
print(f"Length of NUM: {len(df_tag)}")

# Mine and plot the association rules among the "NUM" words
find_association_rules(df_tag["NUM"], min_support=0.001, min_confidence=0.3, min_lift=10)

In [None]:
# Report when the run finished and how long it took since start_time
end_time = datetime.now()
print(f"Text processing ended at {end_time}")
print(f"Text processing spent {end_time - start_time}")