In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# Improved frequency generator using Counter for optimization
def gen_freq(descriptions):
    """Count how often each whitespace-separated token occurs across all descriptions.

    Args:
        descriptions: Iterable of strings; each one is tokenised with ``str.split()``.

    Returns:
        collections.Counter mapping each token to its total number of occurrences.
    """
    # One generator streams every token of every description into the Counter.
    return Counter(token for text in descriptions for token in text.split())

# Frequency calculation for positive (Combined_Output == 1) and negative
# (Combined_Output == 0) descriptions
pos_freq = gen_freq(df.loc[df['Combined_Output'] == 1, 'stemmed_description'])
neg_freq = gen_freq(df.loc[df['Combined_Output'] == 0, 'stemmed_description'])

# Combine unique keys from both positive and negative frequencies
all_keys = set(pos_freq.keys()).union(set(neg_freq.keys()))

# Create a DataFrame with word, positive count, and negative count.
# The keys are walked in sorted order because the iteration order of a set of
# strings changes between interpreter sessions (hash randomisation); sorting
# makes the row order of df_freq reproducible across kernel restarts.
data = [(key, pos_freq.get(key, 0), neg_freq.get(key, 0)) for key in sorted(all_keys)]
df_freq = pd.DataFrame(data, columns=['key', 'pos_count', 'neg_count'])

# Function to plot log graph for positive and negative word frequencies
def plot_log_graph(df_freq):
    """Draw a labelled log-log scatter of positive vs. negative word counts.

    Every row is plotted at ``(log(pos_count + 1), log(neg_count + 1))`` and
    annotated with its word. A red y = x diagonal separates words that are more
    frequent in positive descriptions (below the line) from those more frequent
    in negative descriptions (above the line).

    Args:
        df_freq: DataFrame whose first three columns are, in order, the word,
            its positive count and its negative count. Columns are read by
            position, so any additional trailing columns are ignored.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Rows as plain lists: [word, pos_count, neg_count, ...]
    data = df_freq.values.tolist()

    # Set up the figure and axis
    fig, ax = plt.subplots(figsize=(12, 8))

    # Convert raw counts to log scale (+1 so that a zero count maps to log(1) = 0)
    x = np.log([row[1] + 1 for row in data])
    y = np.log([row[2] + 1 for row in data])

    # One marker per word
    ax.scatter(x, y)

    # Assign axis labels
    ax.set_xlabel("Log Positive Count", fontsize=14)
    ax.set_ylabel("Log Negative Count", fontsize=14)

    # Annotate each point with its word
    for row, x_pos, y_pos in zip(data, x, y):
        ax.annotate(row[0], (x_pos, y_pos), fontsize=10, alpha=0.75)

    # Red y = x divider. Both coordinates of the end point must be the same
    # value, otherwise the segment is not the diagonal; use the larger of the
    # two axis maxima so the line spans the whole cloud. Skipped when there is
    # nothing to plot (max of an empty array is undefined).
    if len(data) > 0:
        limit = max(x.max(), y.max())
        ax.plot([0, limit], [0, limit], color='red', linewidth=2)

    # Add grid and title
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.title("Log-Log Plot of Word Frequencies (Positive vs Negative)", fontsize=16)

    # Show the plot
    plt.show()

# Render the log-log scatter for the word-frequency table
plot_log_graph(df_freq=df_freq)


# Multiple Graphs DBSCAN Clustering

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from collections import Counter
from sklearn.cluster import DBSCAN  # For clustering words based on proximity

# Improved frequency generator using Counter for optimization
def gen_freq(descriptions):
    """Tally whitespace-separated tokens over an iterable of description strings.

    Args:
        descriptions: Iterable of strings; each is split with ``str.split()``.

    Returns:
        collections.Counter mapping each token to the number of times it was seen.
    """
    tally = Counter()
    for text in descriptions:
        # Bump the running count once per token occurrence.
        for token in text.split():
            tally[token] += 1
    return tally

# Frequency calculation for positive (Combined_Output == 1) and negative
# (Combined_Output == 0) descriptions
pos_freq = gen_freq(df.loc[df['Combined_Output'] == 1, 'stemmed_description'])
neg_freq = gen_freq(df.loc[df['Combined_Output'] == 0, 'stemmed_description'])

# Combine unique keys from both positive and negative frequencies
all_keys = set(pos_freq.keys()).union(set(neg_freq.keys()))

# Create a DataFrame with word, positive count, and negative count.
# The keys are walked in sorted order because the iteration order of a set of
# strings changes between interpreter sessions (hash randomisation). Row order
# matters downstream: the plotting code slices each cluster into fixed-size
# chunks by position, so an unstable order would shuffle the figures on every
# kernel restart.
data = [(key, pos_freq.get(key, 0), neg_freq.get(key, 0)) for key in sorted(all_keys)]
df_freq = pd.DataFrame(data, columns=['key', 'pos_count', 'neg_count'])

# Convert counts to log scale; the +1 keeps zero counts finite (log(1) = 0)
df_freq['log_pos_count'] = np.log(df_freq['pos_count'] + 1)
df_freq['log_neg_count'] = np.log(df_freq['neg_count'] + 1)

# Group words using DBSCAN to find clusters based on their (x, y) coordinates
def group_words_by_proximity(df_freq, eps=0.5, min_samples=1):
    """Label each word with a DBSCAN cluster id based on its log-count coordinates.

    Args:
        df_freq: DataFrame with 'log_pos_count' and 'log_neg_count' columns.
        eps: Forwarded to ``DBSCAN(eps=...)``; controls how close two points
            must be to count as neighbours.
        min_samples: Forwarded to ``DBSCAN(min_samples=...)``.

    Returns:
        A new DataFrame with the same rows as ``df_freq`` plus an integer
        'cluster' column holding the DBSCAN labels. The input frame is not
        modified, so re-running the calling cell never changes data that an
        earlier cell already displayed.
    """
    coords = df_freq[['log_pos_count', 'log_neg_count']].values

    # scikit-learn estimators refuse to fit on zero samples; an empty table
    # simply gets an empty label column instead of raising.
    if len(coords) == 0:
        return df_freq.assign(cluster=np.array([], dtype=int))

    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(coords)
    return df_freq.assign(cluster=clustering.labels_)

# Assign every word to a proximity cluster, using a tighter radius (eps=0.3)
# than the function's default
df_freq = group_words_by_proximity(df_freq=df_freq, eps=0.3, min_samples=1)

# Function to plot log graph using plotly, split into multiple graphs
def plot_log_graph_interactive(df_freq, group_size=5):
    """Show the word-frequency scatter as a series of small interactive figures.

    Words are taken cluster by cluster (in order of first appearance of each
    cluster id) and every cluster is cut into consecutive chunks of at most
    ``group_size`` rows; one plotly figure is shown per chunk so that the word
    labels stay readable.

    Args:
        df_freq: DataFrame with 'key', 'log_pos_count', 'log_neg_count' and
            'cluster' columns.
        group_size: Maximum number of words per figure (default 5, the value
            that was previously hard-coded).

    Returns:
        None. Each figure is displayed with ``fig.show()``.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")

    for cluster_id in df_freq['cluster'].unique():
        # All words that belong to this cluster
        cluster_data = df_freq[df_freq['cluster'] == cluster_id]

        # Walk the cluster in chunks of `group_size` rows; groups are numbered from 1
        chunk_starts = range(0, len(cluster_data), group_size)
        for group_no, start in enumerate(chunk_starts, start=1):
            group_data = cluster_data.iloc[start:start + group_size]
            words = group_data['key'].tolist()
            x = group_data['log_pos_count'].tolist()
            y = group_data['log_neg_count'].tolist()

            # Interactive scatter with the word rendered above each marker
            fig = go.Figure(data=go.Scatter(
                x=x,
                y=y,
                mode='markers+text',
                text=words,  # Annotate with words
                textposition="top center",
                marker=dict(size=12, color='blue', opacity=0.7)
            ))

            # Titles and axis labels
            fig.update_layout(
                title=f'Log-Log Plot of Word Frequencies (Cluster {cluster_id}, Group {group_no})',
                xaxis_title="Log Positive Count",
                yaxis_title="Log Negative Count",
                showlegend=False
            )

            # Red y = x divider. The end point must have equal coordinates to
            # lie on the diagonal; the larger axis maximum is used so the line
            # reaches past every point in the chunk.
            limit = max(max(x), max(y))
            fig.add_shape(
                type='line',
                x0=0, y0=0, x1=limit, y1=limit,
                line=dict(color='red', width=2)
            )

            # Show the figure
            fig.show()

# Render the interactive figures for every cluster of the frequency table
plot_log_graph_interactive(df_freq=df_freq)


# Euclidean Distance

In [None]:

from scipy.spatial import distance_matrix

def filter_close_points(x, y, threshold=0.1):
    """Thin out a 2-D point set so that kept points are at least ``threshold`` apart.

    Points are visited in input order. Each point that is still kept when it is
    visited removes every other point whose Euclidean distance to it is below
    ``threshold``; the visited point itself always survives.

    Args:
        x: 1-D sequence of x coordinates (list or NumPy array).
        y: 1-D sequence of y coordinates, same length as ``x``.
        threshold: Minimum Euclidean distance between two kept points.

    Returns:
        Tuple ``(x_kept, y_kept)`` of NumPy arrays holding the coordinates of
        the surviving points, in their original order.
    """
    # Accept plain lists as well: boolean-mask indexing below needs arrays
    x = np.asarray(x)
    y = np.asarray(y)

    # Stack x and y into an (n, 2) array for the pairwise distance calculation
    points = np.vstack((x, y)).T
    mask = np.ones(len(points), dtype=bool)
    if len(points) == 0:
        return x[mask], y[mask]

    dist_matrix = distance_matrix(points, points)

    for i in range(len(points)):
        if mask[i]:
            # Drop everything in the neighbourhood of point i ...
            mask[dist_matrix[i] < threshold] = False
            # ... except point i itself: its distance to itself is 0, so the
            # line above also cleared it. Without restoring it every point
            # would eliminate itself and the result would always be empty.
            mask[i] = True

    # Return filtered points
    return x[mask], y[mask]

# Apply the function to filter out close points.
# The coordinates are derived from df_freq here: in the visible notebook `x`
# and `y` only exist as locals inside plot_log_graph, so referencing them at
# notebook scope raises NameError on a fresh kernel.
log_pos = np.log(df_freq['pos_count'].to_numpy() + 1)
log_neg = np.log(df_freq['neg_count'].to_numpy() + 1)
x_filtered, y_filtered = filter_close_points(log_pos, log_neg)


def plot_log_graph(df_freq):
    """Draw a decluttered, labelled log-log scatter of positive vs. negative counts.

    Every row is mapped to ``(log(pos_count + 1), log(neg_count + 1))``; the
    points are then thinned with ``filter_close_points`` so that overlapping
    labels are reduced, and only the surviving points are drawn and annotated.

    Args:
        df_freq: DataFrame whose first three columns are, in order, the word,
            its positive count and its negative count. Columns are read by
            position, so any additional trailing columns are ignored.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Rows as plain lists: [word, pos_count, neg_count, ...]
    data = df_freq.values.tolist()
    words = [row[0] for row in data]

    # Convert raw counts to log scale (+1 so that a zero count maps to log(1) = 0)
    x = np.log([row[1] + 1 for row in data])
    y = np.log([row[2] + 1 for row in data])

    # Filter out close points
    x_filtered, y_filtered = filter_close_points(x, y)

    # filter_close_points hands back coordinates only, so position i in the
    # filtered arrays is NOT row i of the table. Recover the word for each
    # surviving point by locating the first row with exactly these coordinates.
    labels = []
    for x_pos, y_pos in zip(x_filtered, y_filtered):
        matches = np.flatnonzero((x == x_pos) & (y == y_pos))
        labels.append(words[matches[0]] if matches.size else None)

    # Set up the figure and axis
    fig, ax = plt.subplots(figsize=(12, 8))

    # One marker per surviving word
    ax.scatter(x_filtered, y_filtered)

    # Axis labels so the figure can be read on its own
    ax.set_xlabel("Log Positive Count", fontsize=14)
    ax.set_ylabel("Log Negative Count", fontsize=14)

    # Annotate each surviving point with the word it belongs to
    for label, x_pos, y_pos in zip(labels, x_filtered, y_filtered):
        if label is not None:
            ax.annotate(label, (x_pos, y_pos), fontsize=10, alpha=0.75)

    # Red y = x divider. Both coordinates of the end point must be equal for
    # the segment to be the diagonal; skipped when no point survived, because
    # the maximum of an empty sequence is undefined.
    if len(x_filtered) > 0:
        limit = max(max(x_filtered), max(y_filtered))
        ax.plot([0, limit], [0, limit], color='red', linewidth=2)

    # Add grid and title
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.title("Log-Log Plot of Word Frequencies (Positive vs Negative)", fontsize=16)

    # Show the plot
    plt.show()

# Render the decluttered log-log scatter for the word-frequency table
plot_log_graph(df_freq=df_freq)
