In [271]:
from flask import Flask, request, jsonify, send_from_directory
import os
import docx
import PyPDF2
import numpy as np
import re
from sklearn.manifold import MDS
from scipy.spatial.distance import pdist, squareform
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler

app = Flask(__name__, static_folder='../static', static_url_path='')

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def split_into_sentences(text):
    sentences = re.split(r'(?<=[.!?])', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences

def compute_sentence_embeddings(sentences):
    return model.encode(sentences)

def compute_cosine_distance_matrix(embeddings):
    return squareform(pdist(embeddings, 'cosine'))

def reduce_to_one_dimension(distance_matrix):
    """
    Reduces a distance matrix to one dimension using Multidimensional Scaling (MDS).

    Parameters:
    distance_matrix (numpy.ndarray): The input distance matrix.

    Returns:
    numpy.ndarray: A one-dimensional representation of the input distance matrix.
    """
    mds = MDS(n_components=1, dissimilarity='precomputed', random_state=42)
    one_dimensional = mds.fit_transform(distance_matrix).flatten()
    return one_dimensional

def normalize_array(array):
    min_val = np.min(array)
    max_val = np.max(array)
    return (array - min_val) / (max_val - min_val)

def get_color(value):
    """
    Given a value between 0 and 1, this function returns an RGB color value that transitions
    smoothly between red, yellow, green, cyan, and blue. The value parameter determines the 
    position of the color in the transition. The color is calculated by interpolating between 
    the colors in the transitions list. The function ensures that the value is within the 
    range [0, 1] and calculates the segment and local value based on the segment length. If 
    the segment is equal to the number of segments, the segment is set to the last segment and 
    the local value is set to 1. The function then calculates the interpolated color by 
    interpolating between the start and end colors in the transitions list. The function 
    returns the interpolated color as a tuple of three integers representing the RGB values.
    
    :param value: A float between 0 and 1 that determines the position of the color in the 
                  transition.
    :type value: float
    :return: An RGB color value as a tuple of three integers representing the RGB values.
    :rtype: tuple(int, int, int)
    """
    transitions = [
        (255, 0, 0),   # Red
        (255, 255, 0), # Yellow
        (0, 255, 0),   # Green
        (0, 255, 255), # Cyan
        (0, 0, 255),    # Blue
    ]
    value = max(0, min(1, value))
    num_segments = len(transitions) - 1
    segment_length = 1.0 / num_segments
    segment = int(value / segment_length)
    if segment == num_segments:
        segment = num_segments - 1
        local_value = 1.0
    else:
        local_value = (value - segment * segment_length) / segment_length
    start_color = transitions[segment]
    end_color = transitions[segment + 1]
    interpolated_color = tuple(
        int(start_color[i] + local_value * (end_color[i] - start_color[i]))
        for i in range(3)
    )
    return interpolated_color

def get_color_map(normalized_array):
    colors = [get_color(i) for i in normalized_array]
    return colors

def generate_html(sentences, colors):
    """
    Generates an HTML content based on the input sentences and their corresponding colors.

    Parameters:
    sentences (list): A list of sentences to be included in the HTML.
    colors (list): A list of colors corresponding to each sentence.

    Returns:
    str: The generated HTML content.
    """
    html_content = '''
    <html>
    <head>
        <style>
            body {
                font-family: Arial, sans-serif;
                line-height: 1.6;
                background-color: #f5f5f5;
                padding: 20px;
            }
            .sentence {
                display: inline-block;
                margin-bottom: 10px;
                padding: 5px;
                border-radius: 5px;
                white-space: pre-wrap; /* Preserve spaces and newlines */
            }
            .file-separator {
                display: block;
                margin: 20px 0;
                height: 1px;
                background-color: #ccc;
            }
        </style>
    </head>
    <body>
    '''
    for i, sentence in enumerate(sentences):
        color = colors[i]
        html_content += f'<span class="sentence" style="background-color: rgb{color}">{sentence}</span> '
        if '\n' in sentence:
            html_content += '<br>'

    html_content += '</body></html>'
    return html_content

def get_cluster_labels(embeddings):
    ss = StandardScaler()
    scaled_embeds = ss.fit_transform(embeddings)
    clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=32).fit(scaled_embeds)
    return clustering.labels_

def average_for_each_cluster(embeddings, labels):
    clusters = {}
    for i, j in zip(labels, embeddings):
        if i not in clusters:
            clusters[i] = []
        clusters[i].append(j)
    for i in clusters:
        clusters[i] = sum(clusters[i]) / len(clusters[i])
    clusters = dict(sorted(clusters.items()))
    return clusters

def index_mapping(lst):
    index_dict = {}
    for index, value in enumerate(lst):
        if value in index_dict:
            index_dict[value].append(index)
        else:
            index_dict[value] = [index]
    index_dict = dict(sorted(index_dict.items()))
    return index_dict

def get_one_dimentional_clusters(index_dict, embeddings):
    one_dimensional_clusters = {}
    for i, k in index_dict.items():
        this_cluster = []
        if len(k) == 1:
            one_dimensional_clusters[i] = [0.5]
        else:
            for j in k:
                this_cluster.append(embeddings[j])
            cosine_distance_matrix = compute_cosine_distance_matrix(np.array(this_cluster))
            one_dimensional_cluster = reduce_to_one_dimension(cosine_distance_matrix)
            normalized_array = normalize_array(one_dimensional_cluster)
            one_dimensional_clusters[i] = normalized_array
    one_dimensional_clusters = dict(sorted(one_dimensional_clusters.items()))
    return one_dimensional_clusters

def extract_colors(cluster_values, index_dict, cluster_colors):
    colored_points = {}
    for cluster, values in cluster_values.items():
        max_color = cluster_colors[cluster]
        colored_points[cluster] = [interpolate_color(value, max_color) for value in values]
    colors_index = {}
    for k1, k2 in zip(index_dict.values(), colored_points.values()):
        for i, j in zip(k1, k2):
            colors_index[i] = tuple(j)
    colors = dict(sorted(colors_index.items())).values()
    return colors

def interpolate_color(value, max_color):
    return np.clip(np.array(max_color) - int(value*20), 0, 255)

In [107]:
all_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial Intelligence is transforming the world.",
    "Python is a versatile programming language.",
    "Data science involves statistics, programming, and domain knowledge.",
    "Machine learning algorithms learn from data to make predictions.",
    "The Earth revolves around the Sun.",
    "Quantum computing promises to revolutionize technology.",
    "Climate change is a pressing global issue.",
    "Renewable energy sources are essential for a sustainable future.",
    "The Internet has connected people worldwide.",
    "Blockchain technology enables decentralized transactions.",
    "The Great Wall of China is one of the Seven Wonders of the World.",
    "Human genetics is a fascinating field of study.",
    "Cybersecurity is crucial in protecting data and privacy.",
    "Virtual reality can create immersive experiences.",
    "Biodiversity is vital for a healthy ecosystem.",
    "The theory of relativity was proposed by Albert Einstein.",
    "Smartphones have become an integral part of modern life.",
    "The human brain is a complex organ.",
    "Space exploration helps us understand the universe.",
    r"The oceans cover more than 70% of the Earth's surface.",
    "Artificial neural networks are inspired by the human brain.",
    "I love cartoons and anime.",
    "Attack on titan is the best anime to watch.",
    "The universe is expanding and changing.",
]


In [281]:
embeddings = compute_sentence_embeddings(all_sentences)
labels = get_cluster_labels(embeddings)
clusters = average_for_each_cluster(embeddings, labels)
index_dict = index_mapping(labels)
cluster_values = get_one_dimentional_clusters(index_dict, embeddings)
cluster_embeddings = list(clusters.values())
cosine_distance_matrix = compute_cosine_distance_matrix(cluster_embeddings)
one_dimensional = reduce_to_one_dimension(cosine_distance_matrix)
normalized_array = normalize_array(one_dimensional)
cluster_colors = dict(enumerate(get_color_map(normalized_array)))
colors = extract_colors(cluster_values, index_dict, cluster_colors)



In [282]:
colors

dict_values([(245, 0, 0), (0, 159, 248), (0, 146, 235), (0, 152, 241), (0, 155, 244), (0, 0, 251), (131, 251, 0), (0, 235, 208), (0, 246, 219), (120, 240, 0), (127, 247, 0), (0, 0, 239), (0, 162, 251), (124, 244, 0), (135, 255, 0), (0, 255, 228), (0, 0, 247), (115, 235, 0), (0, 166, 255), (0, 0, 243), (0, 0, 255), (0, 166, 255), (59, 255, 0), (39, 235, 0), (0, 0, 235)])

In [269]:
clusters.values()

dict_values([array([-0.21257837, -0.22894716, -0.274597  , -0.30743423, -0.00857934,
       -0.14875458, -0.07222147, -0.00719772,  0.16228414,  0.12208735,
       -0.19777323, -0.1766618 , -0.0041559 ,  0.02584129, -0.36996296,
       -0.11518591, -0.24328254, -0.04439108, -0.22428131, -0.14666514,
       -0.19472751,  0.03810656,  0.1363705 , -0.10357957, -0.03996561,
        0.09865342,  0.07654794, -0.06079931,  0.07266394, -0.18513682,
        0.3484679 ,  0.20320202,  0.37088206,  0.09708074, -0.5017683 ,
        0.14741997, -0.07604994, -0.03045066,  0.1107987 ,  0.31524763,
       -0.06352799, -0.07498413,  0.06610546,  0.14920254,  0.09633522,
        0.13391785, -0.25910994,  0.13779546, -0.08259051,  0.01754911,
       -0.18431377, -0.08183572, -0.02911495, -0.18366684,  0.0456252 ,
        0.40467265,  0.2768479 , -0.30175743, -0.20217167, -0.21736829,
        0.10531955, -0.22626802,  0.12225018,  0.28932256,  0.26497874,
        0.02483741, -0.01942676,  0.02771701,  0.06



{0: (0, 166, 255),
 1: (135, 255, 0),
 2: (0, 0, 255),
 3: (255, 0, 0),
 4: (59, 255, 0),
 5: (0, 255, 228)}

{0: (0, 166, 255),
 1: (135, 255, 0),
 2: (0, 0, 255),
 3: (255, 0, 0),
 4: (59, 255, 0),
 5: (0, 255, 228)}

In [218]:
clustering.labels_

array([3, 0, 0, 0, 0, 2, 1, 5, 5, 1, 1, 2, 0, 1, 1, 5, 2, 1, 0, 2, 2, 0,
       4, 4, 2], dtype=int64)

In [256]:
def index_mapping(lst):
    index_dict = {}
    for index, value in enumerate(lst):
        if value in index_dict:
            index_dict[value].append(index)
        else:
            index_dict[value] = [index]
    index_dict = dict(sorted(index_dict.items()))
    return index_dict
index_dict = index_mapping(clustering.labels_)


In [233]:
def get_one_dimentional_clusters(index_dict, embeddings):
    one_dimensional_clusters = {}
    for i, k in index_dict.items():
        this_cluster = []
        if len(k) == 1:
            one_dimensional_clusters[i] = [0.5]
        else:
            for j in k:
                this_cluster.append(embeddings[j])
            cosine_distance_matrix = compute_cosine_distance_matrix(np.array(this_cluster))
            one_dimensional_cluster = reduce_to_one_dimension(cosine_distance_matrix)
            normalized_array = normalize_array(one_dimensional_cluster)
            one_dimensional_clusters[i] = normalized_array
    one_dimensional_clusters = dict(sorted(one_dimensional_clusters.items()))
    return one_dimensional_clusters

x = get_one_dimentional_clusters(index_dict, embeddings)



In [255]:
index_dict

{3: [0],
 0: [1, 2, 3, 4, 12, 18, 21],
 2: [5, 11, 16, 19, 20, 24],
 1: [6, 9, 10, 13, 14, 17],
 5: [7, 8, 15],
 4: [22, 23]}

In [254]:
import numpy as np

# Define the colors for each cluster
cluster_colors = {
    0: (0, 166, 255),
    1: (135, 255, 0),
    2: (0, 0, 255),
    3: (255, 0, 0),
    4: (59, 255, 0),
    5: (0, 255, 228)
}

# Define the values for each cluster
cluster_values = {
    0: np.array([0.39541123, 1.0, 0.71233395, 0.59644517, 0.24247726, 0.00598407, 0.0]),
    1: np.array([0.23209577, 0.78091155, 0.43006641, 0.59062934, 0.0, 1.0]),
    2: np.array([0.23066032, 0.82782919, 0.43688449, 0.62689432, 0.0, 1.0]),
    3: np.array([0.5]),
    4: np.array([0.0, 1.0]),
    5: np.array([1.0, 0.48642724, 0.0])
}

def interpolate_color(value, max_color):
    return np.clip(np.array(max_color) - int(value*20), 0, 255)

# Generate colors for each point in each cluster
def extract_colors(cluster, values):
    colored_points = {}
    for cluster, values in cluster_values.items():
        max_color = cluster_colors[cluster]
        colored_points[cluster] = [interpolate_color(value, max_color) for value in values]
    colors_index = {}
    for k1, k2 in zip(index_dict.values(), colored_points.values()):
        for i, j in zip(k1, k2):
            colors_index[i] = tuple(j)
    colors = dict(sorted(colors_index.items())).values()
    return colors
# Example of accessing the colors for cluster 0
print(colored_points[0])


[array([  0, 159, 248]), array([  0, 146, 235]), array([  0, 152, 241]), array([  0, 155, 244]), array([  0, 162, 251]), array([  0, 166, 255]), array([  0, 166, 255])]


In [253]:
colored_points

{0: [array([  0, 163, 252]),
  array([  0, 156, 245]),
  array([  0, 159, 248]),
  array([  0, 161, 250]),
  array([  0, 164, 253]),
  array([  0, 166, 255]),
  array([  0, 166, 255])],
 1: [array([133, 253,   0]),
  array([128, 248,   0]),
  array([131, 251,   0]),
  array([130, 250,   0]),
  array([135, 255,   0]),
  array([125, 245,   0])],
 2: [array([  0,   0, 253]),
  array([  0,   0, 247]),
  array([  0,   0, 251]),
  array([  0,   0, 249]),
  array([  0,   0, 255]),
  array([  0,   0, 245])],
 3: [array([250,   0,   0])],
 4: [array([ 59, 255,   0]), array([ 49, 245,   0])],
 5: [array([  0, 245, 218]), array([  0, 251, 224]), array([  0, 255, 228])]}