In [1]:
import pandas as pd
import numpy as np
import datetime
import json

from debther_texts import *
from cluster_functions_nb import *

from dash import Dash, html, dcc, Input, Output, callback,State
from dash import dash_table
import plotly.express as px
import plotly.graph_objects as go

In [36]:
cluster_df = pd.DataFrame()
comparison_df = pd.DataFrame()


edition_comparison = Dash(__name__)

edition_comparison.layout = html.Div([
    html.H1(children='Edition comparison - cluster finder', style={'textAlign': 'center'}),
    dcc.Store(id='store_df'),
    dcc.Store(id='download_df'),
    html.Button('example_Data',id='load_example',n_clicks=0),

    html.Div([
        # Wrapper Div für die beiden Textbereiche
        html.Div([
            dcc.Markdown(children=f'Text A', id='md_a', style={'width': '100%'}),
            dcc.Textarea(id='text_a_input', style={'width': '90%', 'height': 100}),
            dcc.Input(id='seperators',placeholder="all seperators (z. B.: ',', '|', ';')", 
                style={'width': '90%','height':20}),
            
            html.Div('minimal cluster size', id='slider_head', style={'margin-right': '10px'}),  # Text mit Abstand
               
            dcc.Slider(
                    0, 50, 1, value=10, id='cluster_slider',
                    tooltip={"placement": "bottom", "always_visible": True}, marks=None 
                    ),
            dcc.RadioItems(['Bubble','Line'],value='Bubble',id='graph_type')

            ], style={'width': '45%'}),

            html.Div([
                dcc.Markdown(children=f'Text B', id='md_b', style={'width': '100%'}),
                dcc.Textarea(id='text_b_input', style={'width': '90%', 'height': 100}),
                html.Button('Analyze',id='start_button', style={'width': '20%', 'height': 40},n_clicks=0)
                ], style={'width': '45%'}),
            html.Div([],id='loading')
        
        ], style={'display': 'flex'}),
        html.Div([
                
                html.Div([
                    dcc.Loading([
                    dcc.Graph(id='graph')],id='loading1')
                ], style={'width': '45%'}),

                html.Div([
                    dcc.Loading([
                    dcc.Graph(id='graph_diff')],id='loading2')
                ], style={'width': '45%'})
                
                ], style={'display': 'flex', 'justify-content': 'space-between', 'align-items': 'center'}
                ),
        html.Div([
                html.Button('Remove Cluster', id='remove', n_clicks=0),
                html.Button('Update Graph', id='update_g_button', n_clicks=0),
                html.Button('Update Table', id='update_table', n_clicks=0),
                html.Button('Download Data', id='download_button', n_clicks=0),
                dcc.Download('download')
        ]),
        html.Div([
            dash_table.DataTable(
                id='table',
                style_header={'backgroundColor': 'lightgrey', 'fontWeight': 'bold'},
                style_cell={'textAlign': 'left','minWidth': '20px', 'maxWidth': '400px', 'width': 'auto'},
                style_data={'whiteSpace': 'normal','height': 'auto'}  # Automatische Höhe für mehrere Zeilen
            )
        ])
        

])
# remove selected data
@callback(
        Output('store_df','data',allow_duplicate=True),
        Input('remove','n_clicks'),
        State('graph','selectedData'),
        State('store_df','data'),
        prevent_initial_call=True
)
def get_click(click,remove_data, df_data):
    remove_lst = [i['x'] for i in remove_data['points']]
    df = pd.DataFrame(df_data)
    df = df[~df['start_a'].isin(remove_lst)]
    return df.to_dict()



@callback(
    Output('text_a_input','value'),
    Output('text_b_input','value'),
    Output('seperators','value'),
    Input('load_example','n_clicks'),
    prevent_initial_call=True
)
def load_example(click):
    if click >=1:
        text_a = debther_gangtok()#[:30000]
        text_b = debther_peking()#[:30000]
        seperators_1,a,b,c = debther_parameters()
        print(seperators_1)
        seps = r""
        for i in seperators_1:
            seps += ""
            seps += i+","
        print(seps.rstrip(','))
        return text_a,text_b,seps
    else:
        return '','',''

# Analyse and create graph
@callback(
    Output('graph','figure', allow_duplicate=True),
    Output('graph_diff','figure', allow_duplicate=True),
    Output('store_df','data'),
    State('text_a_input','value'),
    State('text_b_input','value'),
    State('seperators','value'),
    State('cluster_slider','value'),
    State('graph_type','value'),
    Input('start_button','n_clicks'),
    prevent_initial_call=True
)
def update_graph(text_a,text_b,seps,min_clus,type,clicks):
    if clicks >= 1:
        seps = seps.split(',')

        text_a = clean(text_a,seps) 
        text_b = clean(text_b,seps)    

        cluster_df = find_cluster(text_a,text_b,int(min_clus),'a','b')
        print(cluster_df)
        if type == 'Bubble':
            fig1 = px.scatter(cluster_df, x='start_a', y='start_b', size='length')
            fig1.update_layout(clickmode='event+select')
            fig2 = px.scatter(cluster_df, x='start_a', y='differenz', size='length')
        else:
            fig1 = go.Figure()
            for _, row in cluster_df.iterrows():
                fig1.add_trace(go.Scatter(
                x=[row['start_a'], row['end_a']],  # X-Koordinaten
                y=[row['start_b'], row['end_b']],  # Y-Koordinaten
                mode='lines+markers',  # Zeigt Linien und Marker
                name=f"Line {row.name}",  # Optionale Beschriftung
                line=dict(color='blue',width=2),  # Linienbreite
                marker=dict(color='blue',size=5)  # Markierungsgröße
                ))
                fig1.update_layout(
                    title="Linien-Visualisierung basierend auf DataFrame",
                    xaxis_title="Text_a",
                    yaxis_title="Text_b",
                    showlegend=False,  # Legende anzeigen
                    )
                fig1.add_trace(go.Scatter(
                    x=[0, max(cluster_df['end_a'].max(), cluster_df['end_b'].max())],  # Bereich für die Diagonale
                    y=[0, max(cluster_df['end_a'].max(), cluster_df['end_b'].max())],  # y=x
                    mode='lines',  # Nur Linie
                    line=dict(color='grey', width=1),  # Rote gestrichelte Linie
                    name='y = x',  # Beschriftung der Linie
                    showlegend=False  # Keine Legende
                ))
            fig2 = go.Figure()
            for _, row in cluster_df.iterrows():
                fig2.add_trace(go.Scatter(
                x=[row['start_a'], row['end_a']],  # X-Koordinaten
                y=[row['differenz'], row['differenz']],  # Y-Koordinaten
                mode='lines+markers',  # Zeigt Linien und Marker
                name=f"Line {row.name}",  # Optionale Beschriftung
                line=dict(color='blue',width=2),  # Linienbreite
                marker=dict(color='blue',size=5)  # Markierungsgröße
                ))
                fig1.update_layout(
                    title="Linien-Visualisierung basierend auf DataFrame",
                    xaxis_title="Text_a",
                    yaxis_title="Text_b",
                    showlegend=False,  # Legende anzeigen
                    )
                fig1.add_trace(go.Scatter(
                    x=[0, max(cluster_df['end_a'].max(), cluster_df['end_b'].max())],  # Bereich für die Diagonale
                    y=[0, 0],  # y=x
                    mode='lines',  # Nur Linie
                    line=dict(color='grey', width=1),  # Rote gestrichelte Linie
                    name='y = x',  # Beschriftung der Linie
                    showlegend=False  # Keine Legende
                ))
        
        cluster_df = cluster_df.to_dict()
        return fig1, fig2, cluster_df
    
    else:
        return px.scatter(),px.scatter(),cluster_df

# only change type of graph
@callback(
    Output('graph','figure', allow_duplicate=True),
    Output('graph_diff','figure', allow_duplicate=True),
    Input('graph_type','value'),
    Input('update_g_button', 'n_clicks'),
    State('store_df','data'),
    prevent_initial_call=True
)
def update_graph(type,clicks,cluster_df):
    cluster_df = pd.DataFrame(cluster_df)
    print(cluster_df)
    if type == 'Bubble':
        fig1 = px.scatter(cluster_df, x='start_a', y='start_b', size='length')
        fig1.update_layout(clickmode='event+select')
        fig2 = px.scatter(cluster_df, x='start_a', y='differenz', size='length')
    else:
        fig1 = go.Figure()
        for _, row in cluster_df.iterrows():
            fig1.add_trace(go.Scatter(
            x=[row['start_a'], row['end_a']],  # X-Koordinaten
            y=[row['start_b'], row['end_b']],  # Y-Koordinaten
            mode='lines+markers',  # Zeigt Linien und Marker
            name=f"Line {row.name}",  # Optionale Beschriftung
            line=dict(color='blue',width=2),  # Linienbreite
            marker=dict(color='blue',size=5)  # Markierungsgröße
            ))
            fig1.update_layout(
                title="Linien-Visualisierung basierend auf DataFrame",
                xaxis_title="Text_a",
                yaxis_title="Text_b",
                showlegend=False,  # Legende anzeigen
                )
            fig1.add_trace(go.Scatter(
                x=[0, max(cluster_df['end_a'].max(), cluster_df['end_b'].max())],  # Bereich für die Diagonale
                y=[0, max(cluster_df['end_a'].max(), cluster_df['end_b'].max())],  # y=x
                    mode='lines',  # Nur Linie
                    line=dict(color='grey', width=1),  # Rote gestrichelte Linie
                    name='y = x',  # Beschriftung der Linie
                    showlegend=False  # Keine Legende
                ))
        fig2 = go.Figure()
        for _, row in cluster_df.iterrows():
            fig2.add_trace(go.Scatter(
                x=[row['start_a'], row['end_a']],  # X-Koordinaten
                y=[row['differenz'], row['differenz']],  # Y-Koordinaten
                mode='lines+markers',  # Zeigt Linien und Marker
                name=f"Line {row.name}",  # Optionale Beschriftung
                line=dict(color='blue',width=2),  # Linienbreite
                marker=dict(color='blue',size=5)  # Markierungsgröße
                ))
            fig2.update_layout(
                    title="Linien-Visualisierung basierend auf DataFrame",
                    xaxis_title="Text_a",
                    yaxis_title="Text_b",
                    showlegend=False,  # Legende anzeigen
                    )
            fig2.add_trace(go.Scatter(
                    x=[0, max(cluster_df['end_a'].max(), cluster_df['end_b'].max())],  # Bereich für die Diagonale
                    y=[0, 0],  # y=x
                    mode='lines',  # Nur Linie
                    line=dict(color='grey', width=1),  # Rote gestrichelte Linie
                    name='y = x',  # Beschriftung der Linie
                    showlegend=False  # Keine Legende
                ))
    
    return fig1, fig2
    
    
@callback(
    Output('table','columns'),
    Output('table','data'),
    Output('download_df','data'),
    Input('update_table','n_clicks'),
    State('store_df','data'),
    State('text_a_input','value'),
    State('text_b_input','value'),
    State('seperators','value'),
    prevent_initial_callback=True
)
def update_table(click,data,text_a,text_b,sep):
    if data is not None:
        pass
        df = pd.DataFrame(data)
        text_a = clean(text_a,sep) 
        text_b = clean(text_b,sep) 
        print(text_a)
        df = compare_texts(text_a,text_b,df)
        columns = [{"name": col, "id": col} for col in df.keys()]
        return columns,df.to_dict('records'),df.to_dict()
    return [{'name': 'tag', 'id': 'tag'}, {'name': 'Pos_a', 'id': 'Pos_a'}, {'name': 'Length_a', 'id': 'Length_a'}, {'name': 'a', 'id': 'a'}, {'name': 'Pos_b', 'id': 'Pos_b'}, {'name': 'Length_b', 'id': 'Length_b'}, {'name': 'b', 'id': 'b'}, {'name': 'Length_Cluster', 'id': 'Length_Cluster'}, {'name': 'Cluster', 'id': 'Cluster'}], None,pd.DataFrame().to_dict()

@callback(
    Output('download','data'),
    Input('download_button','n_clicks'),
    State('download_df','data'),
    prevent_initial_callback=True
)
def download_df(n_clicks,data):
    if data is None or len(data)==0:
        return None
    
    df = pd.DataFrame(data)
    html_table = df.to_html(index=False)
    html_content = f"""
    <!DOCTYPE html>
    <html lang="de">
    <head>
        <meta charset="UTF-8">
        <title>Download Tabelle</title>
        <style>
            table {{
                border-collapse: collapse;
                width: 100%;
                border: 1px solid black;
            }}
            th, td {{
                border: 1px solid black;
                padding: 8px;
                text-align: left;
            }}
            th {{
                background-color: #f2f2f2;
            }}
        </style>
    </head>
    <body>
        <h2>Exportierte Tabelle</h2>
        {html_table}
    </body>
    </html>
    """
    return dict(content=html_content,filename='result.html')

edition_comparison.run(debug=True, jupyter_mode="external", port=9092)

Dash app running on http://127.0.0.1:9092/


['་', ' ', '\\n']
་, ,\n
no characters to replace given.
['་', ' ', '\\n', '']
no characters to replace given.
['་', ' ', '\\n', '']
     start_a  end_a  start_b  end_b  length  differenz
0         19     57        4     42      38        -15
1         58     88       43     73      30        -15
2         91    102       76     87      11        -15
3        107    131      100    124      24         -7
4        132    155      126    149      23         -6
..       ...    ...      ...    ...     ...        ...
816    27941  27951    51298  51308      10      23357
817    27960  27970    51318  51328      10      23358
818    27978  28006    51336  51364      28      23358
819    28007  28020    51365  51378      13      23358
820    28034  28056    51394  51416      22      23360

[821 rows x 6 columns]
     start_a  end_a  start_b  end_b  length  differenz
0         19     57        4     42      38        -15
1         58     88       43     73      30        -15
2         91    10

In [3]:
text_a = debther_gangtok()
text_b = debther_peking()
sep,_,_,_ = debther_parameters()

In [4]:
text_a = clean(text_a,sep)
text_b = clean(text_b,sep)

no characters to replace given.
['་', ' ', '\\n']
no characters to replace given.
['་', ' ', '\\n']


In [5]:
df = find_cluster(text_a,text_b,10)

In [6]:
df

Unnamed: 0,start_text_a,end_text_a,start_text_b,end_text_b,length,differenz
0,19,57,4,42,38,-15
1,58,88,43,73,30,-15
2,91,102,76,87,11,-15
3,107,131,100,124,24,-7
4,132,155,126,149,23,-6
...,...,...,...,...,...,...
816,27941,27951,51298,51308,10,23357
817,27960,27970,51318,51328,10,23358
818,27978,28006,51336,51364,28,23358
819,28007,28020,51365,51378,13,23358


In [7]:

df_t = compare_texts(text_a,text_b,df)

ValueError: cluster_dict muss die Keys ['start_a', 'end_a', 'start_b', 'end_b', 'length'] enthalten. Es enthält ['start_text_a', 'end_text_a', 'start_text_b', 'end_text_b', 'length', 'differenz']

In [None]:
df_t

Unnamed: 0,tag,Pos_text_a,Length_text_a,text_a,Pos_text_b,Length_text_b,text_b,Length_Cluster,Cluster
0,unique,0,19,དེབ་ཐེར་དམར་པོ\nའཚལ་པ་ཀུན་དགའ་རྡོ་རྗེས་མཛད་པའི...,0,4,དེབ་ཐེར་དམར་པོ།\nསྭསྟི།,0,
1,cluster,19,0,,4,0,,38,ངག་གི་དབང་ཕྱུག་ལ་ཕྱག་འཚལ་ལོ།་།མཁས་པའི་མདུན་སར་...
2,unique,57,1,བསྡེབས,42,1,སྡེབས,0,
3,cluster,58,0,,43,0,,30,པ་འདི།་།ཀུན་དགའི་སླད་དུ་ཀུན་དགའི་མིང་ཅན་དེ་ཡིས...
4,unique,88,3,ཆོགས་པ་ལས,73,3,ཚོགས་པ་ལས།,0,
...,...,...,...,...,...,...,...,...,...
1637,cluster,27978,0,,51336,0,,28,།འཇིག་རྟེན་བྱ་བ་འདི་ལ་མ་རྨོངས་ཤིང་།་།མང་པོའི་ད...
1638,unique,28006,1,འདི,51364,1,།འདི,0,
1639,cluster,28007,0,,51365,0,,13,ལ་འཁྲུལ་དང་ནོངས་པའི་ཚོགས་མཆིས་ན།་།ཤེས་ལྡན་སྐྱེ...
1640,unique,28020,14,ཀྱི་བཟོད་པར་མཛོད།་མི་ཡུན་ཐུང་ཤེས་བྱའི་རྣམ་པ་མང...,51378,16,ཀྲིས་བཟོད་པར་མཛོད།་།ཚེ་འདི་ཡུན་ཐུང་ཤེས་བྱའི་རྣ...,0,
