In [13]:
from flask import Flask, request, render_template_string
import threading
import chardet

# Initialize Flask app
app = Flask(__name__)

# Boyer-Moore algorithm for pattern matching
def boyer_moore_search(text, pattern):
    m = len(pattern)
    n = len(text)

    if m == 0:
        return []

    # Generate bad character table
    bad_char = [-1] * 256
    for i in range(m):
        bad_char[ord(pattern[i])] = i

    s = 0
    matches = []
    while s <= n - m:
        j = m - 1

        while j >= 0 and pattern[j] == text[s + j]:
            j -= 1

        if j < 0:
            matches.append(s)
            s += (m - bad_char[ord(text[s + m])] if s + m < n else 1)
        else:
            s += max(1, j - bad_char[ord(text[s + j])])

    return matches

# Flask route to handle file uploads and pattern matching
 

def detect_encoding(file):
    raw_data = file.read()
    result = chardet.detect(raw_data)
    encoding = result['encoding']
    return raw_data.decode(encoding)
@app.route('/', methods=['GET', 'POST'])
def index():
    result = []
    if request.method == 'POST':
        file1 = request.files['file1']
        file2 = request.files['file2']
        
        if file1 and file2:
            text1 = file1.read().decode('utf-8')
            text2 = file2.read().decode('utf-8')
            
            # Find common phrases or words (for simplicity, let's assume we're looking for common words)
            words1 = text1.split()
            words2 = text2.split()
            
            common_words = set(words1).intersection(words2)
            
            for word in common_words:
                positions1 = boyer_moore_search(text1, word)
                positions2 = boyer_moore_search(text2, word)
                result.append({
                    'word': word,
                    'positions1': positions1,
                    'positions2': positions2
                })
                result = []
    similarity_metrics = {}
    if request.method == 'POST':
        file1 = request.files['file1']
        file2 = request.files['file2']

        if file1 and file2:
            text1 = detect_encoding(file1)
            text2 = detect_encoding(file2)

            words1 = text1.split()
            words2 = text2.split()

            common_words = set(words1).intersection(words2)
            common_word_count = len(common_words)
            total_words = max(len(words1), len(words2))
            percentage_similarity = (common_word_count / total_words) * 100

            for word in common_words:
                positions1 = boyer_moore_search(text1, word)
                positions2 = boyer_moore_search(text2, word)
                result.append({
                    'word': word,
                    'positions1': positions1,
                    'positions2': positions2
                })

            similarity_metrics = {
                'common_word_count': common_word_count,
                'total_words': total_words,
                'percentage_similarity': percentage_similarity
            }

            # Plotting the similarity metrics
            fig, ax = plt.subplots()
            ax.bar(['Common Words', 'Total Words'], [common_word_count, total_words])
            ax.set_ylabel('Count')
            ax.set_title('Similarity Metrics')
            buf = BytesIO()
            plt.savefig(buf, format='png')
            buf.seek(0)
            img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')

               


    
    
    
    # Using render_template_string for simplicity in Jupyter Notebook
    return render_template_string('''
    <!doctype html>
    <html lang="en">
    <head>
        <meta charset="utf-8">
        <title>Similarity Analysis</title>
    </head>
    <body>
        <h1>Similarity Analysis</h1>
        <form action="/" method="post" enctype="multipart/form-data">
            <div>
                <label for="file1">Upload File 1:</label>
                <input type="file" id="file1" name="file1" required>
            </div>
            <div>
                <label for="file2">Upload File 2:</label>
                <input type="file" id="file2" name="file2" required>
            </div>
            <div>
                <button type="submit">Analyze</button>
            </div>

</form>
        
        {% if result %}
            <h2>Common Phrases/Words and Positions</h2>
            <ul>
                {% for item in result %}
                    <li>
                        <strong>{{ item.word }}</strong><br>
                        File 1 Positions: {{ item.positions1 }}<br>
                        File 2 Positions: {{ item.positions2 }}
                    </li>
                {% endfor %}
            </ul>
        {% endif %}
    </body>
    </html>
    ''', result=result)

# Function to run the Flask app in a separate thread
def run_app():
    app.run(port=5000)

# Start the Flask app in a separate thread
thread = threading.Thread(target=run_app)
thread.start()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [14/Jul/2024 23:52:00] "GET / HTTP/1.1" 200 -
[2024-07-14 23:52:12,018] ERROR in app: Exception on / [POST]
Traceback (most recent call last):
  File "C:\Users\User\anaconda3\Lib\site-packages\flask\app.py", line 2525, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\User\anaconda3\Lib\site-packages\flask\app.py", line 1822, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\User\anaconda3\Lib\site-packages\flask\app.py", line 1820, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\User\anaconda3\Lib\site-packages\flask\app.py", line 1796, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Use