## Import

In [None]:
import difflib
import os

import numpy as np
import pandas as pd
import spacy
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

## Config

In [None]:
# Show DataFrames in full when they are displayed: setting each of these
# options to None removes the corresponding display limit.
for _display_option in (
    'display.max_columns',   # show every column
    'display.max_rows',      # show every row
    'display.max_colwidth',  # do not truncate the text inside a cell
):
    pd.set_option(_display_option, None)

## Function

In [None]:
def read_docx(file_path):
    """Return the non-blank paragraphs of a .docx file as quoted strings.

    Args:
        file_path: Path of the document, passed unchanged to ``Document``.

    Returns:
        A list with one entry per paragraph whose text is non-empty after
        stripping surrounding whitespace. Each entry is the stripped text
        wrapped in literal double quotes (``"text"``), in document order.
    """
    stripped_texts = (
        paragraph.text.strip() for paragraph in Document(file_path).paragraphs
    )
    # Blank paragraphs are dropped; the double quotes become part of each entry.
    return [f'"{text}"' for text in stripped_texts if text]

def load_files_to_dataframe(file_paths):
    """Load UC and SSTS .docx files into a DataFrame, one row per document ID.

    A file is treated as a UC document when its *file name* contains ``'UC'``;
    every other file is treated as an SSTS document. Directory names are not
    inspected, so a folder whose name contains ``'UC'`` cannot misclassify an
    SSTS file.

    The document ID is the first run of digits in the file name (``'6583'``
    for ``UC-6583.docx``); a name without digits uses its whole stem as the
    ID. A UC file and an SSTS file with the same ID share a row. A file with
    no counterpart gets a row of its own with ``None`` in the other column,
    instead of being paired by list position with an unrelated document or
    making the DataFrame construction fail on unequal column lengths.

    Args:
        file_paths: Iterable of paths to .docx files. Both ``\\`` and ``/``
            are accepted as path separators.

    Returns:
        pandas.DataFrame with the columns ``'UC'`` and ``'SSTS'``. Each filled
        cell holds the list of lines returned by ``read_docx`` for that file.
        Rows are ordered by the first appearance of their document ID.
    """
    import re
    from pathlib import PureWindowsPath  # splits on both '\\' and '/'

    columns = ['UC', 'SSTS']
    rows = []        # row dicts in output order
    rows_by_id = {}  # document ID -> rows created for that ID

    for file_path in file_paths:
        lines = read_docx(file_path)

        path = PureWindowsPath(file_path)
        column = 'UC' if 'UC' in path.name else 'SSTS'
        id_match = re.search(r'\d+', path.stem)
        doc_id = id_match.group() if id_match else path.stem

        # Reuse the first row of this ID whose slot for this kind of document
        # is still free; a repeated ID of the same kind starts a new row so
        # that no file is silently dropped.
        candidates = rows_by_id.setdefault(doc_id, [])
        row = next((r for r in candidates if r[column] is None), None)
        if row is None:
            row = dict.fromkeys(columns)
            rows.append(row)
            candidates.append(row)
        row[column] = lines

    return pd.DataFrame(rows, columns=columns)


def lemmatize_text(text):
    """Return the lemmas of ``text`` joined by single spaces.

    The text is processed with the module-level ``nlp`` callable, which must
    be defined before this function is called. Tokens flagged as stop words
    or punctuation are left out.

    Args:
        text: The string to lemmatize.

    Returns:
        A string with the ``lemma_`` of every kept token, in token order.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

In [None]:
# Absolute paths of the source documents: UC files from the HMI folder first,
# then SSTS files from the SSTS folder. The number in each file name is the
# document ID.
# NOTE(review): the two groups do not contain the same IDs -- UC-26160 has no
# SSTS file in this list and SSTS-31523 has no UC file -- so confirm that
# load_files_to_dataframe pairs the rows the way you expect.
file_paths = [
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\HMI\UC-6583.docx', 
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\HMI\UC-8604.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\HMI\UC-8692.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\HMI\UC-8800.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\HMI\UC-11467.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\HMI\UC-25957.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\HMI\UC-26160.docx',  # no SSTS-26160 in this list
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\HMI\UC-26161.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\HMI\UC-26771.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\HMI\UC-28561.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\HMI\UC-30371.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\SSTS\SSTS-6583.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\SSTS\SSTS-8604.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\SSTS\SSTS-8692.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\SSTS\SSTS-8800.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\SSTS\SSTS-11467.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\SSTS\SSTS-25957.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\SSTS\SSTS-26161.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\SSTS\SSTS-26771.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\SSTS\SSTS-28561.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\SSTS\SSTS-30371.docx',
    r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\data\raw_data\SSTS\SSTS-31523.docx'  # no UC-31523 in this list

]  

# Read every document and collect the lines into the 'UC' / 'SSTS' columns.
df = load_files_to_dataframe(file_paths)

# Export is currently disabled.
#df.to_excel('outputs__1.xlsx', index=False)

# Display the loaded frame as the cell output.
df


In [None]:
# Workbook holding the texts to compare; the later cells read its 'UC' and
# 'SSTS' columns.
excel_path = r'C:\Users\HP\Documents\GitHub\atom-compliance-ml\src\notebooks\outputs__1.xlsx'
data = pd.read_excel(excel_path)

# Preview the first row as the cell output.
data.head(1)

In [None]:
# spaCy pipeline used by lemmatize_text through the global name `nlp`.
nlp = spacy.load("en_core_web_sm")

# Replace both text columns with their lemmatized form (UC first, then SSTS).
for _text_column in ('UC', 'SSTS'):
    data[_text_column] = data[_text_column].apply(lemmatize_text)

# The TF-IDF vocabulary and weights are fitted on the UC texts only; the SSTS
# texts are then transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
uc_vectors = vectorizer.fit_transform(data['UC'])
ssts_vectors = vectorizer.transform(data['SSTS'])

# Pairwise scores between every UC row (matrix rows) and every SSTS row
# (matrix columns).
euclidean_dist_matrix = euclidean_distances(uc_vectors, ssts_vectors)
similarity_matrix = cosine_similarity(uc_vectors, ssts_vectors)

# Best cosine similarity per UC row, and the mean squared shortfall of that
# best score from a perfect similarity of 1.
max_similarities = similarity_matrix.max(axis=1)
mse = np.square(1 - max_similarities).mean()

print("Mean Squared Error (MSE):", mse)

def get_differences_and_description(text1, text2):
    """Return the words that differ between ``text1`` and ``text2``.

    Both texts are split on whitespace and compared word by word with
    ``difflib.ndiff``. The function keeps no state between calls, so calling
    it repeatedly with the same arguments always gives the same result.

    Args:
        text1: The reference text.
        text2: The text compared against ``text1``.

    Returns:
        A ``(differences, description)`` tuple of space-joined strings.
        ``differences`` holds the ndiff entries for words found only in
        ``text1`` (each keeps its ``'- '`` marker); ``description`` holds the
        entries for words found only in ``text2`` (each keeps its ``'+ '``
        marker). Either string is empty when there is nothing to report.
    """
    differences = []
    description = []
    for entry in difflib.ndiff(text1.split(), text2.split()):
        # Entries for common words ('  ') and ndiff hint lines ('? ') are
        # skipped; only removals and additions are reported.
        if entry.startswith('-'):
            differences.append(entry)
        elif entry.startswith('+'):
            description.append(entry)
    return ' '.join(differences), ' '.join(description)

# Position of the best-matching SSTS row for every UC row, computed once.
best_match_idx = similarity_matrix.argmax(axis=1)

# argmax yields positions, not index labels, so the texts are looked up in
# plain lists; this stays correct even if `data` does not have the default
# 0..n-1 index.
uc_texts = data['UC'].tolist()
ssts_texts = data['SSTS'].tolist()
best_ssts_texts = [ssts_texts[j] for j in best_match_idx]

# One diff per (UC, best SSTS) pair; both report columns are taken from it.
pair_diffs = [
    get_differences_and_description(uc_text, ssts_text)
    for uc_text, ssts_text in zip(uc_texts, best_ssts_texts)
]

report_data = {
    "UC Requirement": data['UC'],
    "Best SSTS Match": best_ssts_texts,
    "Cosine Similarity": max_similarities,
    "Differences": [removed for removed, _ in pair_diffs],
    "Description": [added for _, added in pair_diffs],
}

report = pd.DataFrame(report_data)

# Display the report as the cell output.
report

In [None]:
# Keep only the matrix diagonals: entry i is the score between UC row i and
# SSTS row i.
data_col = pd.DataFrame(np.diagonal(similarity_matrix))
data_col_eu = pd.DataFrame(np.diagonal(euclidean_dist_matrix))

In [None]:
# Write both score tables as CSV files (relative paths, default to_csv options).
for _frame, _csv_name in ((data_col, 'data_cos1.csv'), (data_col_eu, 'data_eu.csv')):
    _frame.to_csv(_csv_name)