In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import numpy as np
import pandas as pd

In [5]:
def simple_tokenizer(text):
    tokens = re.findall(r"\b\w+\b", text.lower())
    return tokens

In [6]:
def preprocess_text(text):
    return " ".join(simple_tokenizer(text))

In [7]:
def calculate_common_words(text1, text2):
    # Prétraiter les textes
    preprocessed_text1 = preprocess_text(text1)
    preprocessed_text2 = preprocess_text(text2)

    # Vectorisation des textes
    set_text1 = set(simple_tokenizer(preprocessed_text1))
    set_text2 = set(simple_tokenizer(preprocessed_text2))

    # Calcul de la similarité cosinus entre les vecteurs
    # similarity_matrix = cosine_similarity(vectors)

    # La valeur de similarité cosinus est dans la position (0, 1) de la matrice
    # similarity = similarity_matrix[0, 1]

    # Retourner un tableau pandas représentant l'ensemble des mots similaires et le pourcentage de similarité
    common_words = set_text1.intersection(set_text2)

    # Calculer la similarité en fonction du nombre de mots communs
    similarity = len(common_words) / max(len(set_text1), len(set_text2))

    # Retourner un tableau pandas représentant les mots communs et le pourcentage de similarité
    return pd.DataFrame({"Common_Words": list(common_words)}), similarity

In [8]:
def calculate_similarity_matrix(texts):
    # Nombre de textes
    num_texts = len(texts)

    # Initialiser une liste pour stocker les résultats
    results = []

    # Calculer la similarité entre chaque paire de textes
    for i in range(num_texts):
        for j in range(i + 1, num_texts):
            similarity_df = calculate_common_words(texts[i], texts[j])
            # Ajouter le résultat à la liste des résultats
            results.append((similarity_df, f"Similarity between Text {i+1} and Text {j+1}"))

    return results

In [10]:
def test_similarity():
    # Exemple de textes (bouts de code)
    code_texts = [
        "def add_numbers(a, b):\n    return a + b",
        "def multiply_numbers(x, y):\n    return x * y",
        "for i in range(5):\n    print(i)",
        "Prétraiter les textes preprocessed_text1 = preprocess_text(text1) preprocessed_text2 = preprocess_text(text2) vectorizer = CountVectorizer(tokenizer = simple_tokenizer) vectors = vectorizer.fit_transform([preprocessed_text1, preprocessed_text2]) similarity_matrix = cosine_similarity(vectors) similarity = similarity_matrix[0, 1] common_words = set(vectorizer.get_feature_names_out()) return pd.DataFrame({'Common_Words': list(common_words), 'Similarity': similarity})",
        "Prétraiter les textes preprocessed_text1 = preprocess_text(text1) preprocessed_text2 = preprocess_text(text2) set_text1 = set(simple_tokenizer(preprocessed_text1)) set_text2 = set(simple_tokenizer(preprocessed_text2)) common_words = set_text1.intersection(set_text2) similarity = len(common_words) / max(len(set_text1), len(set_text2)) return pd.DataFrame({'Common_Words': list(common_words), 'Similarity': similarity})",
    ]
    # Calculer la matrice de similarité
    similarity_results = calculate_similarity_matrix(code_texts)

    # Afficher les résultats
    for result, description in similarity_results:
        print(f"\n{description}:\n")
        print(result)


# Appeler la fonction de test
test_similarity()


Similarity between Text 1 and Text 2:

(  Common_Words
0       return
1          def, 0.4)

Similarity between Text 1 and Text 3:

(Empty DataFrame
Columns: [Common_Words]
Index: [], 0.0)

Similarity between Text 1 and Text 4:

(  Common_Words
0       return, 0.038461538461538464)

Similarity between Text 1 and Text 5:

(  Common_Words
0       return, 0.047619047619047616)

Similarity between Text 2 and Text 3:

(Empty DataFrame
Columns: [Common_Words]
Index: [], 0.0)

Similarity between Text 2 and Text 4:

(  Common_Words
0       return, 0.038461538461538464)

Similarity between Text 2 and Text 5:

(  Common_Words
0       return, 0.047619047619047616)

Similarity between Text 3 and Text 4:

(Empty DataFrame
Columns: [Common_Words]
Index: [], 0.0)

Similarity between Text 3 and Text 5:

(Empty DataFrame
Columns: [Common_Words]
Index: [], 0.0)

Similarity between Text 4 and Text 5:

(          Common_Words
0   preprocessed_text1
1   preprocessed_text2
2         common_words
3          