In [16]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def normalize_arabic(text):
    text = re.sub(r"[إأآ]", "ا", text)  
    text = re.sub(r"ة", "ه", text)  
    text = re.sub(r"ى", "ي", text)  
    return text

# Sample data
documents = [
    "قانون شرعي الأسرة وقضايا الزواج والطلاق",
    "محامي متخصص في القضايا الجنائية والعقوبات",
    "القانون التجاري وإدارة الشركات"
]

user_input = "شرعية عائلية قانون حضانة زواج حقوق مدنية"

# Normalize
documents = [normalize_arabic(doc) for doc in documents]
user_input = normalize_arabic(user_input)

# Apply TF-IDF
vectorizer = TfidfVectorizer(min_df=1, analyzer='word', token_pattern=r'\b\w+\b', stop_words=None)
tfidf_matrix = vectorizer.fit_transform([user_input] + documents)

# Compute similarity
user_vector = tfidf_matrix[0]
lawyer_vectors = tfidf_matrix[1:]

similarities = cosine_similarity(user_vector, lawyer_vectors).flatten()

# Print results
for idx, sim in enumerate(similarities):
    print(f"Lawyer {idx+1}: Similarity Score = {sim:.4f}")


Lawyer 1: Similarity Score = 0.1019
Lawyer 2: Similarity Score = 0.0000
Lawyer 3: Similarity Score = 0.0000


In [18]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to normalize Arabic text (like changing "إ" to "ا", etc.)
def normalize_arabic(text):
    text = re.sub(r"[إأآ]", "ا", text)  # Normalize Alef
    text = re.sub(r"ة", "ه", text)  # Normalize Ta Marbuta
    text = re.sub(r"ى", "ي", text)  # Normalize Ya
    return text

# Your new documents
documents = [
    "ممر حيي جواهرج باح باح star year crimin",
    "ريف دمشق دوم دمشق year star judiciari administr",
    "القانون التجاري وإدارة الشركات"
]

# User input
user_input = "شرعية عائلية قانون حضانة زواج حقوق مدنية"

# Normalize both user input and documents
documents = [normalize_arabic(doc) for doc in documents]
user_input = normalize_arabic(user_input)

# Print the normalized documents and user input
print("Normalized Documents:", documents)
print("Normalized User Input:", user_input)

# Combine user input and documents for vectorization
all_texts = [user_input] + documents

# Create a TF-IDF vectorizer (ensures Arabic words are tokenized correctly)
vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\b[\wء-ي]+\b', stop_words=None)

# Apply vectorizer on the combined texts (user input + documents)
tfidf_matrix = vectorizer.fit_transform(all_texts)

# Get the user input vector (first row) and lawyer vectors (remaining rows)
user_vector = tfidf_matrix[0]  # First row is user input
lawyer_vectors = tfidf_matrix[1:]  # Remaining rows are lawyer descriptions

# Compute cosine similarities between the user input vector and lawyer vectors
similarities = cosine_similarity(user_vector, lawyer_vectors).flatten()

# Print the similarity scores for each lawyer
for idx, sim in enumerate(similarities):
    print(f"Lawyer {idx+1}: Similarity Score = {sim:.4f}")


Normalized Documents: ['ممر حيي جواهرج باح باح star year crimin', 'ريف دمشق دوم دمشق year star judiciari administr', 'القانون التجاري واداره الشركات']
Normalized User Input: شرعيه عائليه قانون حضانه زواج حقوق مدنيه
Lawyer 1: Similarity Score = 0.0000
Lawyer 2: Similarity Score = 0.0000
Lawyer 3: Similarity Score = 0.0000


In [19]:
# Check the TF-IDF matrix
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())  # To view the dense matrix of TF-IDF values


TF-IDF Matrix:
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.37796447 0.37796447
  0.         0.         0.         0.         0.37796447 0.37796447
  0.37796447 0.37796447 0.37796447 0.         0.        ]
 [0.         0.32891916 0.         0.25932364 0.25932364 0.
  0.         0.         0.65783832 0.32891916 0.         0.
  0.32891916 0.         0.         0.         0.         0.
  0.         0.         0.         0.32891916 0.        ]
 [0.32891916 0.         0.32891916 0.25932364 0.25932364 0.
  0.         0.         0.         0.         0.         0.
  0.         0.65783832 0.32891916 0.32891916 0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.5
  0.5        0.5        0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.5       ]]


In [20]:
# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()
print("Feature Names (Words):")
print(feature_names)

Feature Names (Words):
['administr' 'crimin' 'judiciari' 'star' 'year' 'التجاري' 'الشركات'
 'القانون' 'باح' 'جواهرج' 'حضانه' 'حقوق' 'حيي' 'دمشق' 'دوم' 'ريف' 'زواج'
 'شرعيه' 'عائليه' 'قانون' 'مدنيه' 'ممر' 'واداره']


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Custom tokenizer for Arabic text that splits words appropriately
def arabic_tokenizer(text):
    return re.findall(r'\b[\wء-ي]+\b', text)  # Tokenize Arabic words

vectorizer = TfidfVectorizer(analyzer='word', tokenizer=arabic_tokenizer, stop_words=None)

# Apply vectorizer on the combined texts (user input + documents)
tfidf_matrix = vectorizer.fit_transform(all_texts)

# Get the feature names and print them
feature_names = vectorizer.get_feature_names_out()
print("Feature Names (Words):")
print(feature_names)

# Compute cosine similarity
user_vector = tfidf_matrix[0]  # First row is user input
lawyer_vectors = tfidf_matrix[1:]  # Remaining rows are lawyer descriptions

# Calculate cosine similarity
similarities = cosine_similarity(user_vector, lawyer_vectors).flatten()

# Print similarity scores
for idx, sim in enumerate(similarities):
    print(f"Lawyer {idx+1}: Similarity Score = {sim:.4f}")


Feature Names (Words):
['administr' 'crimin' 'judiciari' 'star' 'year' 'التجاري' 'الشركات'
 'القانون' 'باح' 'جواهرج' 'حضانه' 'حقوق' 'حيي' 'دمشق' 'دوم' 'ريف' 'زواج'
 'شرعيه' 'عائليه' 'قانون' 'مدنيه' 'ممر' 'واداره']
Lawyer 1: Similarity Score = 0.0000
Lawyer 2: Similarity Score = 0.0000
Lawyer 3: Similarity Score = 0.0000




In [22]:
# Inspect the TF-IDF matrix for user input and documents
print("TF-IDF Matrix (dense):")
print(tfidf_matrix.toarray())  # To view the dense matrix of TF-IDF values

# Check feature names (terms extracted by vectorizer)
feature_names = vectorizer.get_feature_names_out()
print("Feature Names (Words):")
print(feature_names)

# Check the specific non-zero values in the TF-IDF matrix for better insights
dense_matrix = tfidf_matrix.toarray()
print("Dense Matrix of TF-IDF values:")
print(dense_matrix)


TF-IDF Matrix (dense):
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.37796447 0.37796447
  0.         0.         0.         0.         0.37796447 0.37796447
  0.37796447 0.37796447 0.37796447 0.         0.        ]
 [0.         0.32891916 0.         0.25932364 0.25932364 0.
  0.         0.         0.65783832 0.32891916 0.         0.
  0.32891916 0.         0.         0.         0.         0.
  0.         0.         0.         0.32891916 0.        ]
 [0.32891916 0.         0.32891916 0.25932364 0.25932364 0.
  0.         0.         0.         0.         0.         0.
  0.         0.65783832 0.32891916 0.32891916 0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.5
  0.5        0.5        0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.5       ]]
Feature

In [23]:
# Tokenize the documents and user input separately to see what tokens are being extracted
print("User Input Tokenized:", arabic_tokenizer(user_input))
for idx, doc in enumerate(documents):
    print(f"Document {idx+1} Tokenized:", arabic_tokenizer(doc))


User Input Tokenized: ['شرعيه', 'عائليه', 'قانون', 'حضانه', 'زواج', 'حقوق', 'مدنيه']
Document 1 Tokenized: ['ممر', 'حيي', 'جواهرج', 'باح', 'باح', 'star', 'year', 'crimin']
Document 2 Tokenized: ['ريف', 'دمشق', 'دوم', 'دمشق', 'year', 'star', 'judiciari', 'administr']
Document 3 Tokenized: ['القانون', 'التجاري', 'واداره', 'الشركات']


In [24]:
documents = [
    "قانون شرعي حضانة زواج حقوق مدنية",
    "قانون الأسرة وشروط الحضانة في القانون الشرعي",
    "الحقوق المدنية والإدارية في القضايا القانونية"
]

user_input = "شرعية عائلية قانون حضانة زواج حقوق مدنية"


In [25]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to normalize Arabic text (like changing "إ" to "ا", etc.)
def normalize_arabic(text):
    text = re.sub(r"[إأآ]", "ا", text)  # Normalize Alef
    text = re.sub(r"ة", "ه", text)  # Normalize Ta Marbuta
    text = re.sub(r"ى", "ي", text)  # Normalize Ya
    return text

# Custom tokenizer for Arabic text that splits words appropriately
def arabic_tokenizer(text):
    return re.findall(r'\b[\wء-ي]+\b', text)  # Tokenize Arabic words

# Your documents (for debugging purposes, use documents with more overlap)
documents = [
    "قانون شرعي حضانة زواج حقوق مدنية",
    "قانون الأسرة وشروط الحضانة في القانون الشرعي",
    "الحقوق المدنية والإدارية في القضايا القانونية"
]

# User input
user_input = "شرعية عائلية قانون حضانة زواج حقوق مدنية"

# Normalize both user input and documents
documents = [normalize_arabic(doc) for doc in documents]
user_input = normalize_arabic(user_input)

# Print the normalized documents and user input
print("Normalized Documents:", documents)
print("Normalized User Input:", user_input)

# Combine user input and documents for vectorization
all_texts = [user_input] + documents

# Create a TF-IDF vectorizer (ensures Arabic words are tokenized correctly)
vectorizer = TfidfVectorizer(analyzer='word', tokenizer=arabic_tokenizer, stop_words=None)

# Apply vectorizer on the combined texts (user input + documents)
tfidf_matrix = vectorizer.fit_transform(all_texts)

# Inspect the TF-IDF matrix (dense format) to check non-zero values
dense_matrix = tfidf_matrix.toarray()
print("Dense Matrix of TF-IDF values:")
print(dense_matrix)

# Get the feature names and print them
feature_names = vectorizer.get_feature_names_out()
print("Feature Names (Words):")
print(feature_names)

# Compute cosine similarity
user_vector = tfidf_matrix[0]  # First row is user input
lawyer_vectors = tfidf_matrix[1:]  # Remaining rows are lawyer descriptions

# Calculate cosine similarity
similarities = cosine_similarity(user_vector, lawyer_vectors).flatten()

# Print similarity scores
for idx, sim in enumerate(similarities):
    print(f"Lawyer {idx+1}: Similarity Score = {sim:.4f}")


Normalized Documents: ['قانون شرعي حضانه زواج حقوق مدنيه', 'قانون الاسره وشروط الحضانه في القانون الشرعي', 'الحقوق المدنيه والاداريه في القضايا القانونيه']
Normalized User Input: شرعيه عائليه قانون حضانه زواج حقوق مدنيه
Dense Matrix of TF-IDF values:
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.35639424 0.35639424 0.35639424 0.
  0.4520409  0.4520409  0.         0.28853185 0.35639424 0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.39954636 0.39954636 0.39954636 0.5067739
  0.         0.         0.         0.32346721 0.39954636 0.
  0.        ]
 [0.40726515 0.40726515 0.         0.40726515 0.40726515 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.32109252 0.25995207 0.         0.
  0.40726515]
 [0.         0.         0.42176478 0.         0.         0.42176478
  0.42176478 0.42176478 0.         0.         0.         0.
  0.         0.         0.332524

In [26]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to normalize Arabic text (like changing "إ" to "ا", etc.)
def normalize_arabic(text):
    text = re.sub(r"[إأآ]", "ا", text)  # Normalize Alef
    text = re.sub(r"ة", "ه", text)  # Normalize Ta Marbuta
    text = re.sub(r"ى", "ي", text)  # Normalize Ya
    return text

# Custom tokenizer for Arabic text that splits words appropriately
def arabic_tokenizer(text):
    return re.findall(r'\b[\wء-ي]+\b', text)  # Tokenize Arabic words

# Your documents
documents = [
    "قانون شرعي حضانة زواج حقوق مدنية",
    "قانون الأسرة وشروط الحضانة في القانون الشرعي",
    "الحقوق المدنية والإدارية في القضايا القانونية"
]

# User input
user_input = "شرعية عائلية قانون حضانة زواج حقوق مدنية"

# Normalize both user input and documents
documents = [normalize_arabic(doc) for doc in documents]
user_input = normalize_arabic(user_input)

# Combine user input and documents for vectorization
all_texts = [user_input] + documents

# Create a TF-IDF vectorizer (ensures Arabic words are tokenized correctly)
vectorizer = TfidfVectorizer(analyzer='word', tokenizer=arabic_tokenizer, stop_words=None)

# Apply vectorizer on the combined texts (user input + documents)
tfidf_matrix = vectorizer.fit_transform(all_texts)

# Compute cosine similarity
user_vector = tfidf_matrix[0]  # First row is user input
lawyer_vectors = tfidf_matrix[1:]  # Remaining rows are lawyer descriptions

# Calculate cosine similarity
similarities = cosine_similarity(user_vector, lawyer_vectors).flatten()

# Print similarity scores
for idx, sim in enumerate(similarities):
    print(f"Lawyer {idx+1}: Similarity Score = {sim:.4f}")


Lawyer 1: Similarity Score = 0.6629
Lawyer 2: Similarity Score = 0.0750
Lawyer 3: Similarity Score = 0.0000


In [27]:
import requests
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pyterrier as pt

# Start PyTerrier
if not pt.started():
    pt.init()

# Function to normalize Arabic text (like changing "إ" to "ا", etc.)
def normalize_arabic(text):
    text = re.sub(r"[إأآ]", "ا", text)  # Normalize Alef
    text = re.sub(r"ة", "ه", text)  # Normalize Ta Marbuta
    text = re.sub(r"ى", "ي", text)  # Normalize Ya
    return text

# Custom tokenizer for Arabic text that splits words appropriately
def arabic_tokenizer(text):
    return re.findall(r'\b[\wء-ي]+\b', text)  # Tokenize Arabic words

# Get data from the API (Lawyers, Rates, Agencies, Issues)
lawyer_response = requests.get("http://osamanaser806-32078.portmap.io:32078/api/v1/ai/lawyers")
print(f"status code : {lawyer_response.status_code}")
rate_response = requests.get("http://osamanaser806-32078.portmap.io:32078/api/v1/ai/rates")
print(f"status code : {rate_response.status_code}")
agency_response = requests.get("http://osamanaser806-32078.portmap.io:32078/api/v1/ai/agencies")
print(f"status code : {agency_response.status_code}")
issue_response = requests.get("http://osamanaser806-32078.portmap.io:32078/api/v1/ai/issues")
print(f"status code : {issue_response.status_code}")

# Parse the responses into DataFrames
lawyers_response = lawyer_response.json()
lawyers_pere = pd.DataFrame(lawyers_response['lawyers'])
lawyers = lawyers_pere.rename(columns={"id": "lawyer_id"}).drop(
    ["name", "email", "union_number", "affiliation_date", "phone", "rates", "avatar"], axis=1
)
lawyers['years_of_experience'] = lawyers['years_of_experience'].apply(lambda x: f"{x}year")

rates_response = rate_response.json()
rates = pd.DataFrame(rates_response["rates"]).dropna()
rates.drop(["id"], axis=1, inplace=True)
rates['rating'] = rates['rating'].apply(lambda x: f"{x}star")

agencies_response = agency_response.json()
agencies = pd.DataFrame(agencies_response["agencies"]).rename(columns={"id": "agency_id"})
agencies = agencies[["agency_id", "lawyer_id"]]

issues_response = issue_response.json()
issues = pd.DataFrame(issues_response["issues"]).drop(
    ["base_number", "record_number", "id", "start_date", "end_date", "status"], axis=1
)

# Merge data
lawyers_with_rates = lawyers.merge(rates, on="lawyer_id")
lawyers_with_rates = pd.merge(agencies, lawyers_with_rates, on=["lawyer_id"], how="inner")
lawyers_with_rates = pd.merge(issues, lawyers_with_rates, on="agency_id", how="inner")

# Create a text column to join all relevant information for each lawyer
lawyers_with_rates["Text"] = lawyers_with_rates[['court_name', 'address', 'union_branch', 'years_of_experience', 'rating', 'estimated_cost']].astype(str).agg(" ".join, axis=1)

# Preprocess and split text by language (adjust as needed)
from Preprocessing.Preprocess import split_by_language
lawyers_with_rates['processed_text'] = lawyers_with_rates['Text'].apply(split_by_language)

# Prepare documents for TF-IDF vectorization
documents = lawyers_with_rates[['lawyer_id', 'processed_text']].copy()
documents['processed_text'] = documents['processed_text'].apply(lambda x: " ".join(x))
documents.rename(columns={"lawyer_id": "docno", "processed_text": "text"}, inplace=True)

# TF-IDF Vectorization with custom Arabic tokenizer
vectorizer = TfidfVectorizer(analyzer='word', tokenizer=arabic_tokenizer, stop_words=None)
all_descriptions = ["شرعية عائلية"] + documents["text"].tolist()  # Add user input as the first item

# Apply vectorization to the combined texts (user input + documents)
tfidf_matrix = vectorizer.fit_transform(all_descriptions)

# Compute the cosine similarity
user_vector = tfidf_matrix[0]  # First row is user input
lawyer_vectors = tfidf_matrix[1:]  # Remaining rows are lawyer descriptions

# Calculate cosine similarity
similarities = cosine_similarity(user_vector, lawyer_vectors).flatten()

# Rank lawyers based on similarity
ranked_lawyers = np.argsort(similarities)[::-1]

# Display the top recommendations
print("\nTop Recommendations:")
for idx in ranked_lawyers:
    lawyer_id = documents.iloc[idx]["docno"]
    description = documents.iloc[idx]["text"]
    similarity = similarities[idx]
    print(f"Lawyer ID: {lawyer_id}, Similarity: {similarity:.4f}, Description: {description}")


  if not pt.started():
Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


status code : 200
status code : 200
status code : 200
status code : 200


[nltk_data] Downloading package punkt to C:\Users\Mohammad
[nltk_data]     Kher\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Mohammad
[nltk_data]     Kher\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Mohammad
[nltk_data]     Kher\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Mohammad
[nltk_data]     Kher\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Mohammad
[nltk_data]     Kher\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Mohammad
[nltk_data]     Kher\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



Top Recommendations:
Lawyer ID: 42, Similarity: 0.0000, Description: ممر حيي جواهرج باح باح star crimin year
Lawyer ID: 45, Similarity: 0.0000, Description: طريق عر ماجد شق رقم حفر باطن حفر باطن star year civil
Lawyer ID: 45, Similarity: 0.0000, Description: طريق عر ماجد شق رقم حفر باطن حفر باطن star year civil
Lawyer ID: 45, Similarity: 0.0000, Description: طريق عر ماجد شق رقم حفر باطن حفر باطن star year civil
Lawyer ID: 45, Similarity: 0.0000, Description: طريق عر ماجد شق رقم حفر باطن حفر باطن star year civil
Lawyer ID: 42, Similarity: 0.0000, Description: ممر حيي جواهرج باح باح star year civil
Lawyer ID: 42, Similarity: 0.0000, Description: ممر حيي جواهرج باح باح star year civil
Lawyer ID: 45, Similarity: 0.0000, Description: طريق عر ماجد شق رقم حفر باطن حفر باطن star year civil
Lawyer ID: 45, Similarity: 0.0000, Description: طريق عر ماجد شق رقم حفر باطن حفر باطن star year civil
Lawyer ID: 45, Similarity: 0.0000, Description: طريق عر ماجد شق رقم حفر باطن حفر باطن star year civil
La



In [28]:
import requests
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pyterrier as pt

# Start PyTerrier
if not pt.started():
    pt.init()

# Function to normalize Arabic text (like changing "إ" to "ا", etc.)
def normalize_arabic(text):
    text = re.sub(r"[إأآ]", "ا", text)  # Normalize Alef
    text = re.sub(r"ة", "ه", text)  # Normalize Ta Marbuta
    text = re.sub(r"ى", "ي", text)  # Normalize Ya
    return text

# Custom tokenizer for Arabic text that splits words appropriately
def arabic_tokenizer(text):
    return re.findall(r'\b[\wء-ي]+\b', text)  # Tokenize Arabic words

# Get data from the API (Lawyers, Rates, Agencies, Issues)
lawyer_response = requests.get("http://osamanaser806-32078.portmap.io:32078/api/v1/ai/lawyers")
print(f"status code : {lawyer_response.status_code}")
rate_response = requests.get("http://osamanaser806-32078.portmap.io:32078/api/v1/ai/rates")
print(f"status code : {rate_response.status_code}")
agency_response = requests.get("http://osamanaser806-32078.portmap.io:32078/api/v1/ai/agencies")
print(f"status code : {agency_response.status_code}")
issue_response = requests.get("http://osamanaser806-32078.portmap.io:32078/api/v1/ai/issues")
print(f"status code : {issue_response.status_code}")

# Parse the responses into DataFrames
lawyers_response = lawyer_response.json()
lawyers_pere = pd.DataFrame(lawyers_response['lawyers'])
lawyers = lawyers_pere.rename(columns={"id": "lawyer_id"}).drop(
    ["name", "email", "union_number", "affiliation_date", "phone", "rates", "avatar"], axis=1
)
# lawyers['years_of_experience'] = lawyers['years_of_experience'].apply(lambda x: f"{x}year")

rates_response = rate_response.json()
rates = pd.DataFrame(rates_response["rates"]).dropna()
rates.drop(["id"], axis=1, inplace=True)
# rates['rating'] = rates['rating'].apply(lambda x: f"{x}star")

agencies_response = agency_response.json()
agencies = pd.DataFrame(agencies_response["agencies"]).rename(columns={"id": "agency_id"})
agencies = agencies[["agency_id", "lawyer_id"]]

issues_response = issue_response.json()
issues = pd.DataFrame(issues_response["issues"]).drop(
    ["base_number", "record_number", "id", "start_date", "end_date", "status"], axis=1
)

# Merge data
lawyers_with_rates = lawyers.merge(rates, on="lawyer_id")
lawyers_with_rates = pd.merge(agencies, lawyers_with_rates, on=["lawyer_id"], how="inner")
lawyers_with_rates = pd.merge(issues, lawyers_with_rates, on="agency_id", how="inner")

# Create a text column to join all relevant information for each lawyer
lawyers_with_rates["Text"] = lawyers_with_rates[['court_name', 'address', 'union_branch', 'years_of_experience', 'rating', 'estimated_cost']].astype(str).agg(" ".join, axis=1)

# Normalize both user input and the lawyer text
user_input = "شرعية عائلية"
documents = lawyers_with_rates['Text'].apply(normalize_arabic).tolist()
user_input = normalize_arabic(user_input)

# Tokenize text properly
documents = [arabic_tokenizer(doc) for doc in documents]
user_input_tokens = arabic_tokenizer(user_input)

# Print tokenized documents and input for debugging
print("Tokenized Documents:", documents[:5])  # Show first 5 tokenized documents
print("Tokenized User Input:", user_input_tokens)

# Prepare data for TF-IDF vectorization
all_descriptions = [user_input] + [" ".join(doc) for doc in documents]

# Apply TF-IDF Vectorization to the combined texts (user input + documents)
vectorizer = TfidfVectorizer(analyzer='word', tokenizer=arabic_tokenizer, stop_words=None)
tfidf_matrix = vectorizer.fit_transform(all_descriptions)

# Print the shape of the resulting TF-IDF matrix for debugging
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

# Compute cosine similarity
user_vector = tfidf_matrix[0]  # First row is user input
lawyer_vectors = tfidf_matrix[1:]  # Remaining rows are lawyer descriptions

# Calculate cosine similarity
similarities = cosine_similarity(user_vector, lawyer_vectors).flatten()

# Rank lawyers based on similarity
ranked_lawyers = np.argsort(similarities)[::-1]

# Display the top recommendations
print("\nTop Recommendations:")
for idx in ranked_lawyers:
    lawyer_id = lawyers_with_rates.iloc[idx]["lawyer_id"]
    similarity = similarities[idx]
    print(f"Lawyer ID: {lawyer_id}, Similarity: {similarity:.4f}")

  if not pt.started():


status code : 200
status code : 200
status code : 200
status code : 200
Tokenized Documents: [['Criminal', '9304', 'ممر', 'يحيي', 'جواهرجي', 'الباحه', 'الباحه', '14year', '4star', '11672000'], ['Criminal', '9304', 'ممر', 'يحيي', 'جواهرجي', 'الباحه', 'الباحه', '14year', '4star', '11672000'], ['Criminal', '9304', 'ممر', 'يحيي', 'جواهرجي', 'الباحه', 'الباحه', '14year', '2star', '11672000'], ['Criminal', '9304', 'ممر', 'يحيي', 'جواهرجي', 'الباحه', 'الباحه', '14year', '5star', '11672000'], ['Criminal', '62', 'ممر', 'مطيع', 'الصقيه', 'ينبع', 'البحر', 'ينبع', 'البحر', '26year', '3star', '2765228']]
Tokenized User Input: ['شرعيه', 'عائليه']
TF-IDF Matrix Shape: (424, 351)

Top Recommendations:
Lawyer ID: 42, Similarity: 0.0000
Lawyer ID: 45, Similarity: 0.0000
Lawyer ID: 45, Similarity: 0.0000
Lawyer ID: 45, Similarity: 0.0000
Lawyer ID: 45, Similarity: 0.0000
Lawyer ID: 42, Similarity: 0.0000
Lawyer ID: 42, Similarity: 0.0000
Lawyer ID: 45, Similarity: 0.0000
Lawyer ID: 45, Similarity: 0.0000



Lawyer ID: 36, Similarity: 0.0000
Lawyer ID: 36, Similarity: 0.0000
Lawyer ID: 20, Similarity: 0.0000
Lawyer ID: 38, Similarity: 0.0000
Lawyer ID: 38, Similarity: 0.0000
Lawyer ID: 38, Similarity: 0.0000
Lawyer ID: 51, Similarity: 0.0000
Lawyer ID: 51, Similarity: 0.0000
Lawyer ID: 41, Similarity: 0.0000
Lawyer ID: 41, Similarity: 0.0000
Lawyer ID: 43, Similarity: 0.0000
Lawyer ID: 43, Similarity: 0.0000
Lawyer ID: 6, Similarity: 0.0000
Lawyer ID: 41, Similarity: 0.0000
Lawyer ID: 41, Similarity: 0.0000
Lawyer ID: 32, Similarity: 0.0000
Lawyer ID: 32, Similarity: 0.0000
Lawyer ID: 45, Similarity: 0.0000
Lawyer ID: 45, Similarity: 0.0000
Lawyer ID: 45, Similarity: 0.0000
Lawyer ID: 45, Similarity: 0.0000
Lawyer ID: 31, Similarity: 0.0000
Lawyer ID: 31, Similarity: 0.0000
Lawyer ID: 31, Similarity: 0.0000
Lawyer ID: 20, Similarity: 0.0000
Lawyer ID: 29, Similarity: 0.0000
Lawyer ID: 29, Similarity: 0.0000
Lawyer ID: 48, Similarity: 0.0000
Lawyer ID: 48, Similarity: 0.0000
Lawyer ID: 44, 

In [4]:
import requests
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pyterrier as pt

# Start PyTerrier
if not pt.started():
    pt.init()

# Function to remove numbers from the text
def remove_numbers(text):
    # Use regex to remove digits (0-9)
    return re.sub(r'\d+', '', text)

# Function to normalize Arabic text
def normalize_arabic(text):
    text = re.sub(r"[إأآ]", "ا", text)  # Normalize Alef
    text = re.sub(r"ة", "ه", text)  # Normalize Ta Marbuta
    text = re.sub(r"ى", "ي", text)  # Normalize Ya
    return text

# Custom tokenizer for Arabic text
def arabic_tokenizer(text):
    return re.findall(r'\b[\wء-ي]+\b', text)  # Tokenize Arabic words

# Get data from the API (Lawyers, Rates, Agencies, Issues)
lawyer_response = requests.get("http://osamanaser806-32078.portmap.io:32078/api/v1/ai/lawyers")
print(f"status code : {lawyer_response.status_code}")
rate_response = requests.get("http://osamanaser806-32078.portmap.io:32078/api/v1/ai/rates")
print(f"status code : {rate_response.status_code}")
agency_response = requests.get("http://osamanaser806-32078.portmap.io:32078/api/v1/ai/agencies")
print(f"status code : {agency_response.status_code}")
issue_response = requests.get("http://osamanaser806-32078.portmap.io:32078/api/v1/ai/issues")
print(f"status code : {issue_response.status_code}")

# Parse the responses into DataFrames
lawyers_response = lawyer_response.json()
lawyers_pere = pd.DataFrame(lawyers_response['lawyers'])
lawyers = lawyers_pere.rename(columns={"id": "lawyer_id"}).drop(
    ["name", "email", "union_number", "affiliation_date", "phone", "rates", "avatar"], axis=1
)
# lawyers['years_of_experience'] = lawyers['years_of_experience'].apply(lambda x: f"{x}year")

rates_response = rate_response.json()
rates = pd.DataFrame(rates_response["rates"]).dropna()
rates.drop(["id"], axis=1, inplace=True)
# rates['rating'] = rates['rating'].apply(lambda x: f"{x}star")

agencies_response = agency_response.json()
agencies = pd.DataFrame(agencies_response["agencies"]).rename(columns={"id": "agency_id"})
agencies = agencies[["agency_id", "lawyer_id"]]

issues_response = issue_response.json()
issues = pd.DataFrame(issues_response["issues"]).drop(
    ["base_number", "record_number", "id", "start_date", "end_date", "status"], axis=1
)

# Merge data
lawyers_with_rates = lawyers.merge(rates, on="lawyer_id")
lawyers_with_rates = pd.merge(agencies, lawyers_with_rates, on=["lawyer_id"], how="inner")
lawyers_with_rates = pd.merge(issues, lawyers_with_rates, on="agency_id", how="inner")

# Create a text column to join all relevant information for each lawyer
lawyers_with_rates["Text"] = lawyers_with_rates[['court_name', 'address', 'union_branch', 'years_of_experience', 'rating', 'estimated_cost']].astype(str).agg(" ".join, axis=1)

# Normalize and remove numbers from both user input and the lawyer text
user_input = "شرعية عائلية"
documents = lawyers_with_rates['Text'].apply(normalize_arabic).apply(remove_numbers).tolist()
user_input = normalize_arabic(user_input)
user_input = remove_numbers(user_input)

# Tokenize text properly
documents = [arabic_tokenizer(doc) for doc in documents]
user_input_tokens = arabic_tokenizer(user_input)

# Print tokenized documents and input for debugging
print("Tokenized Documents:", documents[:5])  # Show first 5 tokenized documents
print("Tokenized User Input:", user_input_tokens)

# Prepare data for TF-IDF vectorization
all_descriptions = [user_input] + [" ".join(doc) for doc in documents]

# Apply TF-IDF Vectorization to the combined texts (user input + documents)
vectorizer = TfidfVectorizer(analyzer='word', tokenizer=arabic_tokenizer, stop_words=None)
tfidf_matrix = vectorizer.fit_transform(all_descriptions)

# Print the shape of the resulting TF-IDF matrix for debugging
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

# Compute cosine similarity
user_vector = tfidf_matrix[0]  # First row is user input
lawyer_vectors = tfidf_matrix[1:]  # Remaining rows are lawyer descriptions

# Calculate cosine similarity
similarities = cosine_similarity(user_vector, lawyer_vectors).flatten()

# Rank lawyers based on similarity
ranked_lawyers = np.argsort(similarities)[::-1]

# Display the top recommendations
print("\nTop Recommendations:")
for idx in ranked_lawyers:
    lawyer_id = lawyers_with_rates.iloc[idx]["lawyer_id"]
    similarity = similarities[idx]
    print(f"Lawyer ID: {lawyer_id}, Similarity: {similarity:.4f}")


  if not pt.started():


ConnectTimeout: HTTPConnectionPool(host='osamanaser806-32078.portmap.io', port=32078): Max retries exceeded with url: /api/v1/ai/lawyers (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x0000023366963170>, 'Connection to osamanaser806-32078.portmap.io timed out. (connect timeout=None)'))