In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification


# A single identifier keeps the tokenizer and the model weights in sync.
NER_MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-mix-ner"

tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)

# Load a pre-trained NER pipeline for Arabic
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Sample Arabic text
text = "رام الله هي مدينة فلسطينية تقع في الضفة الغربية."

# Run the NER model
ner_results = ner_pipeline(text)

# Inspect the output structure (only possible when at least one entity was found)
if ner_results:
    first_result = ner_results[0]
    print("Keys in the returned entity dict:", first_result.keys())

# Print the recognized named entities
for entity in ner_results:
    entity_word = entity.get('word', 'N/A')
    # With an aggregation strategy the label is stored under 'entity_group';
    # 'entity' is only present on un-aggregated results, so it is kept as a
    # fallback in case the strategy is ever switched off.
    entity_label = entity.get('entity_group', entity.get('entity', 'N/A')).upper()
    entity_score = entity.get('score', 'N/A')
    print(f"Entity: {entity_word}, Label: {entity_label}, Score: {entity_score}")


In [None]:
import sqlite3
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from gensim import corpora
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Download stopwords if needed
nltk.download('stopwords')

# Connect to the SQLite database
conn = sqlite3.connect('articles.db')

try:
    # Extract the articles data into a Pandas DataFrame
    df = pd.read_sql_query("SELECT headline, published, category, content FROM articles", conn)
finally:
    # Close the database connection even when the query raises, so the
    # handle on 'articles.db' is never left open.
    conn.close()


# Function to convert and parse Arabic dates
# The pattern captures: (1) the word right before the year, used as the month
# name, (2) a four-digit year and (3) an "HH:MM" time followed by the Arabic
# AM/PM marker. It is compiled once because the parser runs for every row.
_ARABIC_DATE_PATTERN = re.compile(r'(\w+)\s+(\d{4})\s\.\sالساعة:\s(\d{2}:\d{2}\s[صم])')


def parse_arabic_date(date_str):
    """Parse an Arabic "<month> <year> . الساعة: HH:MM <ص|م>" date string.

    Args:
        date_str: Raw value of the 'published' column. Anything that is not a
            string (e.g. None/NaN coming from a NULL database cell) is
            treated as unparseable.

    Returns:
        The value produced by ``pd.to_datetime`` for the translated string
        (NaT when pandas cannot parse it), or None when the input is not a
        string, does not match the expected layout, or names a month that is
        missing from ``arabic_to_english_months``.

    Note:
        The pattern does not capture a day of the month, so only month, year
        and time of day take part in the parse.
    """
    if not isinstance(date_str, str):
        return None

    match = _ARABIC_DATE_PATTERN.search(date_str)
    if match is None:
        return None

    arabic_month, year, time = match.groups()
    # Translate the Arabic meridiem markers so '%p' can read them.
    time = time.replace('م', 'PM').replace('ص', 'AM')

    # NOTE(review): arabic_to_english_months is not defined in the visible part
    # of this file; it must be a dict mapping Arabic month names to English
    # ones and has to exist before this function is called - confirm.
    english_month = arabic_to_english_months.get(arabic_month)
    if english_month is None:
        # Unknown month name: report it as unparseable instead of formatting
        # the literal text "None" into the date string.
        return None

    english_date_str = f'{english_month} {year} {time}'

    try:
        return pd.to_datetime(
            english_date_str,
            format='%B %Y %I:%M %p',
            errors='coerce'
        )
    except (ValueError, TypeError) as e:
        print(f"Error parsing date: {e}")
        return None

# Parse every raw 'published' string with parse_arabic_date
parsed_published = df['published'].apply(parse_arabic_date)
df['published'] = parsed_published

# Discard, in place, the rows whose 'published' value is missing after parsing
df.dropna(subset=['published'], inplace=True)

# Clean and normalize text content
def clean_arabic_text(text):
    """Strip diacritics, punctuation and digits from *text*.

    Args:
        text: Raw article text. Non-string values (e.g. None/NaN from an
            empty database cell) are treated as empty text.

    Returns:
        The cleaned text with single spaces between words and no leading or
        trailing whitespace; an empty string for non-string input.
    """
    if not isinstance(text, str):
        return ''

    # Remove the Arabic diacritic marks in the range U+064B..U+0652
    text = re.sub(r'[\u064B-\u0652]', '', text)
    # Replace punctuation with a space rather than deleting it, so that words
    # separated only by punctuation ("a,b") are not glued into one token.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)  # Remove digits
    # Collapse the whitespace runs left behind by the substitutions above
    return re.sub(r'\s+', ' ', text).strip()

# Arabic stop-word set; filled on first use by _arabic_stop_words().
_ARABIC_STOP_WORDS = None


def _arabic_stop_words():
    """Return the NLTK Arabic stop words as a set, loading them only once."""
    global _ARABIC_STOP_WORDS
    if _ARABIC_STOP_WORDS is None:
        _ARABIC_STOP_WORDS = frozenset(stopwords.words('arabic'))
    return _ARABIC_STOP_WORDS


# Remove Arabic stop words
def remove_stopwords(text):
    """Drop every whitespace-separated word of *text* that is an Arabic stop word.

    The stop-word set is cached after the first call, because this function
    is applied to every row of the DataFrame and rebuilding the set each time
    is pure overhead.

    Args:
        text: Text whose words are separated by whitespace.

    Returns:
        The remaining words joined by single spaces.
    """
    stop_words = _arabic_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Setup NER model: tokenizer and weights come from the same checkpoint
ner_checkpoint = "CAMeL-Lab/bert-base-arabic-camelbert-mix-ner"
tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

def ner_preserve_entities(text, max_length=500):
    """Join the words of each recognized named entity with underscores.

    The text is cut into chunks on whitespace boundaries so that every chunk
    holds at most ``max_length`` word pieces (as counted by the module-level
    ``tokenizer``). Chunks are built from the original words rather than from
    detokenized word pieces, so no word is replaced by ``[UNK]``, no ``##``
    continuation marker leaks into the output and no word is cut in half at a
    chunk boundary.

    Args:
        text: Whitespace-separated text to run through ``ner_pipeline``.
        max_length: Maximum number of word pieces per chunk. A single word
            that is longer than this still gets a chunk of its own.

    Returns:
        The text with every entity span written as one underscore-joined
        token and everything else separated by single spaces.
    """
    # Split into chunks on word boundaries, budgeting by word-piece count
    chunks = []
    current_words = []
    current_len = 0
    for word in text.split():
        # Count a word as at least one piece so every word consumes budget.
        n_pieces = max(1, len(tokenizer.tokenize(word)))
        if current_words and current_len + n_pieces > max_length:
            chunks.append(' '.join(current_words))
            current_words = []
            current_len = 0
        current_words.append(word)
        current_len += n_pieces
    if current_words:
        chunks.append(' '.join(current_words))

    preserved_entities = []

    for chunk_text in chunks:
        last_pos = 0
        for entity in ner_pipeline(chunk_text):
            # assumes 'start'/'end' are character offsets into chunk_text
            # (requires a tokenizer that reports offsets) - TODO confirm
            entity_start, entity_end = entity['start'], entity['end']
            preserved_entities.append(chunk_text[last_pos:entity_start].strip())  # Add non-entity text
            # Take the surface form from the text itself instead of the
            # pipeline's detokenized 'word', then preserve it with underscores.
            entity_text = chunk_text[entity_start:entity_end].strip()
            preserved_entities.append(entity_text.replace(' ', '_'))
            last_pos = entity_end

        preserved_entities.append(chunk_text[last_pos:].strip())  # Add remaining text

    # Empty fragments (e.g. between adjacent entities) are dropped by filter()
    return ' '.join(filter(None, preserved_entities)).strip()

# Process text to clean, remove stop words, and apply NER
def process_text(text):
    """Run *text* through the cleaning, stop-word and NER steps, in that order.

    Each step receives the output of the previous one; the result of the
    last step is returned.
    """
    for step in (clean_arabic_text, remove_stopwords, ner_preserve_entities):
        text = step(text)
    return text

# Run the full text-processing chain over every article body
processed_content = df['content'].apply(process_text)
df['content'] = processed_content

# Split each processed article into its whitespace-separated tokens
texts = []
for content in df['content']:
    texts.append(content.split())

# Build the gensim dictionary, then the bag-of-words corpus for LDA
dictionary = corpora.Dictionary(texts)
corpus = []
for doc_tokens in texts:
    corpus.append(dictionary.doc2bow(doc_tokens))

# 'corpus' and 'dictionary' can now be handed to an LDA model
