In [None]:
import sqlite3
import pandas as pd

# Connect to the SQLite database and pull the articles into a DataFrame.
# The connection is closed in a `finally` clause so it is released even when
# the query raises (for example when the `articles` table does not exist).
conn = sqlite3.connect('articles.db')
try:
    df = pd.read_sql_query("SELECT headline, published, category, content FROM articles", conn)
finally:
    conn.close()

# Inspect the first few rows of the DataFrame
print(df.head())


In [None]:
import re

def clean_arabic_text(text, normalize=False):
    """Strip diacritics, punctuation, digits and surplus whitespace from text.

    Args:
        text: Raw text. Any value that is not a ``str`` (such as ``None`` or
            ``NaN``) is treated as empty text instead of raising ``TypeError``.
        normalize: When true, additionally fold common Arabic letter variants
            before cleaning: alef with madda/hamza -> bare alef, alef maksura
            -> yeh, waw with hamza -> waw, yeh with hamza -> yeh, teh marbuta
            -> heh. Defaults to ``False``, which leaves letters untouched.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    if normalize:
        letter_folds = {
            '\u0622': '\u0627',  # alef with madda above -> alef
            '\u0623': '\u0627',  # alef with hamza above -> alef
            '\u0625': '\u0627',  # alef with hamza below -> alef
            '\u0649': '\u064A',  # alef maksura -> yeh
            '\u0624': '\u0648',  # waw with hamza -> waw
            '\u0626': '\u064A',  # yeh with hamza -> yeh
            '\u0629': '\u0647',  # teh marbuta -> heh
        }
        text = text.translate(str.maketrans(letter_folds))

    # Remove Arabic diacritics (fathatan .. sukun)
    text = re.sub(r'[\u064B-\u0652]', '', text)

    # Remove punctuation (anything that is neither a word character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits
    text = re.sub(r'\d+', '', text)

    # Collapse runs of whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean every article body element-wise, then preview the frame
df['content'] = df['content'].map(clean_arabic_text)
print(df.head())

In [None]:
# Arabic month names paired with their English equivalents, in calendar order
arabic_to_english_months = dict([
    ('يناير', 'January'),
    ('فبراير', 'February'),
    ('مارس', 'March'),
    ('أبريل', 'April'),
    ('مايو', 'May'),
    ('يونيو', 'June'),
    ('يوليو', 'July'),
    ('أغسطس', 'August'),
    ('سبتمبر', 'September'),
    ('أكتوبر', 'October'),
    ('نوفمبر', 'November'),
    ('ديسمبر', 'December'),
])
import pandas as pd
import re

def parse_arabic_date(date_str, month_map=None):
    """Parse an Arabic "[day] <month> <year> . الساعة: <hh:mm> <ص|م>" date.

    Args:
        date_str: The raw date text. A ``pd.Timestamp`` is returned unchanged
            so that applying this function to an already-parsed column is a
            no-op; any other non-string value (``None``, ``NaN``) yields
            ``None`` instead of raising ``TypeError``.
        month_map: Optional mapping of Arabic month names to English month
            names. Defaults to the module-level ``arabic_to_english_months``.

    Returns:
        A ``pd.Timestamp``, or ``None`` whenever the text does not match the
        expected layout, names an unknown month, or holds an invalid
        date/time. When no day-of-month precedes the month name the first of
        the month is used.
    """
    if isinstance(date_str, pd.Timestamp):
        return date_str
    if not isinstance(date_str, str):
        return None

    if month_map is None:
        month_map = arabic_to_english_months

    # Optional day (not glued to a longer number), month word, 4-digit year,
    # then the clock time followed by the Arabic AM/PM marker.
    date_pattern = re.compile(
        r'(?:(?<!\d)(\d{1,2})\s+)?(\w+)\s+(\d{4})\s\.\sالساعة:\s(\d{1,2}:\d{2}\s[صم])'
    )
    match = date_pattern.search(date_str)
    if match is None:
        return None

    day, arabic_month, year, time = match.groups()

    # Unknown month names are rejected here rather than being formatted as
    # the literal text "None" and left for the parser to choke on.
    english_month = month_map.get(arabic_month)
    if english_month is None:
        return None

    time = time.replace('م', 'PM').replace('ص', 'AM')  # Convert Arabic AM/PM indicators
    english_date_str = f'{day or "1"} {english_month} {year} {time}'

    try:
        parsed = pd.to_datetime(
            english_date_str,
            format='%d %B %Y %I:%M %p',
            errors='coerce'
        )
    except Exception as e:
        print(f"Error parsing date: {e}")
        return None

    # errors='coerce' yields NaT for impossible dates/times; report those the
    # same way as every other failure.
    return None if pd.isna(parsed) else parsed

# Convert each raw 'published' string into a timestamp
df['published'] = df['published'].map(parse_arabic_date)

# Discard, in place, every row whose date could not be parsed (optional)
df.dropna(axis=0, how='any', subset=['published'], inplace=True)


In [None]:
import sqlite3
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from gensim import corpora, models
import matplotlib.pyplot as plt
import seaborn as sns

# Download stopwords if needed
nltk.download('stopwords')

# Connect to the SQLite database and pull the articles into a DataFrame.
# The connection is closed in a `finally` clause so it is released even when
# the query raises (for example when the `articles` table does not exist).
conn = sqlite3.connect('articles.db')
try:
    df = pd.read_sql_query("SELECT headline, published, category, content FROM articles", conn)
finally:
    conn.close()

# Lookup table from Arabic month names to English ones, in calendar order
arabic_to_english_months = dict(zip(
    (
        'يناير', 'فبراير', 'مارس', 'أبريل', 'مايو', 'يونيو',
        'يوليو', 'أغسطس', 'سبتمبر', 'أكتوبر', 'نوفمبر', 'ديسمبر',
    ),
    (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ),
))

# Function to convert and parse Arabic dates
def parse_arabic_date(date_str, month_map=None):
    """Parse an Arabic "[day] <month> <year> . الساعة: <hh:mm> <ص|م>" date.

    Args:
        date_str: The raw date text. A ``pd.Timestamp`` is returned unchanged
            so that applying this function to an already-parsed column is a
            no-op; any other non-string value (``None``, ``NaN``) yields
            ``None`` instead of raising ``TypeError``.
        month_map: Optional mapping of Arabic month names to English month
            names. Defaults to the module-level ``arabic_to_english_months``.

    Returns:
        A ``pd.Timestamp``, or ``None`` whenever the text does not match the
        expected layout, names an unknown month, or holds an invalid
        date/time. When no day-of-month precedes the month name the first of
        the month is used.
    """
    if isinstance(date_str, pd.Timestamp):
        return date_str
    if not isinstance(date_str, str):
        return None

    if month_map is None:
        month_map = arabic_to_english_months

    # Optional day (not glued to a longer number), month word, 4-digit year,
    # then the clock time followed by the Arabic AM/PM marker.
    date_pattern = re.compile(
        r'(?:(?<!\d)(\d{1,2})\s+)?(\w+)\s+(\d{4})\s\.\sالساعة:\s(\d{1,2}:\d{2}\s[صم])'
    )
    match = date_pattern.search(date_str)
    if match is None:
        return None

    day, arabic_month, year, time = match.groups()

    # Unknown month names are rejected here rather than being formatted as
    # the literal text "None" and left for the parser to choke on.
    english_month = month_map.get(arabic_month)
    if english_month is None:
        return None

    time = time.replace('م', 'PM').replace('ص', 'AM')
    english_date_str = f'{day or "1"} {english_month} {year} {time}'

    try:
        parsed = pd.to_datetime(
            english_date_str,
            format='%d %B %Y %I:%M %p',
            errors='coerce'
        )
    except Exception as e:
        print(f"Error parsing date: {e}")
        return None

    # errors='coerce' yields NaT for impossible dates/times; report those the
    # same way as every other failure.
    return None if pd.isna(parsed) else parsed

# Turn each raw 'published' string into a timestamp
df['published'] = df['published'].map(parse_arabic_date)

# Discard, in place, the rows whose 'published' value failed to parse
df.dropna(axis=0, how='any', subset=['published'], inplace=True)

# Clean and normalize text content
def clean_arabic_text(text, normalize=False):
    """Strip diacritics, punctuation, digits and surplus whitespace from text.

    Args:
        text: Raw text. Any value that is not a ``str`` (such as ``None`` or
            ``NaN``) is treated as empty text instead of raising ``TypeError``.
        normalize: When true, additionally fold common Arabic letter variants
            before cleaning: alef with madda/hamza -> bare alef, alef maksura
            -> yeh, waw with hamza -> waw, yeh with hamza -> yeh, teh marbuta
            -> heh. Defaults to ``False``, which leaves letters untouched.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    if normalize:
        letter_folds = {
            '\u0622': '\u0627',  # alef with madda above -> alef
            '\u0623': '\u0627',  # alef with hamza above -> alef
            '\u0625': '\u0627',  # alef with hamza below -> alef
            '\u0649': '\u064A',  # alef maksura -> yeh
            '\u0624': '\u0648',  # waw with hamza -> waw
            '\u0626': '\u064A',  # yeh with hamza -> yeh
            '\u0629': '\u0647',  # teh marbuta -> heh
        }
        text = text.translate(str.maketrans(letter_folds))

    text = re.sub(r'[\u064B-\u0652]', '', text)  # Remove Arabic diacritics
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace

    return text

# Run the text cleaner over every article body
df['content'] = df['content'].map(clean_arabic_text)

# Remove Arabic stop words
def remove_stopwords(text, stop_words=None):
    """Drop stop words from whitespace-separated text.

    Args:
        text: The text to filter; it is split on whitespace.
        stop_words: Optional collection of words to drop (a ``set`` or
            ``frozenset`` gives the fastest lookups). When omitted, NLTK's
            Arabic stop-word list is used; it is loaded on the first call and
            reused afterwards instead of being re-read for every row.

    Returns:
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        # Cache the default list on the function object so the corpus is read
        # and hashed once, not once per DataFrame row.
        stop_words = getattr(remove_stopwords, '_arabic_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('arabic'))
            remove_stopwords._arabic_stop_words = stop_words

    return ' '.join(word for word in text.split() if word not in stop_words)

df['content'] = df['content'].apply(remove_stopwords)

# Tokenize and prepare for topic modeling
texts = [content.split() for content in df['content']]

# Create dictionary and corpus for LDA
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Perform topic modeling. LDA training is randomized, so the seed is pinned
# to make the discovered topics reproducible from one run to the next.
lda_model = models.LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Print discovered topics
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}: {topic}")

# Plot sentiment distribution (dummy sentiments for demonstration).
# Every row is assigned 0 here, so the column is constant until a real
# sentiment analysis tool replaces the placeholder.
df['sentiment'] = df['content'].apply(lambda x: 0)  # Placeholder, customize with a proper sentiment analysis tool

sns.histplot(df['sentiment'], kde=True)
plt.title('Sentiment Distribution of Articles')
plt.show()
