In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Web Scraping

In [None]:
!pip install requests beautifulsoup4 pandas


# Scraping the Articles

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL of Arabic Wikipedia; an article title is appended to it.
URL = "https://ar.wikipedia.org/wiki/"

# Titles of the articles to fetch.
articles = [
    "الخيمياء والكيمياء في عصر الحضارة الإسلامية",
    "تدمير البيئة",
    "محرك نفاث",
    "السياحة في السعودية",
    "خط حديد الحجاز",
    "تقطيع (شبكات)",
    "ويندوز فيستا",
]

def scrape_wikipedia(article, timeout=10):
    """
    Fetch one Arabic Wikipedia article and return the text of its <p> elements.

    Parameters:
        article (str): Article title; it is appended to the module-level URL.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str: The paragraph texts joined by single spaces, or "" when the
        request fails or the response status is not 200.
    """
    try:
        # Without a timeout, requests can block indefinitely on a stalled connection.
        response = requests.get(URL + article, timeout=timeout)
    except requests.RequestException as exc:
        # A network-level failure for one article should not abort the caller's loop.
        print(f"Failed to fetch {article}: {exc}")
        return ""

    if response.status_code != 200:
        print(f"Failed to fetch {article}")
        return ""

    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the text of every <p> element and merge it into one string.
    return " ".join(p.get_text() for p in soup.find_all('p'))

# Scrape every requested article and collect one record per title.
data = []
for article in articles:
    print(f"Scraping: {article}")
    text = scrape_wikipedia(article)
    data.append(dict(Article=article, Content=text))

# Build a DataFrame from the collected records and preview its first rows.
df = pd.DataFrame(data)
print(df.head())

# Persist the scraped text as CSV ('utf-8-sig' prepends a byte-order mark).
df.to_csv("arabic_wikipedia_articles.csv", encoding='utf-8-sig', index=False)


In [None]:
!pip install openai pandas tqdm

In [None]:
!pip install --upgrade openai


In [None]:
import os
from openai import OpenAI
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# OpenAI API setup: take the key from the OPENAI_API_KEY environment variable
# instead of pasting it into the notebook, so it is never saved or shared with the file.
# Set OPENAI_API_KEY before running this cell.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


In [None]:
import pandas as pd
import openai

# Read the scraped articles back from disk into a DataFrame.
file_path = "/kaggle/working/arabic_wikipedia_articles.csv"  # adjust if the CSV lives elsewhere
df = pd.read_csv(filepath_or_buffer=file_path)


In [None]:
# Show the DataFrame as the cell's output (bare last expression).
df

In [None]:
def generate_question_answer(content, article_title, model="gpt-4", max_tokens=200):
    """
    Generate one Arabic question/answer pair from a piece of article text.

    Parameters:
        content (str): Text the question should be based on.
        article_title (str): Title of the source article, quoted in the prompt.
        model (str): Chat model name passed to the API.
        max_tokens (int): Upper bound on the number of generated tokens.

    Returns:
        tuple: ``(question, answer)`` as strings, or ``(None, None)`` when the
        content is empty or not a string, the API call fails, or the reply
        does not contain both a question and an answer.
    """
    # Missing content (e.g. NaN read back from an empty CSV cell) would
    # otherwise be formatted into the prompt and sent to the API.
    if not isinstance(content, str) or not content.strip():
        return None, None

    prompt = (
        f"قم بإنشاء سؤال وإجابة باللغة العربية استنادًا إلى النص التالي من المقال بعنوان '{article_title}':\n\n"
        f"{content}\n\n"
        "يرجى تقديم النتيجة بهذا التنسيق:\n"
        "السؤال: <السؤال المولد>\n"
        "الإجابة: <الإجابة المولدة>"
    )

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=0.5
        )
        output = (response.choices[0].message.content or "").strip()
    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f"Error generating Q&A: {e}")
        return None, None

    # Split on the answer label rather than on the first two lines, so a blank
    # line between the parts or a multi-line answer is still parsed correctly.
    question_part, label, answer_part = output.partition("الإجابة:")
    question = question_part.replace("السؤال:", "", 1).strip()
    answer = answer_part.strip()
    if not label or not question or not answer:
        print("Error generating Q&A: response did not match the expected format")
        return None, None
    return question, answer


In [None]:
import pandas as pd
# Function to chunk long content into smaller parts
def chunk_text(text, chunk_size=1500):
    """
    Split a text into consecutive chunks of at most ``chunk_size`` characters.

    Parameters:
        text (str): The input text to split.
        chunk_size (int): The maximum size of each chunk in characters.

    Returns:
        list: The chunks in order. Empty when ``text`` is empty or is not a
        string (e.g. None, or NaN read back from an empty CSV cell).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A zero step makes range() raise and a negative one silently yields
        # no chunks; reject both explicitly.
        raise ValueError("chunk_size must be a positive integer")
    if not isinstance(text, str):
        return []
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

# Function to generate questions and answers
def generate_question_answer(content, article_title, model="gpt-4", max_tokens=200):
    """
    Generate one Arabic question/answer pair from a piece of article text.

    Parameters:
        content (str): Text the question should be based on.
        article_title (str): Title of the source article, quoted in the prompt.
        model (str): Chat model name passed to the API.
        max_tokens (int): Upper bound on the number of generated tokens.

    Returns:
        tuple: ``(question, answer)`` as strings, or ``(None, None)`` when the
        content is empty or not a string, the API call fails, or the reply
        does not contain both a question and an answer.
    """
    # Missing content (e.g. NaN read back from an empty CSV cell) would
    # otherwise be formatted into the prompt and sent to the API.
    if not isinstance(content, str) or not content.strip():
        return None, None

    prompt = (
        f"قم بإنشاء سؤال وإجابة باللغة العربية استنادًا إلى النص التالي من المقال بعنوان '{article_title}':\n\n"
        f"{content}\n\n"
        "يرجى تقديم النتيجة بهذا التنسيق:\n"
        "السؤال: <السؤال المولد>\n"
        "الإجابة: <الإجابة المولدة>"
    )

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=0.5
        )
        output = (response.choices[0].message.content or "").strip()
    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f"Error generating Q&A: {e}")
        return None, None

    # Split on the answer label rather than on the first two lines, so a blank
    # line between the parts or a multi-line answer is still parsed correctly.
    question_part, label, answer_part = output.partition("الإجابة:")
    question = question_part.replace("السؤال:", "", 1).strip()
    answer = answer_part.strip()
    if not label or not question or not answer:
        print("Error generating Q&A: response did not match the expected format")
        return None, None
    return question, answer

# Load the input data
input_file = "/kaggle/working/arabic_wikipedia_articles.csv"  # Path to your input file
output_file = "generated_questions_answers.csv"

try:
    # Read the input CSV file (expecting 'Article' and 'Content' columns)
    df = pd.read_csv(input_file)

    # One dict per successfully generated question/answer pair
    results = []

    # Process each article
    print("Generating questions and answers...")
    for _, row in df.iterrows():
        article_title = row['Article']  # Article title
        content = row['Content']       # Article content

        # An empty CSV cell is read back by pandas as NaN (a float). Chunking
        # it would raise and abort the whole run before anything is saved,
        # so articles without text are skipped instead.
        if not isinstance(content, str) or not content.strip():
            print(f"Skipping article with no content: {article_title}")
            continue

        # Split the content into smaller chunks
        chunks = chunk_text(content, chunk_size=1500)

        # Generate Q&A for each chunk
        for idx, chunk in enumerate(chunks):
            print(f"Processing chunk {idx + 1} for article: {article_title}")
            question, answer = generate_question_answer(chunk, article_title)

            # Keep only chunks for which both parts were produced
            if question and answer:
                results.append({
                    "article": article_title,
                    "chunk_number": idx + 1,
                    "chunk_content": chunk,
                    "question": question,
                    "answer": answer
                })

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)

    # Save the generated Q&A to a new CSV file
    results_df.to_csv(output_file, index=False, encoding="utf-8-sig")

    print(f"Questions and answers saved to: {output_file}")
except Exception as e:
    # Best effort: report the problem instead of raising out of the cell
    print(f"Error processing the file: {e}")
