In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.isri import ISRIStemmer
import arabicstopwords.arabicstopwords as stp
import re

In [2]:
# Read the paragraph dataset into a DataFrame.
# Point dataset_path at your own CSV file; later cells read its 'text' and
# 'score' columns.
dataset_path = 'paragraphs.csv'
df = pd.read_csv(dataset_path)

In [3]:
# Download the tokenizer data that word_tokenize relies on.
# 'punkt' is the resource older NLTK releases load; newer releases look up
# 'punkt_tab' instead and raise LookupError without it, so fetch both to keep
# the notebook runnable from a fresh kernel on either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Shared resources for the preprocessing function: one ISRI stemmer instance
# and the Arabic stop-word list held as a set, which preprocess_text uses for
# membership tests.
stemmer = ISRIStemmer()
stop_words = {*stp.stopwords_list()}

In [5]:
# Matches a run of code points in U+0621-U+064A (Arabic letters).
# Compiled once here rather than on every preprocess_text call.
_ARABIC_START_RE = re.compile(r'[\u0621-\u064A]+')


def preprocess_text(text):
    """Tokenize, filter, and stem one paragraph of Arabic text.

    Steps, in order:
      1. Split ``text`` into tokens with ``word_tokenize``.
      2. Drop tokens found in the module-level ``stop_words`` set.
      3. Keep only tokens that *start* with a character in the range
         U+0621-U+064A, which discards tokens beginning with Latin letters,
         digits, or punctuation.
      4. Stem each remaining token with the module-level ``stemmer``.

    Args:
        text: The raw paragraph. Anything that is not a ``str`` (for example
            the ``NaN`` pandas produces for an empty CSV cell, or ``None``)
            is treated as an empty paragraph instead of being handed to the
            tokenizer.

    Returns:
        str: The stemmed tokens joined by single spaces; ``''`` when no token
        survives the filters or when ``text`` is not a string.
    """
    # The tokenizer only accepts strings; a single missing value would
    # otherwise abort the whole Series.apply with a TypeError.
    if not isinstance(text, str):
        return ''

    kept_tokens = [
        token
        for token in word_tokenize(text)
        # match() anchors at the start only, so the Arabic test is on the
        # token's leading character(s), not the whole token.
        if token not in stop_words and _ARABIC_START_RE.match(token)
    ]
    return ' '.join(stemmer.stem(token) for token in kept_tokens)


In [6]:
# Run every paragraph in the 'text' column through preprocess_text and store
# the result alongside the original.
cleaned_text = df['text'].apply(preprocess_text)
df['preprocessed_text'] = cleaned_text

# Optional: bucket the scores into 10 bins; labels=False stores the bin
# index rather than an interval label.
score_bins = pd.cut(df['score'], bins=10, labels=False)
df['discretized_score'] = score_bins

# Write the augmented frame to disk without the index column.
df.to_csv('preprocessed_dataset.csv', index=False)