In [2]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import os

# Ensure matplotlib is set to use a non-interactive backend for saving files
plt.switch_backend('Agg')

# --- Configuration ---
# Updated to load directly from the Hugging Face Hub
DATASET_PATH = "hamzabouajila/tunisian-derja-unified-raw-corpus"
OUTPUT_DIR = "analysis_output"
TEXT_COLUMN_NAME = "text"

# --- Main Analysis Functions ---

def load_dataset_for_analysis():
    """
    Loads the dataset directly from the Hugging Face Hub.
    Returns:
        A Hugging Face Dataset object or None if loading fails.
    """
    print(f"Loading dataset from Hugging Face: '{DATASET_PATH}'...")
    try:
        # Load the 'train' split of the dataset
        return load_dataset(DATASET_PATH, split='train')
    except Exception as e:
        print(f"Failed to load dataset from Hugging Face: {e}")
        return None

def analyze_dataset(dataset):
    """
    Performs a detailed analysis of the dataset.
    Args:
        dataset: The Hugging Face Dataset object.
    Returns:
        A dictionary of analysis insights.
    """
    print("\n--- Starting Dataset Analysis ---")
    df = dataset.to_pandas()
    
    # 1. Basic statistics
    total_rows = len(df)
    unique_rows = df[TEXT_COLUMN_NAME].nunique()
    null_values = df[TEXT_COLUMN_NAME].isnull().sum()
    empty_strings = (df[TEXT_COLUMN_NAME].astype(str).str.strip() == '').sum()
    
    print(f"Total entries: {total_rows}")
    print(f"Unique entries: {unique_rows}")
    print(f"Entries with Null values: {null_values}")
    print(f"Entries with empty strings: {empty_strings}")
    
    # 2. Text length analysis
    df['text_length'] = df[TEXT_COLUMN_NAME].astype(str).apply(len)
    df['word_count'] = df[TEXT_COLUMN_NAME].astype(str).apply(lambda x: len(re.findall(r'\b\w+\b', x)))
    
    avg_length = df['text_length'].mean()
    min_length = df['text_length'].min()
    max_length = df['text_length'].max()
    median_length = df['text_length'].median()
    
    avg_words = df['word_count'].mean()
    min_words = df['word_count'].min()
    max_words = df['word_count'].max()
    
    print("\n--- Text Length Insights ---")
    print(f"Average character length: {avg_length:.2f}")
    print(f"Minimum character length: {min_length}")
    print(f"Maximum character length: {max_length}")
    print(f"Median character length: {median_length}")
    print(f"Average word count: {avg_words:.2f}")
    print(f"Minimum word count: {min_words}")
    print(f"Maximum word count: {max_words}")
    
    # 3. Most common words (after cleaning)
    print("\n--- Most Common Words ---")
    all_text = " ".join(df[TEXT_COLUMN_NAME].astype(str).tolist())
    # Simple tokenization and cleaning: remove non-alphabetic characters
    words = re.findall(r'[\u0600-\u06FF\w\']+', all_text.lower())
    word_counts = Counter(words)
    most_common_words = word_counts.most_common(20)
    for word, count in most_common_words:
        print(f"'{word}': {count}")
    
    # 4. Save insights to a file
    insights = {
        'total_rows': total_rows,
        'unique_rows': unique_rows,
        'null_values': null_values,
        'empty_strings': empty_strings,
        'avg_length': avg_length,
        'min_length': min_length,
        'max_length': max_length,
        'median_length': median_length,
        'avg_words': avg_words,
        'min_words': min_words,
        'max_words': max_words,
        'most_common_words': most_common_words
    }
    return df, insights

def create_and_save_visualizations(df):
    """
    Creates and saves visualizations of the dataset.
    Args:
        df: The pandas DataFrame of the dataset.
    """
    print("\n--- Generating Visualizations ---")
    
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        
    # Plot 1: Histogram of text lengths
    plt.figure(figsize=(10, 6))
    sns.histplot(df['text_length'], bins=50, kde=True, color='skyblue')
    plt.title('Distribution of Text Length (Characters)')
    plt.xlabel('Character Length')
    plt.ylabel('Frequency')
    plt.savefig(os.path.join(OUTPUT_DIR, 'text_length_histogram.png'))
    plt.show() # Added to display plot in Jupyter
    plt.close()
    print(f"Saved histogram to {os.path.join(OUTPUT_DIR, 'text_length_histogram.png')}")
    
    # Plot 2: Bar chart of top 20 most common words
    all_text = " ".join(df[TEXT_COLUMN_NAME].astype(str).tolist())
    words = re.findall(r'[\u0600-\u06FF\w\']+', all_text.lower())
    word_counts = Counter(words)
    top_words_df = pd.DataFrame(word_counts.most_common(20), columns=['word', 'count'])
    
    plt.figure(figsize=(12, 8))
    sns.barplot(x='count', y='word', data=top_words_df, palette='viridis')
    plt.title('Top 20 Most Frequent Words')
    plt.xlabel('Frequency')
    plt.ylabel('Word')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'top_words_barchart.png'))
    plt.show() # Added to display plot in Jupyter
    plt.close()
    print(f"Saved bar chart to {os.path.join(OUTPUT_DIR, 'top_words_barchart.png')}")


In [4]:
dataset_hf = load_dataset_for_analysis()

if dataset_hf:
    df_analysis, insights = analyze_dataset(dataset_hf)
    create_and_save_visualizations(df_analysis)
    print(f"\nAnalysis complete. Check the '{OUTPUT_DIR}' folder for insights and plots!")


Loading dataset from Hugging Face: 'hamzabouajila/tunisian-derja-unified-raw-corpus'...


README.md:   0%|          | 0.00/331 [00:00<?, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/173M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/172M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/802659 [00:00<?, ? examples/s]


--- Starting Dataset Analysis ---
Total entries: 802659
Unique entries: 802658
Entries with Null values: 1
Entries with empty strings: 1

--- Text Length Insights ---
Average character length: 474.02
Minimum character length: 0
Maximum character length: 138325
Median character length: 62.0
Average word count: 81.62
Minimum word count: 0
Maximum word count: 25546

--- Most Common Words ---
'في': 1772354
'من': 1301867
'على': 879769
'و': 646063
'إلى': 386569
'أن': 372829
'ما': 305363
'عن': 292773
'التي': 272821
'مع': 228677
'تونس': 218745
'لا': 211174
'هذا': 188284
'،': 177059
'الذي': 159266
'بعد': 155041
'اليوم': 151762
'هذه': 151142
'بين': 149108
'الى': 144039

--- Generating Visualizations ---


  plt.show() # Added to display plot in Jupyter


Saved histogram to analysis_output/text_length_histogram.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='count', y='word', data=top_words_df, palette='viridis')
  plt.show() # Added to display plot in Jupyter


Saved bar chart to analysis_output/top_words_barchart.png

Analysis complete. Check the 'analysis_output' folder for insights and plots!
