In [None]:
#default testing commit
# Import required libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add project root to Python path.
# `__file__` is only defined when this code runs as a script/module; inside a
# Jupyter/IPython kernel it is missing and would raise NameError. In that case
# fall back to the current working directory as the base directory.
try:
    _base_dir = os.path.dirname(__file__)
except NameError:
    _base_dir = os.getcwd()

# The project root is taken to be the parent of the base directory.
project_root = os.path.abspath(os.path.join(_base_dir, '..'))
sys.path.insert(0, project_root)

from src.data_processor import DocumentationDataProcessor

# Build the training DataFrame via the project's data processor. The metric
# code below reads its 'text' and 'summary' columns.
data_processor = DocumentationDataProcessor()
df = data_processor.prepare_training_data()

# Per-row length of the source text and of its summary.
df['text_length'] = df['text'].str.len()
df['summary_length'] = df['summary'].str.len()

# Summary length relative to source length. A zero-length text would make the
# division produce inf (or NaN for 0/0), so zero lengths are mapped to NaN up
# front; such rows then get a NaN ratio instead of a non-finite value that
# would distort the downstream histogram and correlation.
df['compression_ratio'] = (
    df['summary_length'] / df['text_length'].replace(0, np.nan)
)

# Descriptive Statistics
print("Dataset Overview:")
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called on its own; wrapping it in print() would emit a stray "None".
df.info()

# Summary statistics (count, mean, quartiles, ...) for both length columns.
print("\nText Length Statistics:")
print(df['text_length'].describe())

print("\nSummary Length Statistics:")
print(df['summary_length'].describe())

# Visualization: one figure with three side-by-side histograms (with KDE
# overlay), one per metric column.
plt.figure(figsize=(15, 5))

# (column to plot, panel title, x-axis label), in left-to-right order.
_panels = (
    ('text_length', 'Text Length Distribution', 'Text Length'),
    ('summary_length', 'Summary Length Distribution', 'Summary Length'),
    ('compression_ratio', 'Compression Ratio Distribution', 'Compression Ratio'),
)

for _slot, (_column, _title, _xlabel) in enumerate(_panels, start=1):
    # Select the next cell of the 1x3 grid; histplot draws on the current axes.
    plt.subplot(1, 3, _slot)
    sns.histplot(df[_column], kde=True)
    plt.title(_title)
    plt.xlabel(_xlabel)

plt.tight_layout()
plt.show()

# Correlation Analysis: pairwise correlation between the three derived
# metric columns.
_metric_columns = ['text_length', 'summary_length', 'compression_ratio']
correlation_matrix = df[_metric_columns].corr()

# Header and matrix on separate lines, as two consecutive prints would give.
print("\nCorrelation Matrix:", correlation_matrix, sep="\n")

# Data Exploration Notebook

## Overview
This notebook provides a comprehensive exploration of the documentation dataset used for the Summarization AI project.

## Key Analysis Objectives
- Understand dataset characteristics
- Visualize text and summary distributions
- Identify key patterns in documentation