In [None]:

!pip install pandas numpy matplotlib seaborn scikit-learn
!pip install spacy PyPDF2 python-docx
!python -m spacy download en_core_web_sm

In [None]:
import json
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings

# Notebook-wide display settings: silence warnings, then apply the plot theme
# and the default figure size used by every chart below.
warnings.filterwarnings(action='ignore')
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

print("✓ All libraries imported successfully!")

In [None]:
# Location of the annotated resume dataset, relative to the working directory.
DATA_PATH = os.path.join('data', 'train.json')

# Fail fast when the dataset is missing: every later cell needs `resumes_data`,
# so merely printing a message would only defer the failure to a confusing
# NameError in the next cell.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(
        f"{DATA_PATH} not found! Expected it relative to {os.getcwd()}."
    )

# The file is parsed as a single JSON document.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    resumes_data = json.load(f)

print("✓ Dataset loaded successfully!")
print(f"\nTotal resumes: {len(resumes_data)}")

# Only peek at the first record when there is one; indexing an empty dataset
# would raise instead of reporting that there is nothing to show.
if resumes_data:
    print("\nSample structure of first resume:")
    print(f"Keys: {list(resumes_data[0].keys())}")
else:
    print("\nDataset is empty - no sample record to preview.")

In [None]:
# Preview the first record: the start of its text, then its first few labelled spans.
sample_resume = resumes_data[0]
divider = "=" * 70

print("\n".join([divider, "SAMPLE RESUME TEXT (First 500 characters)", divider]))
print(sample_resume['text'][:500])
print("\n...")

print("\n" + divider)
print("\n".join(["SAMPLE ANNOTATIONS (First 5)", divider]))
for ann in sample_resume['annotations'][:5]:
    # Annotations with fewer than three items cannot be unpacked; skip them.
    if len(ann) < 3:
        continue
    start, end, label = ann[0], ann[1], ann[2]
    span_text = sample_resume['text'][start:end]
    print(f"Label: {label:15s} | Text: {span_text}")

In [None]:
# Accumulators filled in a single pass over the dataset and reused by later cells.
entity_counts = Counter()
resume_lengths = []
annotations_per_resume = []
all_skills = []

for record in resumes_data:
    body = record.get('text', '')
    spans = record.get('annotations', [])

    resume_lengths.append(len(body))
    annotations_per_resume.append(len(spans))

    for span in spans:
        # Annotations with fewer than three items carry no label; skip them.
        if len(span) < 3:
            continue

        kind = span[2]
        entity_counts[kind] += 1

        if kind != 'SKILL':
            continue

        # Slice the skill out of the resume text and normalise it
        # (trimmed, lower-cased); blank results are dropped.
        skill = body[span[0]:span[1]].strip().lower()
        if skill:
            all_skills.append(skill)

print("✓ Entity extraction complete!")
print(f"\nTotal entity labels found: {len(entity_counts)}")
print(f"Total unique skills extracted: {len(set(all_skills))}")

In [None]:
# Print how often each entity label occurs, followed by per-resume summary numbers.
divider = "=" * 70

print(divider)
print("ENTITY LABEL DISTRIBUTION")
print(divider)

# Hoisted out of the loop: the grand total is the same for every label, so
# recomputing it per iteration was needless repeated work.
total_entities = sum(entity_counts.values())

for entity, count in entity_counts.most_common():
    percentage = (count / total_entities) * 100
    print(f"{entity:20s}: {count:8,d} ({percentage:5.2f}%)")

print("\n" + divider)
print("RESUME STATISTICS")
print(divider)
print(f"Average resume length: {np.mean(resume_lengths):,.0f} characters")
print(f"Median resume length: {np.median(resume_lengths):,.0f} characters")
print(f"Average annotations per resume: {np.mean(annotations_per_resume):.1f}")
# `default=0` keeps this line from raising ValueError when there are no resumes.
print(f"Max annotations in a resume: {max(annotations_per_resume, default=0)}")

In [None]:
# Horizontal bar chart of the ten most frequent entity labels, most common on top.
top_entities = dict(entity_counts.most_common(10))
entity_names = list(top_entities.keys())
entity_totals = list(top_entities.values())

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.barh(entity_names, entity_totals, color='skyblue', edgecolor='black')
ax.set_xlabel('Count', fontsize=12)
ax.set_title('Top 10 Entity Types in Resume Dataset', fontsize=14, fontweight='bold')
ax.invert_yaxis()

# Add value labels. `bar_label` pads in points, so the gap between bar and text
# is the same whatever the magnitude of the counts (a fixed offset in data units
# would push labels far off the bars for small datasets).
ax.bar_label(bars, labels=[f'{total:,}' for total in entity_totals], padding=4, fontsize=10)
# Leave horizontal room so the label of the longest bar stays inside the axes.
ax.margins(x=0.08)

fig.tight_layout()
fig.savefig('entity_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved as 'entity_distribution.png'")

In [None]:
# Two side-by-side histograms, each with a dashed vertical line at its mean.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# One entry per panel:
# (values, bar colour, mean-line colour, mean number format, x label, title)
panels = [
    (resume_lengths, 'lightgreen', 'red', ',.0f',
     'Resume Length (characters)', 'Distribution of Resume Lengths'),
    (annotations_per_resume, 'coral', 'blue', '.1f',
     'Number of Annotations', 'Distribution of Annotations per Resume'),
]

for ax, (values, bar_color, line_color, mean_fmt, x_label, title) in zip(axes, panels):
    mean_value = np.mean(values)
    ax.hist(values, bins=50, color=bar_color, edgecolor='black', alpha=0.7)
    ax.axvline(mean_value, color=line_color, linestyle='--', linewidth=2,
               label=f'Mean: {mean_value:{mean_fmt}}')
    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('resume_statistics.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved as 'resume_statistics.png'")

In [None]:
# Rank the normalised skill strings by how often they were annotated.
skill_counts = Counter(all_skills)
top_20_skills = skill_counts.most_common(20)

banner = "=" * 70
print(banner)
print("TOP 20 MOST MENTIONED SKILLS")
print(banner)

for rank, entry in enumerate(top_20_skills, start=1):
    name, mentions = entry
    print(f"{rank:2d}. {name:30s}: {mentions:5d} mentions")

# Same ranking as a horizontal bar chart, most mentioned skill on top.
skills_df = pd.DataFrame(top_20_skills, columns=['Skill', 'Count'])

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(skills_df['Skill'], skills_df['Count'], color='purple', alpha=0.7, edgecolor='black')
ax.set_xlabel('Frequency', fontsize=12)
ax.set_title('Top 20 Most Mentioned Skills', fontsize=14, fontweight='bold')
ax.invert_yaxis()
fig.tight_layout()
fig.savefig('top_skills.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Chart saved as 'top_skills.png'")

In [None]:
# Persist the summary statistics computed above as a JSON file.
os.makedirs('processed_data', exist_ok=True)

stats = {
    'total_resumes': len(resumes_data),
    'entity_counts': dict(entity_counts),
    # Sorted so the saved file is reproducible: iteration order of a set of
    # strings can differ from one interpreter session to the next.
    'unique_skills': sorted(set(all_skills)),
    'top_20_skills': [skill for skill, _ in top_20_skills],
    # The mean of an empty list is NaN, which json.dump would emit as the
    # non-standard token `NaN`; fall back to 0.0 so the file stays valid JSON.
    'avg_resume_length': float(np.mean(resume_lengths)) if resume_lengths else 0.0,
    'avg_annotations': float(np.mean(annotations_per_resume)) if annotations_per_resume else 0.0,
}

# ensure_ascii=False keeps non-ASCII characters in skill names readable in the file.
with open('processed_data/dataset_statistics.json', 'w', encoding='utf-8') as f:
    json.dump(stats, f, indent=2, ensure_ascii=False)

print("✓ Statistics saved to: processed_data/dataset_statistics.json")
print("\n" + "="*70)
print("PART 1 COMPLETE!")
print("="*70)