In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 📊 CV Builder - Dataset Analysis\n",
    "\n",
    "Análise completa dos datasets para treino do modelo de IA.\n",
    "\n",
    "**Objetivos:**\n",
    "- Validar qualidade dos dados\n",
    "- Identificar padrões e insights\n",
    "- Calcular estatísticas importantes\n",
    "- Verificar distribuição de exemplos\n",
    "- Gerar visualizações"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 🔧 Setup e Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library\n",
    "import json\n",
    "import warnings\n",
    "from collections import Counter\n",
    "from pathlib import Path\n",
    "\n",
    "# Third-party\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "# Silence every warning category for the rest of the session\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Plot defaults: base style, seaborn palette, figure size and font size\n",
    "plt.style.use('seaborn-v0_8-darkgrid')\n",
    "sns.set_palette(\"husl\")\n",
    "plt.rcParams.update({\n",
    "    'figure.figsize': (12, 6),\n",
    "    'font.size': 10,\n",
    "})\n",
    "\n",
    "print(\"‚úÖ Imports carregados com sucesso!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 📂 Carregar Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory that holds the processed datasets, relative to the notebook\n",
    "DATA_DIR = Path('../datasets/processed')\n",
    "\n",
    "\n",
    "def _read_dataset(filename):\n",
    "    \"\"\"Parse the UTF-8 JSON file `filename` located under DATA_DIR.\"\"\"\n",
    "    with open(DATA_DIR / filename, 'r', encoding='utf-8') as handle:\n",
    "        return json.load(handle)\n",
    "\n",
    "\n",
    "# Load the three processed datasets\n",
    "text_improvement = _read_dataset('text_improvement.json')\n",
    "skills_data = _read_dataset('skills_by_area.json')\n",
    "summary_templates = _read_dataset('summary_templates.json')\n",
    "\n",
    "# Headline size of each dataset; a missing key is reported as 0\n",
    "print(\"üìÅ Datasets carregados:\")\n",
    "experience_total = len(text_improvement.get('by_section', {}).get('experience', []))\n",
    "print(f\"  - Text Improvement: {experience_total} exemplos de experi√™ncia\")\n",
    "skills_total = skills_data.get('metadata', {}).get('total_skills', 0)\n",
    "print(f\"  - Skills Database: {skills_total} skills\")\n",
    "templates_total = summary_templates.get('metadata', {}).get('total_templates', 0)\n",
    "print(f\"  - Summary Templates: {templates_total} templates\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 📊 Análise: Text Improvement Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one DataFrame per section of the text-improvement dataset\n",
    "sections = text_improvement['by_section']\n",
    "experience_examples = sections['experience']\n",
    "summary_examples = sections['summary']\n",
    "\n",
    "df_experience = pd.DataFrame(experience_examples)\n",
    "df_summary = pd.DataFrame(summary_examples)\n",
    "\n",
    "print(\"üîç An√°lise de Experi√™ncias Profissionais\")\n",
    "print(\"=\" * 50)\n",
    "print(f\"Total de exemplos: {len(df_experience)}\")\n",
    "\n",
    "# Number of examples per industry and per seniority level\n",
    "for heading, column in (\n",
    "    (\"\\nDistribui√ß√£o por Ind√∫stria:\", 'industry'),\n",
    "    (\"\\nDistribui√ß√£o por Seniority:\", 'seniority'),\n",
    "):\n",
    "    print(heading)\n",
    "    print(df_experience[column].value_counts())\n",
    "\n",
    "# Mean, median, minimum and maximum of the ATS and quality scores\n",
    "for heading, column in (\n",
    "    (\"\\nüìà ATS Score Statistics:\", 'ats_score'),\n",
    "    (\"\\n‚≠ê Quality Score Statistics:\", 'quality_score'),\n",
    "):\n",
    "    print(heading)\n",
    "    scores = df_experience[column]\n",
    "    print(f\"  - M√©dia: {scores.mean():.2f}\")\n",
    "    print(f\"  - Mediana: {scores.median():.2f}\")\n",
    "    print(f\"  - Min: {scores.min()}\")\n",
    "    print(f\"  - Max: {scores.max()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Figure 1: side-by-side histograms of the ATS and quality scores\n",
    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
    "\n",
    "# (column, bar colour, panel title, x-axis label) for each panel\n",
    "panels = (\n",
    "    ('ats_score', 'skyblue', 'Distribui√ß√£o de ATS Scores', 'ATS Score'),\n",
    "    ('quality_score', 'lightcoral', 'Distribui√ß√£o de Quality Scores', 'Quality Score'),\n",
    ")\n",
    "\n",
    "for ax, (column, color, title, xlabel) in zip(axes, panels):\n",
    "    scores = df_experience[column]\n",
    "    mean_score = scores.mean()\n",
    "    ax.hist(scores, bins=20, color=color, edgecolor='black')\n",
    "    ax.set_title(title, fontsize=14, fontweight='bold')\n",
    "    ax.set_xlabel(xlabel)\n",
    "    ax.set_ylabel('Frequ√™ncia')\n",
    "    # Dashed red line marks the mean; its value is shown in the legend\n",
    "    ax.axvline(mean_score, color='red', linestyle='--', label=f'M√©dia: {mean_score:.1f}')\n",
    "    ax.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Figure 2: mean ATS score and mean quality score per industry\n",
    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
    "\n",
    "# (column, bar colour, panel title, y-axis label) for each panel\n",
    "panels = (\n",
    "    ('ats_score', 'teal', 'ATS Score M√©dio por Ind√∫stria', 'ATS Score M√©dio'),\n",
    "    ('quality_score', 'purple', 'Quality Score M√©dio por Ind√∫stria', 'Quality Score M√©dio'),\n",
    ")\n",
    "\n",
    "for ax, (column, color, title, ylabel) in zip(axes, panels):\n",
    "    industry_means = df_experience.groupby('industry')[column].mean()\n",
    "    industry_means.plot(kind='bar', ax=ax, color=color)\n",
    "    ax.set_title(title, fontsize=14, fontweight='bold')\n",
    "    ax.set_xlabel('Ind√∫stria')\n",
    "    ax.set_ylabel(ylabel)\n",
    "    ax.tick_params(axis='x', rotation=45)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keyword analysis over the experience examples\n",
    "print(\"üîë An√°lise de Keywords\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "# Flatten the keyword lists of all examples into one list (no 'keywords' key -> nothing added)\n",
    "all_keywords = [\n",
    "    keyword\n",
    "    for example in experience_examples\n",
    "    for keyword in example.get('keywords', [])\n",
    "]\n",
    "\n",
    "# Frequency table and its 20 most common entries\n",
    "keyword_counts = Counter(all_keywords)\n",
    "top_keywords = keyword_counts.most_common(20)\n",
    "\n",
    "print(f\"\\nTotal de keywords √∫nicas: {len(keyword_counts)}\")\n",
    "print(f\"\\nTop 20 Keywords Mais Usadas:\")\n",
    "for term, occurrences in top_keywords:\n",
    "    print(f\"  {term}: {occurrences}x\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Figure 3: horizontal bar chart of the most frequent keywords\n",
    "keywords_df = pd.DataFrame(top_keywords, columns=['Keyword', 'Frequ√™ncia'])\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "ax.barh(keywords_df['Keyword'], keywords_df['Frequ√™ncia'], color='coral')\n",
    "ax.set_xlabel('Frequ√™ncia', fontsize=12)\n",
    "ax.set_ylabel('Keyword', fontsize=12)\n",
    "ax.set_title('Top 20 Keywords Mais Utilizadas', fontsize=14, fontweight='bold')\n",
    "# Rows arrive most-frequent-first; flip the axis so that row is drawn at the top\n",
    "ax.invert_yaxis()\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 💼 Análise: Skills Database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Skill analysis for the technology area\n",
    "print(\"üíº An√°lise de Skills Database\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "# Flatten every technology sub-category into one list of skill records.\n",
    "# Each record is a shallow copy tagged with its area and sub-category, so the\n",
    "# dictionaries stored inside `skills_data` are not modified by this cell.\n",
    "# Entries of `tech_data` whose value is not a list are skipped.\n",
    "tech_data = skills_data.get('technology', {})\n",
    "tech_skills = [\n",
    "    {**skill, 'area': 'technology', 'subcategory': category}\n",
    "    for category, skills_list in tech_data.items()\n",
    "    if isinstance(skills_list, list)\n",
    "    for skill in skills_list\n",
    "]\n",
    "\n",
    "df_skills = pd.DataFrame(tech_skills)\n",
    "\n",
    "print(f\"Total de skills em Technology: {len(df_skills)}\")\n",
    "print(f\"\\nDistribui√ß√£o por Subcategoria:\")\n",
    "print(df_skills['subcategory'].value_counts())\n",
    "\n",
    "print(f\"\\nDistribui√ß√£o por Prioridade:\")\n",
    "print(df_skills['priority'].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary statistics of the demand score\n",
    "print(f\"\\nüìä Demand Score Statistics:\")\n",
    "demand = df_skills['demand_score']\n",
    "print(f\"  - M√©dia: {demand.mean():.2f}\")\n",
    "print(f\"  - Mediana: {demand.median():.2f}\")\n",
    "print(f\"  - Min: {demand.min()}\")\n",
    "print(f\"  - Max: {demand.max()}\")\n",
    "\n",
    "# Ten skills with the highest demand score.\n",
    "# NOTE(review): 'category' is assumed to be a key of the raw skill records;\n",
    "# the notebook itself only adds 'area' and 'subcategory' - confirm against the dataset.\n",
    "report_columns = ['name', 'demand_score', 'salary_impact', 'category']\n",
    "top_demand = df_skills.nlargest(10, 'demand_score')[report_columns]\n",
    "print(f\"\\nüî• Top 10 Skills por Demand Score:\")\n",
    "print(top_demand.to_string(index=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Figure 4: demand score vs salary impact, coloured by priority\n",
    "from matplotlib.patches import Patch\n",
    "\n",
    "PRIORITY_COLORS = {'high': 'red', 'medium': 'orange', 'low': 'green'}\n",
    "FALLBACK_COLOR = 'gray'  # used for any priority value missing from PRIORITY_COLORS\n",
    "\n",
    "# Turn salary_impact into floats by stripping '+' and '%'\n",
    "# (presumably values look like '+15%' - confirm against the dataset).\n",
    "# regex=False makes both patterns literal on every pandas version: the default\n",
    "# of `regex` changed in pandas 2.0 and a lone '+' is not a valid regular expression.\n",
    "df_skills['salary_numeric'] = (\n",
    "    df_skills['salary_impact']\n",
    "    .str.replace('+', '', regex=False)\n",
    "    .str.replace('%', '', regex=False)\n",
    "    .astype(float)\n",
    ")\n",
    "\n",
    "# map() yields NaN for unknown priorities, and NaN is not a usable colour\n",
    "point_colors = df_skills['priority'].map(PRIORITY_COLORS).fillna(FALLBACK_COLOR)\n",
    "\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.scatter(\n",
    "    df_skills['demand_score'],\n",
    "    df_skills['salary_numeric'],\n",
    "    c=point_colors,\n",
    "    alpha=0.6,\n",
    "    s=100,\n",
    ")\n",
    "\n",
    "plt.xlabel('Demand Score', fontsize=12)\n",
    "plt.ylabel('Salary Impact (%)', fontsize=12)\n",
    "plt.title('Demand Score vs Salary Impact (Skills de Tecnologia)', fontsize=14, fontweight='bold')\n",
    "plt.grid(True, alpha=0.3)\n",
    "\n",
    "# Label the five skills with the highest demand score\n",
    "top_5_skills = df_skills.nlargest(5, 'demand_score')\n",
    "for _, row in top_5_skills.iterrows():\n",
    "    plt.annotate(\n",
    "        row['name'],\n",
    "        (row['demand_score'], row['salary_numeric']),\n",
    "        xytext=(5, 5),\n",
    "        textcoords='offset points',\n",
    "        fontsize=9,\n",
    "        bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.5),\n",
    "    )\n",
    "\n",
    "# Legend: one patch per known priority, plus 'Other' only when the fallback colour was used\n",
    "legend_elements = [\n",
    "    Patch(facecolor=color, label=f'{priority.capitalize()} Priority')\n",
    "    for priority, color in PRIORITY_COLORS.items()\n",
    "]\n",
    "if not df_skills['priority'].isin(list(PRIORITY_COLORS)).all():\n",
    "    legend_elements.append(Patch(facecolor=FALLBACK_COLOR, label='Other'))\n",
    "plt.legend(handles=legend_elements, loc='upper left')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Figure 5: share of technology skills in each sub-category\n",
    "category_counts = df_skills['subcategory'].value_counts()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "category_counts.plot(\n",
    "    kind='pie',\n",
    "    ax=ax,\n",
    "    autopct='%1.1f%%',\n",
    "    startangle=90,\n",
    "    colors=sns.color_palette('pastel'),\n",
    ")\n",
    "ax.set_title('Distribui√ß√£o de Skills por Categoria (Technology)', fontsize=14, fontweight='bold')\n",
    "# Clear the y-axis label that pandas adds to pie charts\n",
    "ax.set_ylabel('')\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 📝 Análise: Summary Templates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overview of the summary templates\n",
    "print(\"üìù An√°lise de Summary Templates\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "templates_by_role = summary_templates.get('templates_by_role', {})\n",
    "\n",
    "print(f\"Total de roles com templates: {len(templates_by_role)}\")\n",
    "print(f\"\\nRoles dispon√≠veis:\")\n",
    "for role_name in templates_by_role:\n",
    "    print(f\"  - {role_name}\")\n",
    "\n",
    "# For each seniority level, count how many roles define a template for it\n",
    "seniority_count = {}\n",
    "for levels in templates_by_role.values():\n",
    "    for level in levels.keys():\n",
    "        seniority_count[level] = seniority_count.get(level, 0) + 1\n",
    "\n",
    "print(f\"\\nTemplates por Seniority Level:\")\n",
    "for level, roles_with_level in seniority_count.items():\n",
    "    print(f\"  {level}: {roles_with_level} roles\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Action verbs grouped by category: size of each group and its first five verbs\n",
    "action_verbs = summary_templates.get('action_verbs', {})\n",
    "\n",
    "print(f\"\\nüéØ Action Verbs por Categoria:\")\n",
    "for group_name, verb_list in action_verbs.items():\n",
    "    print(f\"\\n{group_name.upper()}:\")\n",
    "    print(f\"  Total: {len(verb_list)} verbos\")\n",
    "    first_five = verb_list[:5]\n",
    "    print(f\"  Exemplos: {', '.join(first_five)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 🎯 Validação de Qualidade"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quality validation of the datasets\n",
    "print(\"‚úÖ Valida√ß√£o de Qualidade dos Datasets\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "\n",
    "def _score_in_range(score, low=0, high=100):\n",
    "    \"\"\"Return True when `score` is an int or float with low <= score <= high.\"\"\"\n",
    "    return isinstance(score, (int, float)) and low <= score <= high\n",
    "\n",
    "\n",
    "def validate_text_improvement(data):\n",
    "    \"\"\"List the problems found in the examples under data['by_section'].\n",
    "\n",
    "    An example is reported when 'original', 'improved' or 'keywords' is\n",
    "    missing or empty, and when it has an 'ats_score' that is not a number\n",
    "    between 0 and 100 (inclusive).\n",
    "\n",
    "    Args:\n",
    "        data: mapping whose 'by_section' entry maps a section name to a\n",
    "            list of example dicts.\n",
    "\n",
    "    Returns:\n",
    "        list[str]: one message per problem; empty when nothing was found.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if `data` has no 'by_section' entry.\n",
    "    \"\"\"\n",
    "    issues = []\n",
    "\n",
    "    for section, examples in data['by_section'].items():\n",
    "        for example in examples:\n",
    "            example_id = example.get('id')\n",
    "\n",
    "            # Required fields: absent and empty values are both reported\n",
    "            for field in ('original', 'improved', 'keywords'):\n",
    "                if not example.get(field):\n",
    "                    issues.append(f\"Missing '{field}' in {section} - ID: {example_id}\")\n",
    "\n",
    "            # A non-numeric score (None, text) is reported instead of raising TypeError\n",
    "            if 'ats_score' in example and not _score_in_range(example['ats_score']):\n",
    "                issues.append(f\"Invalid ATS score in {section} - ID: {example_id}\")\n",
    "\n",
    "    return issues\n",
    "\n",
    "\n",
    "def validate_skills(data):\n",
    "    \"\"\"List the problems found in the skill records of every area of `data`.\n",
    "\n",
    "    The 'metadata' entry, areas that are not dicts and categories that are\n",
    "    not lists are skipped. A skill is reported when its 'name' is missing or\n",
    "    empty, when 'demand_score' is absent or None, and when 'demand_score' is\n",
    "    not a number between 0 and 100 (inclusive). A score of 0 is valid.\n",
    "\n",
    "    Args:\n",
    "        data: mapping of area name to {category: [skill dict, ...]}.\n",
    "\n",
    "    Returns:\n",
    "        list[str]: one message per problem; empty when nothing was found.\n",
    "    \"\"\"\n",
    "    issues = []\n",
    "\n",
    "    for area, categories in data.items():\n",
    "        # Only dict-valued areas other than 'metadata' contain categories to scan\n",
    "        if area == 'metadata' or not isinstance(categories, dict):\n",
    "            continue\n",
    "\n",
    "        for category, skills in categories.items():\n",
    "            if not isinstance(skills, list):\n",
    "                continue\n",
    "\n",
    "            for skill in skills:\n",
    "                name = skill.get('name')\n",
    "                if not name:\n",
    "                    issues.append(f\"Missing 'name' in {area}/{category}\")\n",
    "\n",
    "                demand_score = skill.get('demand_score')\n",
    "                if demand_score is None:\n",
    "                    # Compared with None so that a legitimate score of 0 is not flagged\n",
    "                    issues.append(f\"Missing 'demand_score' for {name}\")\n",
    "                elif not _score_in_range(demand_score):\n",
    "                    issues.append(f\"Invalid demand_score for {name}\")\n",
    "\n",
    "    return issues\n",
    "\n",
    "\n",
    "# Run both validators\n",
    "text_issues = validate_text_improvement(text_improvement)\n",
    "skills_issues = validate_skills(skills_data)\n",
    "\n",
    "# Report each dataset, showing at most the first five problems\n",
    "for heading, found in (\n",
    "    (\"\\nText Improvement Dataset:\", text_issues),\n",
    "    (\"\\nSkills Database:\", skills_issues),\n",
    "):\n",
    "    print(heading)\n",
    "    if found:\n",
    "        print(f\"  ‚ùå Encontrados {len(found)} problemas\")\n",
    "        for issue in found[:5]:\n",
    "            print(f\"    - {issue}\")\n",
    "    else:\n",
    "        print(f\"  ‚úÖ Nenhum problema encontrado\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 📈 Resumo Final"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final summary built from the values computed in the cells above\n",
    "rule = \"=\" * 60\n",
    "\n",
    "print(\"\\n\" + rule)\n",
    "print(\"üìä RESUMO FINAL DA AN√ÅLISE\")\n",
    "print(rule)\n",
    "\n",
    "print(\"\\n1Ô∏è‚É£ TEXT IMPROVEMENT DATASET\")\n",
    "total_examples = len(df_experience) + len(df_summary)\n",
    "print(f\"   - Total de exemplos: {total_examples}\")\n",
    "print(f\"   - ATS Score m√©dio: {df_experience['ats_score'].mean():.1f}/100\")\n",
    "print(f\"   - Quality Score m√©dio: {df_experience['quality_score'].mean():.1f}/100\")\n",
    "print(f\"   - Keywords √∫nicas: {len(keyword_counts)}\")\n",
    "\n",
    "print(\"\\n2Ô∏è‚É£ SKILLS DATABASE\")\n",
    "print(f\"   - Total de skills: {len(df_skills)}\")\n",
    "print(f\"   - Demand Score m√©dio: {df_skills['demand_score'].mean():.1f}/100\")\n",
    "high_priority_total = int((df_skills['priority'] == 'high').sum())\n",
    "print(f\"   - Skills high priority: {high_priority_total}\")\n",
    "print(f\"   - Categorias: {df_skills['subcategory'].nunique()}\")\n",
    "\n",
    "print(\"\\n3Ô∏è‚É£ SUMMARY TEMPLATES\")\n",
    "print(f\"   - Roles cobertos: {len(templates_by_role)}\")\n",
    "print(f\"   - Action verbs dispon√≠veis: {sum(map(len, action_verbs.values()))}\")\n",
    "\n",
    "print(\"\\n4Ô∏è‚É£ QUALIDADE DOS DADOS\")\n",
    "total_issues = len(text_issues) + len(skills_issues)\n",
    "if total_issues == 0:\n",
    "    print(\"   ‚úÖ Todos os datasets passaram na valida√ß√£o!\")\n",
    "else:\n",
    "    print(f\"   ‚ö†Ô∏è Encontrados {total_issues} problemas\")\n",
    "\n",
    "print(\"\\n\" + rule)\n",
    "print(\"‚ú® An√°lise conclu√≠da com sucesso!\")\n",
    "print(rule)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 💾 Exportar Resultados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the analysis tables to CSV files under ../analysis_results\n",
    "output_dir = Path('../analysis_results')\n",
    "output_dir.mkdir(exist_ok=True)\n",
    "\n",
    "# Target file name -> DataFrame to export (written without the index column)\n",
    "exports = {\n",
    "    'experience_analysis.csv': df_experience,\n",
    "    'skills_analysis.csv': df_skills,\n",
    "    'top_keywords.csv': keywords_df,\n",
    "}\n",
    "for filename, frame in exports.items():\n",
    "    frame.to_csv(output_dir / filename, index=False)\n",
    "\n",
    "print(f\"\\nüíæ Resultados exportados para: {output_dir.absolute()}\")\n",
    "for filename in exports:\n",
    "    print(f\"   - {filename}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}