# Stack Overflow CV Training Dataset Exploration

This notebook inspects the curated Stack Overflow Developer Survey slice that trains the CV Creation Agent. It covers dataset loading, data quality checks, demographic and skills distributions, and salary/experience insights.

## 1. Environment Setup

In [2]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

try:
    import seaborn as sns
    sns.set_theme(style='whitegrid')
except ImportError:
    sns = None
    print('Optional dependency seaborn not found; install it for nicer visuals.')

# Locate the project root: the nearest folder, starting at the working
# directory and moving up through its parents, that contains
# apps/cv_creation_agent.
search_root = Path.cwd().resolve()
candidates = [search_root, *search_root.parents]
ROOT_DIR = next(
    (folder for folder in candidates if (folder / 'apps' / 'cv_creation_agent').exists()),
    None,
)

if ROOT_DIR is None:
    raise FileNotFoundError('Could not locate the cv_creation_agent app directory from current path.')

DATA_PATH = ROOT_DIR / 'apps' / 'cv_creation_agent' / 'data' / 'stackoverflow_cv_training_dataset.csv'
print(f'Resolved project root: {ROOT_DIR}')
print(f'Dataset path: {DATA_PATH}')
# Fail early, before any analysis cell runs, when the CSV is not there.
if not DATA_PATH.exists():
    raise FileNotFoundError('Expected dataset not found. Run the export notebook first.')


Resolved project root: D:\SEP490\BE PY\be-python
Dataset path: D:\SEP490\BE PY\be-python\apps\cv_creation_agent\data\stackoverflow_cv_training_dataset.csv


## 2. Load the dataset

In [3]:
# Read the curated survey slice and report its size before previewing it.
df = pd.read_csv(DATA_PATH)
row_count, column_count = df.shape
print(f'Shape: {row_count:,} rows x {column_count} columns')
df.head()


Shape: 65,437 rows x 36 columns


Unnamed: 0,MainBranch,Age,Employment,RemoteWork,DevType,OrgSize,Country,LanguageHaveWorkedWith,LanguageWantToWorkWith,LanguageAdmired,...,WebframeAdmired,ToolsTechHaveWorkedWith,ToolsTechWantToWorkWith,ToolsTechAdmired,NEWCollabToolsHaveWorkedWith,NEWCollabToolsWantToWorkWith,NEWCollabToolsAdmired,CompTotal,AIComplex,ConvertedCompYearly
0,I am a developer by profession,Under 18 years old,"Employed, full-time",Remote,,,United States of America,,,,...,,,,,,,,,,
1,I am a developer by profession,35-44 years old,"Employed, full-time",Remote,"Developer, full-stack",,United Kingdom of Great Britain and Northern I...,Bash/Shell (all shells);Go;HTML/CSS;Java;JavaS...,Bash/Shell (all shells);Go;HTML/CSS;Java;JavaS...,Bash/Shell (all shells);Go;HTML/CSS;Java;JavaS...,...,Express;Node.js;React,Docker;Homebrew;Kubernetes;npm;Vite;Webpack,Docker;Homebrew;Kubernetes;npm;Vite;Webpack,Docker;Homebrew;Kubernetes;npm;Vite;Webpack,PyCharm;Visual Studio Code;WebStorm,PyCharm;Visual Studio Code;WebStorm,PyCharm;Visual Studio Code;WebStorm,,,
2,I am a developer by profession,45-54 years old,"Employed, full-time",Remote,Developer Experience,,United Kingdom of Great Britain and Northern I...,C#,C#,C#,...,ASP.NET CORE,MSBuild,MSBuild,MSBuild,Visual Studio,Visual Studio,Visual Studio,,,
3,I am learning to code,18-24 years old,"Student, full-time",,"Developer, full-stack",,Canada,C;C++;HTML/CSS;Java;JavaScript;PHP;PowerShell;...,HTML/CSS;Java;JavaScript;PowerShell;Python;SQL...,HTML/CSS;Java;JavaScript;PowerShell;Python;SQL...,...,jQuery;Next.js;Node.js;React,Docker;npm;Pip,Docker;Kubernetes;npm,Docker;npm,,,,,Bad at handling complex tasks,
4,I am a developer by profession,18-24 years old,"Student, full-time",,"Developer, full-stack",,Norway,C++;HTML/CSS;JavaScript;Lua;Python;Rust,C++;HTML/CSS;JavaScript;Lua;Python,C++;HTML/CSS;JavaScript;Lua;Python,...,,APT;Make;npm,APT;Make,APT;Make,Vim,Vim,Vim,,,


### Column overview

In [4]:
# One row per column with its dtype rendered as text; only the first 36
# rows are displayed.
column_overview = pd.DataFrame({'column': df.columns, 'dtype': df.dtypes.astype(str)})
column_overview.head(36)


Unnamed: 0,column,dtype
MainBranch,MainBranch,object
Age,Age,object
Employment,Employment,object
RemoteWork,RemoteWork,object
DevType,DevType,object
OrgSize,OrgSize,object
Country,Country,object
LanguageHaveWorkedWith,LanguageHaveWorkedWith,object
LanguageWantToWorkWith,LanguageWantToWorkWith,object
LanguageAdmired,LanguageAdmired,object


## 3. Missing data snapshot

In [5]:
# Null count per column, worst offenders first.
missing = df.isna().sum().sort_values(ascending=False)
# Same counts expressed as a percentage of all rows, to two decimals.
missing_pct = missing.div(len(df)).mul(100).round(2)
missing_summary = pd.DataFrame(
    {
        'missing_count': missing,
        'missing_pct': missing_pct,
    }
)
missing_summary.head(15)


Unnamed: 0,missing_count,missing_pct
ConvertedCompYearly,42002,64.19
PlatformAdmired,34060,52.05
CompTotal,31697,48.44
PlatformWantToWorkWith,30905,47.23
WebframeAdmired,30494,46.6
AIComplex,28416,43.42
WebframeWantToWorkWith,26902,41.11
DatabaseAdmired,26880,41.08
PlatformHaveWorkedWith,23071,35.26
DatabaseWantToWorkWith,22879,34.96


## 4. Normalize key numeric fields

In [6]:
def clean_years_experience(years_str):
    """Convert a "years of experience" answer into a float.

    Parameters
    ----------
    years_str : object
        Raw answer: a number, numeric text, or a phrase such as
        'Less than 1 year' or 'More than 50 years'. May be missing.

    Returns
    -------
    float
        0.5 when the text starts with 'less than 1', 50.0 when it starts
        with 'more than 50', otherwise the first number found in the text.
        ``np.nan`` when the value is missing or contains no number.
    """
    if pd.isna(years_str):
        return np.nan
    text = str(years_str).lower().strip()
    if text.startswith('less than 1'):
        return 0.5
    if text.startswith('more than 50'):
        return 50.0
    # Parse only the first numeric token. Joining every digit and dot in the
    # text would merge separate numbers ('10-20' -> 1020) and could produce
    # strings such as '..' that float() rejects with ValueError.
    match = re.search(r'[0-9]+(?:\.[0-9]+)?|\.[0-9]+', text)
    return float(match.group()) if match else np.nan

# Raw answer columns; an empty text Series stands in for any that are absent.
years_pro_raw = df.get('YearsCodePro', pd.Series(dtype=str))
years_total_raw = df.get('YearsCode', pd.Series(dtype=str))
salary_raw = df.get('ConvertedCompYearly', pd.Series(dtype=str))

# Numeric companions are added next to the raw columns on `df`.
df['YearsCodeProNumeric'] = years_pro_raw.apply(clean_years_experience)
df['YearsCodeNumeric'] = years_total_raw.apply(clean_years_experience)
df['SalaryNumeric'] = pd.to_numeric(salary_raw, errors='coerce')

# Show raw and cleaned values side by side as a quick sanity check.
preview_columns = ['YearsCodePro', 'YearsCodeProNumeric', 'ConvertedCompYearly', 'SalaryNumeric']
df[preview_columns].head()


Unnamed: 0,YearsCodePro,YearsCodeProNumeric,ConvertedCompYearly,SalaryNumeric
0,,,,
1,17.0,17.0,,
2,27.0,27.0,,
3,,,,
4,,,,


## 5. Experience and salary distributions

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
experience_ax, salary_ax = ax

# Left panel: professional experience, with values above 40 years capped at 40.
years_capped = df['YearsCodeProNumeric'].dropna().clip(upper=40)
years_capped.hist(bins=20, ax=experience_ax, color='#4c72b0')
experience_ax.set(title='Professional Experience (years)', xlabel='Years', ylabel='Developers')

# Right panel: salary, with values above 300k capped so the long tail
# does not flatten the rest of the histogram.
salary_capped = df['SalaryNumeric'].dropna().clip(upper=300000)
salary_capped.hist(bins=30, ax=salary_ax, color='#55a868')
salary_ax.set(title='Annual Salary (USD, clipped at 300k)', xlabel='Salary ($)', ylabel='Developers')

plt.tight_layout()
plt.show()


In [None]:
# Same percentile cut points for both summaries so the rows line up.
summary_percentiles = [0.1, 0.25, 0.5, 0.75, 0.9]
salary_stats = df['SalaryNumeric'].describe(percentiles=summary_percentiles).round(2)
experience_stats = df['YearsCodeProNumeric'].describe(percentiles=summary_percentiles).round(2)
pd.DataFrame(
    {
        'Salary (USD)': salary_stats,
        'Years professional': experience_stats,
    }
)


## 6. Dev type and employment landscape

In [None]:
def explode_counts(series):
    """Count the individual items of a ';'-delimited multi-select column.

    Parameters
    ----------
    series : pandas.Series
        One cell per respondent, each holding items separated by ';'.
        Missing cells are ignored.

    Returns
    -------
    pandas.Series
        Occurrences per item (whitespace-trimmed), most frequent first.
        Empty when the input has no non-missing values.
    """
    # astype(str) keeps the .str accessor usable even when every value is
    # missing and the column therefore does not have a string dtype.
    items = series.dropna().astype(str).str.split(';').explode().str.strip()
    # A trailing or doubled ';' yields empty strings; they are not real items.
    return items[items != ''].value_counts()

# DevType is a ';'-separated multi-select, so it goes through explode_counts;
# Employment is counted per whole answer, with missing answers included.
devtype_counts = explode_counts(df['DevType']).head(15)
employment_counts = df['Employment'].value_counts(dropna=False).head(10)

devtype_table = devtype_counts.to_frame(name='developers')
employment_table = employment_counts.to_frame(name='developers')
display(devtype_table, employment_table)


## 7. Global footprint

In [None]:
# The 15 countries with the most respondents (missing countries excluded).
country_counts = df['Country'].value_counts().iloc[:15]
country_counts


## 8. Technology stack overview

In [None]:
# Item frequencies for each "have worked with" column; an empty text Series
# stands in for any column that is absent.
language_counts = explode_counts(df.get('LanguageHaveWorkedWith', pd.Series(dtype=str)))
database_counts = explode_counts(df.get('DatabaseHaveWorkedWith', pd.Series(dtype=str)))
tools_counts = explode_counts(df.get('ToolsTechHaveWorkedWith', pd.Series(dtype=str)))

top_languages = language_counts.head(15)
top_databases = database_counts.head(10)
top_tools = tools_counts.head(15)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One horizontal bar chart per panel: (counts, title, bar colour).
panel_specs = [
    (top_languages, 'Top Languages', '#4c72b0'),
    (top_databases, 'Top Databases', '#dd8452'),
    (top_tools, 'Top Tools & Platforms', '#55a868'),
]
for panel_ax, (panel_counts, panel_title, panel_color) in zip(axes, panel_specs):
    # Ascending sort puts the most common item at the top of a barh chart.
    panel_counts.sort_values().plot(kind='barh', ax=panel_ax, color=panel_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Respondents')

plt.tight_layout()
plt.show()


## 9. Remote work and compensation

In [None]:
# Keep respondents with a positive salary and a known remote-work status.
has_positive_salary = df['SalaryNumeric'] > 0
paid_respondents = df.loc[has_positive_salary, ['RemoteWork', 'SalaryNumeric']]
paid_respondents = paid_respondents.dropna(subset=['RemoteWork'])

# Median, mean and sample size per remote-work category, highest median first.
salary_by_remote = paid_respondents.groupby('RemoteWork')['SalaryNumeric']
salary_remote = salary_by_remote.agg(['median', 'mean', 'count']).sort_values('median', ascending=False)
salary_remote.round({'median': 0, 'mean': 0})


## 10. Salary vs. experience relationship

In [None]:
# Rows with both values present, restricted to salaries strictly between
# 0 and 300k so extreme outliers do not dominate the y-axis.
experience_salary = df[['YearsCodeProNumeric', 'SalaryNumeric']].dropna()
salary_in_range = (experience_salary['SalaryNumeric'] > 0) & (experience_salary['SalaryNumeric'] < 300000)
experience_salary = experience_salary[salary_in_range]

scatter_fig, scatter_ax = plt.subplots(figsize=(7, 5))
# Small, mostly transparent markers keep dense regions readable.
scatter_ax.scatter(
    experience_salary['YearsCodeProNumeric'],
    experience_salary['SalaryNumeric'],
    alpha=0.2,
    s=10,
    color='#8172b3',
)
scatter_ax.set_title('Salary vs. Professional Experience')
scatter_ax.set_xlabel('Years of professional experience')
scatter_ax.set_ylabel('Salary (USD)')
plt.show()


---
**Next steps:** filter the dataset for the roles, regions, or experience bands you care about and reuse the helper functions above to create feature subsets for model experimentation.