# 01 - Data Exploration

This notebook explores the software effort estimation datasets, using COCOMO81 as the worked example.

## Contents
1. Load and inspect datasets
2. Statistical summary
3. Target variable analysis
4. Feature correlation
5. Feature analysis
6. Summary

In [None]:
# Import libraries
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make the project root importable so the `src` package resolves from notebooks/
sys.path.append('..')

from src.data.data_loader import DataLoader
from src.utils.config import DATASETS

%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['font.size'] = 12

## 1. Load Dataset

In [None]:
# Read the raw COCOMO81 table through the project's DataLoader.
# `loader` and `df` are kept as notebook globals for the cells below.
loader = DataLoader('cocomo81')
df = loader.load_raw_data()

# One-shot orientation: dataset name, shape, and the column names.
print(
    "Dataset: COCOMO81",
    f"Shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    sep="\n",
)

In [None]:
# Preview the first ten rows (rendered via the notebook's rich display)
df.head(n=10)

In [None]:
# Dataset info: print the frame's structural summary.
# Assumes `df` is a pandas DataFrame, in which case this lists each column's
# dtype and non-null count.
df.info()

## 2. Statistical Summary

In [None]:
# Descriptive statistics, transposed so each feature is a row and each
# statistic a column.
df.describe().transpose()

In [None]:
# Count nulls per column and report only the columns that actually have any
missing = df.isnull().sum()
print("Missing Values:")
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("No missing values found!")

## 3. Target Variable Analysis

In [None]:
# Target variable (actual effort); kept as a notebook global for later cells
target = df['actual']

# Assemble the report line by line, then emit it in a single print call.
# Location/spread use one decimal place; the shape statistics use two.
report_lines = [
    "Target Variable Statistics:",
    f"  Min: {target.min():.1f}",
    f"  Max: {target.max():.1f}",
    f"  Mean: {target.mean():.1f}",
    f"  Median: {target.median():.1f}",
    f"  Std: {target.std():.1f}",
    f"  Skewness: {target.skew():.2f}",
    f"  Kurtosis: {target.kurtosis():.2f}",
]
print("\n".join(report_lines))

In [None]:
# Distribution of effort: raw scale (left panel) next to the log1p-transformed
# scale (right panel), so the effect of the transform is visible side by side.
effort = df['actual']
effort_mean = effort.mean()
effort_median = effort.median()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
raw_ax, log_ax = axes

# Original distribution, with mean and median marked as dashed reference lines
raw_ax.hist(effort, bins=20, edgecolor='black', alpha=0.7)
raw_ax.axvline(effort_mean, color='red', linestyle='--', label=f'Mean: {effort_mean:.0f}')
raw_ax.axvline(effort_median, color='green', linestyle='--', label=f'Median: {effort_median:.0f}')
raw_ax.set_xlabel('Actual Effort (person-months)')
raw_ax.set_ylabel('Frequency')
raw_ax.set_title('Distribution of Actual Effort')
raw_ax.legend()

# Log-transformed distribution (log1p keeps zero-valued efforts finite)
log_ax.hist(np.log1p(effort), bins=20, edgecolor='black', alpha=0.7, color='orange')
log_ax.set_xlabel('Log(Actual Effort + 1)')
log_ax.set_ylabel('Frequency')
log_ax.set_title('Distribution of Log-Transformed Effort')

plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so the notebook survives Restart & Run All on a fresh checkout.
figure_path = Path('../reports/figures/effort_distribution.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=150)
plt.show()

In [None]:
# Box plot of effort: a single vertical box summarising the target's spread
fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(df['actual'], vert=True)
ax.set(
    ylabel='Actual Effort (person-months)',
    title='Box Plot of Actual Effort',
)
plt.show()

## 4. Feature Correlation

In [None]:
# Correlation matrix
# Compute the correlations before opening the figure so a failure here does
# not leave an empty figure behind. numeric_only=True restricts the matrix to
# numeric columns; without it, pandas >= 2.0 raises if the frame contains any
# non-numeric column.
correlation_matrix = df.corr(numeric_only=True)

# Hide the upper triangle (including the diagonal): the matrix is symmetric,
# so those cells only repeat what the lower triangle already shows.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

plt.figure(figsize=(16, 12))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm',
            center=0, fmt='.2f', square=True, linewidths=0.5)
plt.title('Feature Correlation Matrix', fontsize=14)
plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so the notebook survives Restart & Run All on a fresh checkout.
figure_path = Path('../reports/figures/correlation_matrix.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=150)
plt.show()

In [None]:
# Correlation with target variable
# Take the 'actual' column of the correlation matrix computed in the previous
# cell, drop the target's self-correlation, and sort strongest-positive first.
target_corr = correlation_matrix['actual'].drop('actual').sort_values(ascending=False)

# Colour-code the sign: green for positive correlations, red otherwise
colors = ['green' if x > 0 else 'red' for x in target_corr.values]

plt.figure(figsize=(10, 8))
plt.barh(target_corr.index, target_corr.values, color=colors)
plt.xlabel('Correlation with Actual Effort')
plt.title('Feature Correlation with Target Variable')
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.5)  # zero reference line
plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so the notebook survives Restart & Run All on a fresh checkout.
figure_path = Path('../reports/figures/target_correlation.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=150)
plt.show()

## 5. Feature Analysis

In [None]:
# LOC vs Effort: the same scatter on a linear scale (left) and on a
# log1p-log1p scale (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
linear_ax, loglog_ax = axes

# Linear scale
linear_ax.scatter(df['loc'], df['actual'], alpha=0.6, edgecolor='black')
linear_ax.set_xlabel('Lines of Code (LOC)')
linear_ax.set_ylabel('Actual Effort (person-months)')
linear_ax.set_title('LOC vs Actual Effort')

# Log-log scale (log1p on both axes, matching the "+ 1" in the axis labels)
loglog_ax.scatter(np.log1p(df['loc']), np.log1p(df['actual']),
                  alpha=0.6, edgecolor='black', color='orange')
loglog_ax.set_xlabel('Log(LOC + 1)')
loglog_ax.set_ylabel('Log(Effort + 1)')
loglog_ax.set_title('Log-Log Plot: LOC vs Effort')

plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so the notebook survives Restart & Run All on a fresh checkout.
figure_path = Path('../reports/figures/loc_vs_effort.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=150)
plt.show()

In [None]:
# Distribution of cost drivers: one histogram per driver on a 3x5 grid
cost_drivers = ['rely', 'data', 'cplx', 'time', 'stor', 'virt', 'turn',
                'acap', 'aexp', 'pcap', 'vexp', 'lexp', 'modp', 'tool', 'sced']

fig, axes = plt.subplots(3, 5, figsize=(20, 12))
axes = axes.flatten()  # 15 panels in row-major order, one per cost driver

for ax, col in zip(axes, cost_drivers):
    ax.hist(df[col], bins=10, edgecolor='black', alpha=0.7)
    ax.set_title(col.upper())
    ax.set_xlabel('Value')

# y=1.02 places the super-title just above the top edge of the figure canvas
plt.suptitle('Distribution of COCOMO Cost Drivers', fontsize=14, y=1.02)
plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so the notebook survives Restart & Run All on a fresh checkout.
figure_path = Path('../reports/figures/cost_drivers_distribution.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
# bbox_inches='tight' expands the saved area to include the super-title, which
# would otherwise be cropped because it sits outside the canvas.
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()

## 6. Summary

### Key Findings:
1. **Dataset Size**: 63 projects with 16 features
2. **Target Variable**: Highly skewed, ranges from 5.9 to 11,400 person-months
3. **No Missing Values**: Dataset is complete
4. **Strong Correlation**: LOC has the highest correlation with effort
5. **Log Transformation**: Helps normalize the target variable distribution