# In [None]:
# Advanced Analysis Notebook
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Load data
df = pd.read_csv('career_data.csv')

# Basic exploration: structural overview first, then missing-value counts and
# summary statistics. Each entry pairs the heading handed to print() with the
# value printed after it.
overview_sections = (
    ("Dataset shape:", df.shape),
    ("\nColumns:", df.columns.tolist()),
    ("\nData types:\n", df.dtypes),
    ("\nMissing values:\n", df.isnull().sum()),
    ("\nBasic statistics:\n", df.describe()),
)
for heading, content in overview_sections:
    print(heading, content)

# Sector analysis: mean salary per sector, drawn as bars from highest to lowest.
fig, ax = plt.subplots(figsize=(12, 6))
salary_by_sector = df.groupby('Sector')['Average_Salary']
sector_avg = salary_by_sector.mean().sort_values(ascending=False)
sector_avg.plot(kind='bar', ax=ax)
ax.set_title('Average Salary by Sector')
ax.set_ylabel('Salary (LPA)')
# Tilt the sector names so neighbouring labels do not overlap.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# Time series analysis for specific fields
fields_to_analyze = ['Computer Science', 'AI/ML', 'Doctor', 'FinTech']

# Size the subplot grid from the list itself (two columns, as many rows as
# needed) so that adding or removing a field cannot overflow a fixed 2x2 grid.
n_cols = 2
n_rows = (len(fields_to_analyze) + n_cols - 1) // n_cols

plt.figure(figsize=(15, 10))
for i, field in enumerate(fields_to_analyze, 1):
    # Sort chronologically before drawing: the line plots connect points in row
    # order, so rows that are not already ordered by year would produce a
    # zig-zag line. evaluate_models() sorts the same way.
    field_data = df[df['Field'] == field].sort_values('Year')
    plt.subplot(n_rows, n_cols, i)
    # NOTE(review): salary and demand index are drawn against the same y-axis;
    # confirm their scales are comparable or consider a secondary axis.
    plt.plot(field_data['Year'], field_data['Average_Salary'], 'o-', label='Salary')
    plt.plot(field_data['Year'], field_data['Demand_Index'], 's--', label='Demand')
    plt.title(f'{field} Trends')
    plt.xlabel('Year')
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Correlation analysis: pairwise correlations between the numeric indicators,
# shown as an annotated heatmap with the colour scale centred on zero.
numeric_cols = [
    'Average_Salary',
    'Demand_Index',
    'Job_Satisfaction',
    'Remote_Potential',
    'Automation_Risk',
]
numeric_view = df[numeric_cols]
correlation_matrix = numeric_view.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

# Model comparison for salary prediction
def evaluate_models(field_name, test_size=1):
    """Compare salary-prediction models for one field with a chronological hold-out.

    Rows of the module-level ``df`` whose ``Field`` equals ``field_name`` are
    ordered by ``Year``; the most recent ``test_size`` rows are held out and
    each model is fitted on the remaining, earlier rows using ``Year`` as the
    only feature and ``Average_Salary`` as the target.

    Args:
        field_name: Value of the ``Field`` column to evaluate.
        test_size: Number of most recent rows to hold out for testing.
            Defaults to 1 (the last year only).

    Returns:
        A dict mapping model name (``'Linear'``, ``'RandomForest'``) to a dict
        with keys ``'mae'`` and ``'r2'``, or ``None`` when fewer than
        ``test_size + 2`` usable rows exist for the field. ``'r2'`` is NaN
        when fewer than two rows are held out, because R^2 is undefined for a
        single sample.

    Raises:
        ValueError: If ``test_size`` is smaller than 1.
    """
    if test_size < 1:
        raise ValueError("test_size must be at least 1")

    # Rows missing the feature or the target cannot be used for fitting or
    # scoring (the estimators reject NaN input), so drop them up front rather
    # than letting one incomplete row abort the whole comparison.
    field_data = (
        df[df['Field'] == field_name]
        .dropna(subset=['Year', 'Average_Salary'])
        .sort_values('Year')
    )
    # Keep at least two rows for training once the hold-out is removed.
    if len(field_data) < test_size + 2:
        return None

    X = field_data['Year'].values.reshape(-1, 1)
    y = field_data['Average_Salary'].values

    # Chronological split: the most recent rows form the test set.
    X_train, y_train = X[:-test_size], y[:-test_size]
    X_test, y_test = X[-test_size:], y[-test_size:]

    # NOTE(review): the held-out year lies outside the training range; a
    # tree-based model cannot predict beyond the targets it was trained on,
    # so it is at a structural disadvantage against the linear trend here.
    models = {
        'Linear': LinearRegression(),
        'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42)
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # R^2 needs at least two test samples; with fewer, report NaN
        # explicitly instead of triggering scikit-learn's undefined-metric
        # warning on every call.
        if len(y_test) >= 2:
            r2 = r2_score(y_test, y_pred)
        else:
            r2 = float('nan')
        results[name] = {
            'mae': mean_absolute_error(y_test, y_pred),
            'r2': r2
        }

    return results

# Evaluate models for each field, keeping only the fields that produced scores.
model_results = {}
for field in df['Field'].unique():
    results = evaluate_models(field)
    if not results:
        # evaluate_models() returned None (or nothing usable) for this field.
        continue
    model_results[field] = results

# Display results: one row per field, one column per "<model>_<metric>" pair.
comparison_rows = []
for field_label, per_model_scores in model_results.items():
    row = {'Field': field_label}
    for model_label, metric_scores in per_model_scores.items():
        for metric_label, score in metric_scores.items():
            row[f'{model_label}_{metric_label}'] = score
    comparison_rows.append(row)
results_df = pd.DataFrame(comparison_rows)

print("Model Comparison Results:")
print(results_df.round(3))