# Docker Workshop: Data Science Environment Demo

This notebook demonstrates the power of Docker and Dev Containers by showcasing various Python libraries in a reproducible environment.

## Libraries Demonstrated:
- **pandas**: Data manipulation and analysis
- **numpy**: Numerical computing
- **matplotlib & seaborn**: Data visualization
- **plotly**: Interactive visualizations
- **requests**: HTTP requests
- **scikit-learn**: Machine learning
- **BeautifulSoup**: Web scraping
- **python-dotenv**: Loading environment variables from `.env` files

## 1. Import Libraries and Setup

In [None]:
# Core data science libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Web and API libraries
import requests
from bs4 import BeautifulSoup

# Machine learning
from sklearn.datasets import make_classification, load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Utilities
import os
from dotenv import load_dotenv
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: apply the Matplotlib style sheet, then the seaborn palette
# (same order as before; do not swap these two calls).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Report that the imports worked and which versions of the core stack are
# active inside this environment.
pandas_version = pd.__version__
numpy_version = np.__version__
matplotlib_version = plt.matplotlib.__version__

print(
    "🚀 All libraries imported successfully!",
    f"📊 Pandas version: {pandas_version}",
    f"🔢 NumPy version: {numpy_version}",
    f"📈 Matplotlib version: {matplotlib_version}",
    sep="\n",
)

## 2. Data Generation and Manipulation with Pandas

In [None]:
# Build a reproducible synthetic employee table.
np.random.seed(42)  # fixed seed: every run draws the same values

n_employees = 100
employee_ids = range(1, n_employees + 1)

# The random columns all draw from NumPy's single global generator, so they
# must be created in exactly this order to reproduce the same dataset.
data = dict(
    employee_id=employee_ids,
    name=[f'Employee_{i}' for i in employee_ids],
    department=np.random.choice(
        ['Engineering', 'Marketing', 'Sales', 'HR', 'Finance'], n_employees
    ),
    salary=np.random.normal(75000, 15000, n_employees).astype(int),
    years_experience=np.random.randint(0, 20, n_employees),
    performance_score=np.random.uniform(1, 5, n_employees).round(2),
    remote_work=np.random.choice([True, False], n_employees),
)

df = pd.DataFrame(data)

# Quick look at the result: dimensions, a preview, and numeric summaries.
print("📋 Dataset Overview:")
print(f"Shape: {df.shape}")

for heading, preview in (
    ("\n📊 First 5 rows:", df.head()),
    ("\n📈 Summary Statistics:", df.describe()),
):
    print(heading)
    display(preview)

## 3. Data Visualization with Matplotlib and Seaborn

In [None]:
# Build a 2x2 dashboard of static charts summarizing the employee DataFrame `df`.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Employee Analytics Dashboard', fontsize=16, fontweight='bold')

# 1. Salary distribution by department
sns.boxplot(data=df, x='department', y='salary', ax=axes[0, 0])
axes[0, 0].set_title('Salary Distribution by Department')
axes[0, 0].tick_params(axis='x', rotation=45)

# 2. Experience vs Performance correlation
sns.scatterplot(data=df, x='years_experience', y='performance_score', 
                hue='department', size='salary', alpha=0.7, ax=axes[0, 1])
axes[0, 1].set_title('Experience vs Performance by Department')

# 3. Remote work distribution
# value_counts() orders its rows by frequency, not by value, so the wedge
# labels are looked up from each index value instead of being hard-coded by
# position (which mislabels the chart whenever True is the majority).
remote_labels = {False: 'On-site', True: 'Remote'}
remote_counts = df['remote_work'].value_counts()
axes[1, 0].pie(remote_counts.values,
               labels=[remote_labels[bool(flag)] for flag in remote_counts.index],
               autopct='%1.1f%%', startangle=90)
axes[1, 0].set_title('Remote Work Distribution')

# 4. Department size
dept_counts = df['department'].value_counts()
sns.barplot(x=dept_counts.values, y=dept_counts.index, ax=axes[1, 1])
axes[1, 1].set_title('Employees by Department')
axes[1, 1].set_xlabel('Number of Employees')

plt.tight_layout()
plt.show()

print("✅ Static visualizations created successfully!")

## 4. Interactive Visualizations with Plotly

In [None]:
# Interactive Plotly figures built from the employee DataFrame `df`.
# Each figure is rendered right after it is created.

# 1. Scatter: experience vs salary, colored by department, sized by performance
axis_labels = {
    'years_experience': 'Years of Experience',
    'salary': 'Annual Salary ($)',
}
fig1 = px.scatter(
    df,
    x='years_experience',
    y='salary',
    color='department',
    size='performance_score',
    hover_data=['name', 'remote_work'],
    title='Interactive Employee Analysis: Experience vs Salary',
    labels=axis_labels,
)
fig1.show()

# 2. Histogram: performance scores, one color per department
fig2 = px.histogram(
    df,
    x='performance_score',
    color='department',
    title='Performance Score Distribution by Department',
    nbins=20,
    opacity=0.7,
)
fig2.show()

# 3. 3D scatter: experience, salary and performance on the three axes
fig3 = px.scatter_3d(
    df,
    x='years_experience',
    y='salary',
    z='performance_score',
    color='department',
    size='salary',
    title='3D Employee Analytics: Experience, Salary, and Performance',
)
fig3.show()

print("🎯 Interactive visualizations created successfully!")

## 5. Web Scraping and API Requests

In [None]:
# Demonstrate web requests and data fetching, then parse a small HTML snippet.
print("🌐 Demonstrating Web Requests and APIs...")

# 1. Fetch some public API data (using JSONPlaceholder for demo)
# Network access may be unavailable inside a container, so every failure is
# reported and the cell keeps going instead of raising.
try:
    response = requests.get('https://jsonplaceholder.typicode.com/posts', timeout=10)
    if response.status_code == 200:
        posts_data = response.json()[:10]  # Get first 10 posts
        posts_df = pd.DataFrame(posts_data)
        print(f"✅ Successfully fetched {len(posts_df)} posts from API")
        display(posts_df[['id', 'title', 'userId']].head())
    else:
        print(f"❌ API request failed with status code: {response.status_code}")
except (requests.RequestException, ValueError) as e:
    # ValueError covers a body that is not valid JSON: older requests releases
    # raise a plain ValueError subclass from response.json(), which is not a
    # RequestException and would otherwise escape this handler.
    print(f"❌ Error fetching data: {e}")

# 2. Simple web scraping example (parsing HTML)
# The markup is inlined so this part works without any network access.
html_content = """
<html>
<body>
    <div class="container">
        <h1>Docker Workshop Data</h1>
        <ul class="tech-list">
            <li>Python 3.11</li>
            <li>Jupyter Notebooks</li>
            <li>pandas 2.0+</li>
            <li>Docker Containers</li>
            <li>VS Code Dev Containers</li>
        </ul>
    </div>
</body>
</html>
"""

soup = BeautifulSoup(html_content, 'html.parser')
tech_items = soup.find_all('li')
# strip=True trims surrounding whitespace/newlines so the numbered list below
# prints cleanly even if the markup is reformatted.
technologies = [item.get_text(strip=True) for item in tech_items]

print("\n🔧 Technologies used in this workshop:")
for i, tech in enumerate(technologies, 1):
    print(f"{i}. {tech}")

print("\n✅ Web scraping demonstration completed!")

## 6. Machine Learning with Scikit-learn

In [None]:
# Scikit-learn walkthrough: fit a random forest on the iris dataset, report
# its test-set performance, and plot feature importances and a confusion matrix.
print("Machine Learning Demonstration")

# Iris dataset: feature matrix X and integer class labels y
iris = load_iris()
X = iris.data
y = iris.target

# DataFrame of the features plus a readable species column (used for display)
iris_df = pd.DataFrame(X, columns=iris.feature_names)
iris_df['species'] = [iris.target_names[label] for label in y]

print(f"Iris dataset shape: {iris_df.shape}")
print("\nSpecies distribution:")
print(iris_df['species'].value_counts())

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the classifier on the training split and predict the held-out rows
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Per-class precision / recall / F1 on the test split
print("\nModel Performance:")
report = classification_report(y_test, y_pred, target_names=iris.target_names)
print(report)

# Feature importances, most important first
feature_importance = pd.DataFrame(
    {
        'feature': iris.feature_names,
        'importance': rf_model.feature_importances_,
    }
).sort_values(by='importance', ascending=False)

importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=importance_ax)
importance_ax.set_title('Feature Importance in Iris Classification')
importance_ax.set_xlabel('Importance Score')
importance_fig.tight_layout()
plt.show()

# Confusion matrix rendered as an interactive Plotly heatmap
cm = confusion_matrix(y_test, y_pred)
fig = px.imshow(
    cm,
    labels={'x': "Predicted", 'y': "Actual", 'color': "Count"},
    x=iris.target_names,
    y=iris.target_names,
    title="Confusion Matrix - Iris Classification",
)
fig.show()

print("Machine learning demonstration completed!")