26 changes: 26 additions & 0 deletions .gitignore
@@ -117,3 +117,29 @@ dmypy.json

# Pyre type checker
.pyre/

# Claude Code settings
.claude/*

# IDE files
.vscode/
.idea/
*.swp
*.swo
*~

# OS files
.DS_Store
Thumbs.db

# Model files and datasets (often too large for git)
*.h5
*.pkl
*.pickle
*.pt
*.pth
*.ckpt

# Temporary files
*.tmp
*.temp
3,748 changes: 3,748 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

83 changes: 83 additions & 0 deletions pyproject.toml
@@ -0,0 +1,83 @@
[tool.poetry]
name = "ml-nlp-projects"
version = "0.1.0"
description = "Machine Learning and NLP projects collection including chatbot, embeddings, machine translation, and text generation"
authors = ["Your Name <your.email@example.com>"]
readme = "README.md"
packages = [{include = "chatbot"}, {include = "embeddings"}, {include = "machine_translation"}, {include = "pos_tagging"}, {include = "sentiment_analysis"}, {include = "text_generation"}]

[tool.poetry.dependencies]
python = "^3.8"
tensorflow = "^2.13.0"
torch = "^2.0.0"
numpy = "^1.24.0"
pyyaml = "^6.0"
requests = "^2.31.0"

[tool.poetry.group.test.dependencies]
pytest = "^7.4.0"
pytest-cov = "^4.1.0"
pytest-mock = "^3.11.0"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py", "*_test.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
    "--strict-markers",
    "--strict-config",
    "--verbose",
    "--cov=chatbot",
    "--cov=embeddings",
    "--cov=machine_translation",
    "--cov=pos_tagging",
    "--cov=sentiment_analysis",
    "--cov=text_generation",
    "--cov-report=term-missing",
    "--cov-report=html:htmlcov",
    "--cov-report=xml:coverage.xml",
    "--cov-fail-under=80"
]
markers = [
    "unit: Unit tests",
    "integration: Integration tests",
    "slow: Tests that take a long time to run"
]

[tool.coverage.run]
source = ["chatbot", "embeddings", "machine_translation", "pos_tagging", "sentiment_analysis", "text_generation"]
omit = [
    "*/tests/*",
    "*/test_*.py",
    "*_test.py",
    "*/conftest.py",
    "*/__pycache__/*",
    "*/tf1/*",
    "*.txt"
]

[tool.coverage.report]
exclude_lines = [
    "pragma: no cover",
    "def __repr__",
    "if self.debug:",
    "if settings.DEBUG",
    "raise AssertionError",
    "raise NotImplementedError",
    "if 0:",
    "if __name__ == .__main__.:"
]
show_missing = true
skip_covered = false

[tool.coverage.html]
directory = "htmlcov"

[tool.coverage.xml]
output = "coverage.xml"
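
The `addopts` block enforces strict marker registration and an 80% coverage floor, and the `markers` table registers `unit`, `integration`, and `slow`. A minimal sketch of a test module that opts into these markers and the shared fixtures (the file name and tests below are hypothetical, not part of this change):

# tests/test_sample_config.py -- hypothetical example for illustration only
import pytest


@pytest.mark.unit
def test_sample_config_has_transformer_dims(sample_config):
    # sample_config is injected from tests/conftest.py
    assert sample_config['MODEL_SIZE'] == 512
    assert sample_config['NUM_LAYERS'] == 6


@pytest.mark.slow
def test_small_batch_shapes(small_batch_data):
    # Marked slow so it can be deselected with `-m "not slow"`
    assert small_batch_data['input_ids'].shape == (4, 10)
    assert small_batch_data['attention_mask'].shape == (4, 10)

Because `addopts` applies `--cov-fail-under=80` globally, running an isolated example like this would still trip the coverage gate unless `--no-cov` is passed.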
Empty file added tests/__init__.py
Empty file.
156 changes: 156 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,156 @@
import pytest
import tempfile
import shutil
import os
from pathlib import Path
import numpy as np
import tensorflow as tf
import torch


@pytest.fixture
def temp_dir():
    """Create a temporary directory for tests."""
    temp_dir = tempfile.mkdtemp()
    yield temp_dir
    shutil.rmtree(temp_dir)


@pytest.fixture
def sample_config():
    """Provide a sample configuration dictionary for testing."""
    return {
        'MODEL_SIZE': 512,
        'NUM_LAYERS': 6,
        'H': 8,
        'BATCH_SIZE': 32,
        'VOCAB_SIZE': 10000,
        'MAX_LENGTH': 128
    }


@pytest.fixture
def sample_text_data():
    """Provide sample text data for testing."""
    return [
        "Hello, how are you today?",
        "I am doing well, thank you.",
        "What is the weather like?",
        "It's sunny and warm outside.",
        "Would you like to go for a walk?"
    ]


@pytest.fixture
def sample_numpy_array():
    """Provide a sample numpy array for testing."""
    return np.random.rand(10, 5)


@pytest.fixture
def sample_tensorflow_tensor():
    """Provide a sample TensorFlow tensor for testing."""
    return tf.constant([[1, 2, 3], [4, 5, 6]], dtype=tf.float32)


@pytest.fixture
def sample_torch_tensor():
    """Provide a sample PyTorch tensor for testing."""
    return torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32)


@pytest.fixture
def mock_model_config():
    """Mock model configuration for testing."""
    return {
        'embedding_size': 128,
        'hidden_size': 256,
        'num_layers': 2,
        'dropout_rate': 0.1,
        'learning_rate': 0.001
    }


@pytest.fixture
def sample_dataset_info():
    """Sample dataset information for testing."""
    return {
        'vocab_size': 5000,
        'max_length': 50,
        'data_size': 1000,
        'num_classes': 2
    }


@pytest.fixture
def temp_checkpoint_dir(temp_dir):
    """Create a temporary checkpoint directory."""
    checkpoint_dir = os.path.join(temp_dir, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)
    return checkpoint_dir


@pytest.fixture
def sample_yaml_config(temp_dir):
    """Create a sample YAML config file for testing."""
    config_path = os.path.join(temp_dir, 'config.yml')
    config_content = """
MODEL_SIZE: 512
NUM_LAYERS: 6
H: 8
BATCH_SIZE: 32
"""
    with open(config_path, 'w') as f:
        f.write(config_content)
    return config_path


@pytest.fixture
def sample_text_file(temp_dir):
    """Create a sample text file for testing."""
    file_path = os.path.join(temp_dir, 'sample.txt')
    content = "This is a sample text file for testing purposes.\n"
    content += "It contains multiple lines of text.\n"
    content += "Each line can be used for different test scenarios.\n"

    with open(file_path, 'w') as f:
        f.write(content)
    return file_path


@pytest.fixture(autouse=True)
def reset_random_seeds():
    """Reset random seeds before each test for reproducibility."""
    np.random.seed(42)
    tf.random.set_seed(42)
    torch.manual_seed(42)


@pytest.fixture
def mock_tokenizer():
    """Mock tokenizer for testing."""
    class MockTokenizer:
        def __init__(self):
            self.vocab_size = 1000

        def encode(self, text):
            return [1, 2, 3, 4, 5]  # Mock encoding

        def decode(self, tokens):
            return "mock decoded text"

    return MockTokenizer()


@pytest.fixture
def small_batch_data():
    """Provide small batch of data for testing."""
    batch_size = 4
    sequence_length = 10
    vocab_size = 100

    return {
        'input_ids': np.random.randint(0, vocab_size, (batch_size, sequence_length)),
        'attention_mask': np.ones((batch_size, sequence_length)),
        'labels': np.random.randint(0, 2, (batch_size,))
    }
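
The fixtures above are injected by parameter name and are also visible to the `tests/integration/` package added below. A hedged sketch of an integration test that exercises `mock_tokenizer`, `sample_text_file`, and `temp_checkpoint_dir` (the module path and test names are assumptions for illustration, not part of this PR):

# tests/integration/test_pipeline_smoke.py -- hypothetical sketch
import os

import pytest


@pytest.mark.integration
def test_tokenize_text_file(mock_tokenizer, sample_text_file):
    # Both fixtures come from tests/conftest.py
    with open(sample_text_file) as f:
        text = f.read()
    tokens = mock_tokenizer.encode(text)
    assert tokens == [1, 2, 3, 4, 5]
    assert mock_tokenizer.decode(tokens) == "mock decoded text"


@pytest.mark.integration
def test_checkpoint_dir_is_writable(temp_checkpoint_dir):
    # temp_checkpoint_dir lives under temp_dir and is removed after the test
    path = os.path.join(temp_checkpoint_dir, 'model.ckpt')
    with open(path, 'w') as f:
        f.write('checkpoint stub')
    assert os.path.exists(path)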
Empty file added tests/integration/__init__.py
Empty file.