26 changes: 26 additions & 0 deletions .gitignore
@@ -117,3 +117,29 @@ dmypy.json

# Pyre type checker
.pyre/

# Claude Code settings
.claude/*

# IDE files
.vscode/
.idea/
*.swp
*.swo
*~

# OS files
.DS_Store
Thumbs.db

# Model files and datasets (often too large for git)
*.h5
*.pkl
*.pickle
*.pt
*.pth
*.ckpt

# Temporary files
*.tmp
*.temp
3,748 changes: 3,748 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

83 changes: 83 additions & 0 deletions pyproject.toml
@@ -0,0 +1,83 @@
[tool.poetry]
name = "ml-nlp-projects"
version = "0.1.0"
description = "Machine Learning and NLP projects collection including chatbot, embeddings, machine translation, and text generation"
authors = ["Your Name <your.email@example.com>"]
readme = "README.md"
packages = [{include = "chatbot"}, {include = "embeddings"}, {include = "machine_translation"}, {include = "pos_tagging"}, {include = "sentiment_analysis"}, {include = "text_generation"}]

[tool.poetry.dependencies]
python = "^3.8"
tensorflow = "^2.13.0"
torch = "^2.0.0"
numpy = "^1.24.0"
pyyaml = "^6.0"
requests = "^2.31.0"

[tool.poetry.group.test.dependencies]
pytest = "^7.4.0"
pytest-cov = "^4.1.0"
pytest-mock = "^3.11.0"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py", "*_test.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
    "--strict-markers",
    "--strict-config",
    "--verbose",
    "--cov=chatbot",
    "--cov=embeddings",
    "--cov=machine_translation",
    "--cov=pos_tagging",
    "--cov=sentiment_analysis",
    "--cov=text_generation",
    "--cov-report=term-missing",
    "--cov-report=html:htmlcov",
    "--cov-report=xml:coverage.xml",
    "--cov-fail-under=80"
]
markers = [
    "unit: Unit tests",
    "integration: Integration tests",
    "slow: Tests that take a long time to run"
]

[tool.coverage.run]
source = ["chatbot", "embeddings", "machine_translation", "pos_tagging", "sentiment_analysis", "text_generation"]
omit = [
    "*/tests/*",
    "*/test_*.py",
    "*_test.py",
    "*/conftest.py",
    "*/__pycache__/*",
    "*/tf1/*",
    "*.txt"
]

[tool.coverage.report]
exclude_lines = [
    "pragma: no cover",
    "def __repr__",
    "if self.debug:",
    "if settings.DEBUG",
    "raise AssertionError",
    "raise NotImplementedError",
    "if 0:",
    "if __name__ == .__main__.:"
]
show_missing = true
skip_covered = false

[tool.coverage.html]
directory = "htmlcov"

[tool.coverage.xml]
output = "coverage.xml"
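
The `addopts` block enforces strict marker registration and an 80% coverage floor, and the `markers` table registers `unit`, `integration`, and `slow`. A minimal sketch of a test module that opts into these markers and the shared fixtures (the file name and tests below are hypothetical, not part of this change):

# tests/test_sample_config.py -- hypothetical example for illustration only
import pytest


@pytest.mark.unit
def test_sample_config_has_transformer_dims(sample_config):
    # sample_config is injected from tests/conftest.py
    assert sample_config['MODEL_SIZE'] == 512
    assert sample_config['NUM_LAYERS'] == 6


@pytest.mark.slow
def test_small_batch_shapes(small_batch_data):
    # Marked slow so it can be deselected with `-m "not slow"`
    assert small_batch_data['input_ids'].shape == (4, 10)
    assert small_batch_data['attention_mask'].shape == (4, 10)

Because `addopts` applies `--cov-fail-under=80` globally, running an isolated example like this would still trip the coverage gate unless `--no-cov` is passed.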
Empty file added tests/__init__.py
Empty file.
156 changes: 156 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,156 @@
import pytest
import tempfile
import shutil
import os
from pathlib import Path
import numpy as np
import tensorflow as tf
import torch


@pytest.fixture
def temp_dir():
    """Create a temporary directory for tests."""
    temp_dir = tempfile.mkdtemp()
    yield temp_dir
    shutil.rmtree(temp_dir)


@pytest.fixture
def sample_config():
    """Provide a sample configuration dictionary for testing."""
    return {
        'MODEL_SIZE': 512,
        'NUM_LAYERS': 6,
        'H': 8,
        'BATCH_SIZE': 32,
        'VOCAB_SIZE': 10000,
        'MAX_LENGTH': 128
    }


@pytest.fixture
def sample_text_data():
    """Provide sample text data for testing."""
    return [
        "Hello, how are you today?",
        "I am doing well, thank you.",
        "What is the weather like?",
        "It's sunny and warm outside.",
        "Would you like to go for a walk?"
    ]


@pytest.fixture
def sample_numpy_array():
    """Provide a sample numpy array for testing."""
    return np.random.rand(10, 5)


@pytest.fixture
def sample_tensorflow_tensor():
    """Provide a sample TensorFlow tensor for testing."""
    return tf.constant([[1, 2, 3], [4, 5, 6]], dtype=tf.float32)


@pytest.fixture
def sample_torch_tensor():
    """Provide a sample PyTorch tensor for testing."""
    return torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32)


@pytest.fixture
def mock_model_config():
    """Mock model configuration for testing."""
    return {
        'embedding_size': 128,
        'hidden_size': 256,
        'num_layers': 2,
        'dropout_rate': 0.1,
        'learning_rate': 0.001
    }


@pytest.fixture
def sample_dataset_info():
    """Sample dataset information for testing."""
    return {
        'vocab_size': 5000,
        'max_length': 50,
        'data_size': 1000,
        'num_classes': 2
    }


@pytest.fixture
def temp_checkpoint_dir(temp_dir):
    """Create a temporary checkpoint directory."""
    checkpoint_dir = os.path.join(temp_dir, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)
    return checkpoint_dir


@pytest.fixture
def sample_yaml_config(temp_dir):
    """Create a sample YAML config file for testing."""
    config_path = os.path.join(temp_dir, 'config.yml')
    config_content = """
MODEL_SIZE: 512
NUM_LAYERS: 6
H: 8
BATCH_SIZE: 32
"""
    with open(config_path, 'w') as f:
        f.write(config_content)
    return config_path


@pytest.fixture
def sample_text_file(temp_dir):
    """Create a sample text file for testing."""
    file_path = os.path.join(temp_dir, 'sample.txt')
    content = "This is a sample text file for testing purposes.\n"
    content += "It contains multiple lines of text.\n"
    content += "Each line can be used for different test scenarios.\n"

    with open(file_path, 'w') as f:
        f.write(content)
    return file_path


@pytest.fixture(autouse=True)
def reset_random_seeds():
    """Reset random seeds before each test for reproducibility."""
    np.random.seed(42)
    tf.random.set_seed(42)
    torch.manual_seed(42)


@pytest.fixture
def mock_tokenizer():
    """Mock tokenizer for testing."""
    class MockTokenizer:
        def __init__(self):
            self.vocab_size = 1000

        def encode(self, text):
            return [1, 2, 3, 4, 5]  # Mock encoding

        def decode(self, tokens):
            return "mock decoded text"

    return MockTokenizer()


@pytest.fixture
def small_batch_data():
    """Provide small batch of data for testing."""
    batch_size = 4
    sequence_length = 10
    vocab_size = 100

    return {
        'input_ids': np.random.randint(0, vocab_size, (batch_size, sequence_length)),
        'attention_mask': np.ones((batch_size, sequence_length)),
        'labels': np.random.randint(0, 2, (batch_size,))
    }
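
The fixtures above are injected by parameter name and are also visible to the `tests/integration/` package added below. A hedged sketch of an integration test that exercises `mock_tokenizer`, `sample_text_file`, and `temp_checkpoint_dir` (the module path and test names are assumptions for illustration, not part of this PR):

# tests/integration/test_pipeline_smoke.py -- hypothetical sketch
import os

import pytest


@pytest.mark.integration
def test_tokenize_text_file(mock_tokenizer, sample_text_file):
    # Both fixtures come from tests/conftest.py
    with open(sample_text_file) as f:
        text = f.read()
    tokens = mock_tokenizer.encode(text)
    assert tokens == [1, 2, 3, 4, 5]
    assert mock_tokenizer.decode(tokens) == "mock decoded text"


@pytest.mark.integration
def test_checkpoint_dir_is_writable(temp_checkpoint_dir):
    # temp_checkpoint_dir lives under temp_dir and is removed after the test
    path = os.path.join(temp_checkpoint_dir, 'model.ckpt')
    with open(path, 'w') as f:
        f.write('checkpoint stub')
    assert os.path.exists(path)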
Empty file added tests/integration/__init__.py
Empty file.