## Notebook form my blog post "How to write unit test for data science code"

###You can read it [here](https://armandolivares.tech/)

## Simple Example

In [7]:
import unittest
import pandas as pd
import numpy as np
from scipy import stats


def mean(numbers):
    return sum(numbers) / len(numbers)


class TestMean(unittest.TestCase):
    def test_mean(self):
        self.assertEqual(mean([1, 2, 3]), 2)
        self.assertEqual(mean([1, 1, 1]), 1)
        self.assertEqual(mean([1, 2, 3, 4]), 2.5)


#if you will be save the snippet code to a file you need to uncomment the code below:

# if __name__ == '__main__':
#     unittest.main()


## Missing Value Example

In [8]:
def check_missing_values(data):
    # Check for missing values in data
    missing_values = data.isnull().sum()
    return missing_values

In [9]:


class TestCheckMissingValues(unittest.TestCase):
    def test_missing_values(self):
        # Create sample dataset
        data = pd.DataFrame({'col1': [1, 2, 3], 'col2': [4, None, 6]})
        
        # Check for missing values
        missing_values = check_missing_values(data)
        
        # Assert that the correct number of missing values is returned
        self.assertTrue((missing_values).equals(pd.Series({'col1': 0, 'col2': 1})))


#if you will be save the snippet code to a file you need to uncomment the code below:

# if __name__ == '__main__':
#     unittest.main()



## All code EDA example

In [10]:


class TestDataScienceCode(unittest.TestCase):
  def setUp(self):
    #define a setUp method that is called before each test method 
    #self.data = pd.read_csv('data.csv')
    self.data = pd.DataFrame(np.random.randint(0, 100, size=(100, 10)), columns=list('ABCDEFGHIJ'))

  def test_data_dimensions(self):
      #check that the shape of the data is (100, 10)
      self.assertEqual(self.data.shape, (100, 10))

  def test_data_types(self):
      #check that the data types are either int64 or float64
      for col in self.data.columns:
          self.assertTrue(self.data[col].dtype in [np.int64, np.float64])

  def test_missing_values(self):
      #check that there are no missing values
      self.assertFalse(self.data.isnull().values.any())

  def test_numerical_ranges(self):
      #method to test the numerical ranges
      for col in self.data.columns:
          if self.data[col].dtype in [np.int64, np.float64]:
              self.assertTrue(self.data[col].min() >= 0)
              self.assertTrue(self.data[col].max() <= 100)

  def test_column_distributions(self):
      #method to test the column distributions
      for col in self.data.columns:
          if self.data[col].dtype in [np.int64, np.float64]:
              self.assertFalse(self.data[col].skew() > 1)
              self.assertFalse(self.data[col].kurtosis() > 10)

  def test_column_correlations(self):
      #method to test the column correlations
      corr_matrix = self.data.corr()
      for col in self.data.columns:
          if self.data[col].dtype in [np.int64, np.float64]:
              self.assertTrue(abs(corr_matrix[col].min()) <= 0.5)
              self.assertTrue(abs(corr_matrix[col].max()) <= 0.5)

  def test_data_normality(self):
      #Test to ensure data is normally distributed
      for col in self.data.columns:
          if self.data[col].dtype in [np.int64, np.float64]:
              #calculate the p-value of the Shapiro-Wilk test for each column
              _, p_value = stats.shapiro(self.data[col])
              self.assertTrue(p_value > 0.05)

  def test_summary_statistics(self):
      # check that summary statistics are as expected
      summary_stats = self.data.describe()
      for col in self.data.columns:
          if self.data[col].dtype in [np.int64, np.float64]:
              #check that the mean of each column is equal to the mean in the summary statistics
              self.assertAlmostEqual(self.data[col].mean(), summary_stats[col]['mean'])

              #check that the median of each column is equal to the median in the summary statistics
              self.assertAlmostEqual(self.data[col].median(), summary_stats[col]['50%'])



#if you will be save the snippet code to a file you need to uncomment the code below:

# if __name__ == '__main__':
#     unittest.main()


# To run the test from the notebook:

In [12]:

#!python -m unittest
unittest.main(argv=[''], verbosity=2, exit=False)

test_missing_values (__main__.TestCheckMissingValues) ... ok
test_column_correlations (__main__.TestDataScienceCode) ... FAIL
test_column_distributions (__main__.TestDataScienceCode) ... ok
test_data_dimensions (__main__.TestDataScienceCode) ... ok
test_data_normality (__main__.TestDataScienceCode) ... FAIL
test_data_types (__main__.TestDataScienceCode) ... ok
test_missing_values (__main__.TestDataScienceCode) ... ok
test_numerical_ranges (__main__.TestDataScienceCode) ... ok
test_summary_statistics (__main__.TestDataScienceCode) ... ok
test_mean (__main__.TestMean) ... ok

FAIL: test_column_correlations (__main__.TestDataScienceCode)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-10-3b10e6dce4e3>", line 40, in test_column_correlations
    self.assertTrue(abs(corr_matrix[col].max()) <= 0.5)
AssertionError: False is not true

FAIL: test_data_normality (__main__.TestDataScienceCode)
------------------------

<unittest.main.TestProgram at 0x7f577aa86ca0>