<a href="https://colab.research.google.com/github/GabeMaldonado/UoL_Study_Materials/blob/main/data_structuring_and_testing_(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Part 1: Restructuring Data

In the first part of this programming exercise, your goal is to recover the original format of the Pima Indian Diabetes dataset. Here, you are given the same data, but in a much less manageable form. You should use the Numpy, Scipy and / or Pandas packages to implement a modular (ie. function-based) pipeline for restructuring the data. The final result should be identical to the downloadble data.

You may have to look back at the data in pima-indians-diabetes.csv to figure out the format of the messy version here.

Avoid using outside tools like a text editor or a spreadsheet program. Instead, all your transformations should be done programmatically in a way that can be tested in Part 2.

In [1]:
import pandas as pd
import numpy as np


# You should read in this data and restructure it to make it identical to the
# pima-indians-diabetes.csv introduced in the previous topic.
messy_data = "/content/messy-pima-indians-diabetes.csv"

df = pd.read_csv(messy_data)

In [2]:
df.head().append(df.tail())

Unnamed: 0,Non-diabetic
0,times_pregnant6.0000
1,plasma_glucose_concentration148.0000
2,diastolic_blood_pressure72.0000
3,triceps_thickness35.0000
4,2_hour_serum_insulin0.0000
7828,BMI30.4000
7829,diabetes_pedigreen0.3150
7830,age23.0000
7831,diabetes0.0000
7832,times_pregnant1.0000


In [3]:
df['values'] = df['Non-diabetic'].str.extract('(\d+\.\d+)')
df.head(10)

Unnamed: 0,Non-diabetic,values
0,times_pregnant6.0000,6.0
1,plasma_glucose_concentration148.0000,148.0
2,diastolic_blood_pressure72.0000,72.0
3,triceps_thickness35.0000,35.0
4,2_hour_serum_insulin0.0000,0.0
5,BMI33.6000,33.6
6,diabetes_pedigreen0.6270,0.627
7,age50.0000,50.0
8,diabetes1.0000,1.0
9,times_pregnant6.0000,6.0


In [4]:
df['variables'] = df['Non-diabetic'].str.extract(r'(\d*\.?\D+)')
df.head()

Unnamed: 0,Non-diabetic,values,variables
0,times_pregnant6.0000,6.0,times_pregnant
1,plasma_glucose_concentration148.0000,148.0,plasma_glucose_concentration
2,diastolic_blood_pressure72.0000,72.0,diastolic_blood_pressure
3,triceps_thickness35.0000,35.0,triceps_thickness
4,2_hour_serum_insulin0.0000,0.0,2_hour_serum_insulin


In [5]:
df2 = df[["variables", "values"]]
df2.head().append(df2.tail())

Unnamed: 0,variables,values
0,times_pregnant,6.0
1,plasma_glucose_concentration,148.0
2,diastolic_blood_pressure,72.0
3,triceps_thickness,35.0
4,2_hour_serum_insulin,0.0
7828,BMI,30.4
7829,diabetes_pedigreen,0.315
7830,age,23.0
7831,diabetes,0.0
7832,times_pregnant,1.0


In [6]:
df_pivoted = df2.pivot(columns="variables", values="values")
df_pivoted.head(10)

variables,2_hour_serum_insulin,BMI,Diabetic,Participants,age,diabetes,diabetes_pedigreen,diastolic_blood_pressure,plasma_glucose_concentration,times_pregnant,triceps_thickness
0,,,,,,,,,,6.0,
1,,,,,,,,,148.0,,
2,,,,,,,,72.0,,,
3,,,,,,,,,,,35.0
4,0.0,,,,,,,,,,
5,,33.6,,,,,,,,,
6,,,,,,,0.627,,,,
7,,,,,50.0,,,,,,
8,,,,,,1.0,,,,,
9,,,,,,,,,,6.0,


In [7]:
# see which columns contain no data so they can be safely dropped
df_pivoted.describe()

variables,2_hour_serum_insulin,BMI,Diabetic,Participants,age,diabetes,diabetes_pedigreen,diastolic_blood_pressure,plasma_glucose_concentration,times_pregnant,triceps_thickness
count,768.0,768.0,0.0,0.0,768.0,768.0,768.0,768.0,768.0,1536.0,768.0
unique,186.0,248.0,0.0,0.0,52.0,2.0,517.0,47.0,136.0,17.0,51.0
top,0.0,32.0,,,22.0,0.0,0.254,70.0,99.0,1.0,0.0
freq,374.0,13.0,,,72.0,500.0,6.0,57.0,17.0,270.0,227.0


In [8]:
#drop columns Participant and Diabetic
df_pivoted.drop(columns=["Diabetic", "Participants"], inplace=True)
df_pivoted = df_pivoted.apply(pd.to_numeric)


In [9]:
df_pivoted.head()

variables,2_hour_serum_insulin,BMI,age,diabetes,diabetes_pedigreen,diastolic_blood_pressure,plasma_glucose_concentration,times_pregnant,triceps_thickness
0,,,,,,,,6.0,
1,,,,,,,148.0,,
2,,,,,,72.0,,,
3,,,,,,,,,35.0
4,0.0,,,,,,,,


In [10]:
# drop NaN on axis 0 --rows-- group by every other 10 columns and divide times pregnant by 2
# borrowed this approach from Daniel de Jesus Miranda
df_pivoted.dropna(how="all", axis=0, inplace=True)
df_pivoted = df_pivoted.groupby(np.arange(len(df_pivoted))//10).sum()
df_pivoted['times_pregnant'] = df_pivoted['times_pregnant'] / 2
df_pivoted.head(10)

variables,2_hour_serum_insulin,BMI,age,diabetes,diabetes_pedigreen,diastolic_blood_pressure,plasma_glucose_concentration,times_pregnant,triceps_thickness
0,0.0,33.6,50.0,1.0,0.627,72.0,148.0,6.0,35.0
1,0.0,23.3,32.0,1.0,0.672,64.0,183.0,8.0,0.0
2,168.0,43.1,33.0,1.0,2.288,40.0,137.0,0.0,35.0
3,88.0,31.0,26.0,1.0,0.248,50.0,78.0,3.0,32.0
4,543.0,30.5,53.0,1.0,0.158,70.0,197.0,2.0,45.0
5,0.0,0.0,54.0,1.0,0.232,96.0,125.0,8.0,0.0
6,0.0,38.0,34.0,1.0,0.537,74.0,168.0,10.0,0.0
7,846.0,30.1,59.0,1.0,0.398,60.0,189.0,1.0,23.0
8,175.0,25.8,51.0,1.0,0.587,72.0,166.0,5.0,19.0
9,0.0,30.0,32.0,1.0,0.484,0.0,100.0,7.0,0.0


In [11]:
# put all the above coe as a function

def clean_data(df):
  df['values'] = df['Non-diabetic'].str.extract('(\d+\.\d+)')
  df['variables'] = df['Non-diabetic'].str.extract(r'(\d*\.?\D+)')
  df2 = df[["variables", "values"]]
  
  df_pivoted = df2.pivot(columns="variables", values="values")
  df_pivoted.drop(columns=["Diabetic", "Participants"], inplace=True)
  df_pivoted = df_pivoted.apply(pd.to_numeric)
  df_pivoted.dropna(how="all", axis=0, inplace=True)
  df_pivoted = df_pivoted.groupby(np.arange(len(df_pivoted))//10).sum()
  df_pivoted['times_pregnant'] = df_pivoted['times_pregnant'] / 2
  return df_pivoted

In [12]:
df_2 = pd.read_csv(messy_data)

In [13]:
df_2.head(2)

Unnamed: 0,Non-diabetic
0,times_pregnant6.0000
1,plasma_glucose_concentration148.0000


In [14]:
df3 = clean_data(df_2)
df3.head()


variables,2_hour_serum_insulin,BMI,age,diabetes,diabetes_pedigreen,diastolic_blood_pressure,plasma_glucose_concentration,times_pregnant,triceps_thickness
0,0.0,33.6,50.0,1.0,0.627,72.0,148.0,6.0,35.0
1,0.0,23.3,32.0,1.0,0.672,64.0,183.0,8.0,0.0
2,168.0,43.1,33.0,1.0,2.288,40.0,137.0,0.0,35.0
3,88.0,31.0,26.0,1.0,0.248,50.0,78.0,3.0,32.0
4,543.0,30.5,53.0,1.0,0.158,70.0,197.0,2.0,45.0


## Part 2: Unit Testing

Below, are a series of simple arithmetic functions. Define class of test-cases for these functions that will adequately assure you they are working properly.

Hopefully you implemented Part 1 using a pipeline of functions. Here, you should design and implement unit tests for each function. Be sure to test edge-cases with values not neccesarily observed in the dataset. You may have to refer to the Python unittest package documentations: https://docs.python.org/3/library/unittest.html

In [24]:
import unittest
# Implement your tests here for the functions in the following cell.


class myTest(unittest.TestCase):

  def test_method1(self):
    self.assertEqual(clean_data(df_2).shape,(768,9))
    self.assertEqual(clean_data(df_2).duplicated().sum(), 0)
    self.assertEqual(clean_data(df_2)['BMI'].max(), 67.1)
    self.assertEqual(round(clean_data(df_2)['age'].mean(),2), 33.24)
if __name__ == '__main__':
    unittest.main(argv=[''], exit=False,verbosity=2)

#         ...
        
# Should you want to delete a test case from within Jupyter notebook,
# you can run the following code to remove the class from the set of
# global variables: 
#
# `del IncrTestCast`

test_method1 (__main__.myTest) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.282s

OK


In [39]:
def incr(x):
    return x + 1

def decr(x):
    return x - 1

def add(x,y):
    return x + y

def subt(x):
    return x - 1

def mult(x,y):
    return x * y

def divi(x, y):
    return x / y

# This strange Python simulates running your code as if it were executed
# from the command-line, instead of within a Notebook. All it does is 
# call the automatically generated main() function (which is usually
# wrapped in Jupyter) with an explicit argument array with one value.
#if __name__ == '__main__':
#    unittest.main(argv=[''], exit=False,verbosity=2)

In [41]:
class myTest(unittest.TestCase):

  def test_incr(self):
    self.assertEqual(incr(5), 6)
    self.assertEqual(incr(-5), -4)

  def test_decr(self):
    self.assertEqual(decr(5), 4)
    self.assertEqual(decr(-4), -5)

  def test_add(self):
    self.assertEqual(add(5, 4), 9)
    self.assertEqual(add(-1, 6), 5)

  def test_subt(self):
    self.assertEqual(subt(10), 9)
    self.assertEqual(subt(1), 0)

  def test_mult(self):
    self.assertEqual(mult(5, 5), 25)
    self.assertEqual(mult(-4, 5), -20)

  def test_divi(self):
    self.assertEqual(divi(36, 6), 6)
    self.assertEqual(divi(12, 3), 4)

if __name__ == '__main__':
    unittest.main(argv=[''], exit=False,verbosity=2)

test_add (__main__.myTest) ... ok
test_decr (__main__.myTest) ... ok
test_divi (__main__.myTest) ... ok
test_incr (__main__.myTest) ... ok
test_mult (__main__.myTest) ... ok
test_subt (__main__.myTest) ... ok

----------------------------------------------------------------------
Ran 6 tests in 0.008s

OK
