In [2]:
import pandas as pd

# Load the A-Z medicines dataset of India from disk into the DataFrame df_med.
df_med = pd.read_csv(
    "../data/data_files/A_Z_medicines_dataset_of_India.csv",
    encoding="utf-8",
)

# Preview the top rows of the freshly loaded frame.
df_med.head()

Unnamed: 0,id,name,price,Is_discontinued,manufacturer_name,type,pack_size_label,short_composition1,short_composition2
0,1,Augmentin 625 Duo Tablet,223.42,False,Glaxo SmithKline Pharmaceuticals Ltd,allopathy,strip of 10 tablets,Amoxycillin (500mg),Clavulanic Acid (125mg)
1,2,Azithral 500 Tablet,132.36,False,Alembic Pharmaceuticals Ltd,allopathy,strip of 5 tablets,Azithromycin (500mg),
2,3,Ascoril LS Syrup,118.0,False,Glenmark Pharmaceuticals Ltd,allopathy,bottle of 100 ml Syrup,Ambroxol (30mg/5ml),Levosalbutamol (1mg/5ml)
3,4,Allegra 120mg Tablet,218.81,False,Sanofi India Ltd,allopathy,strip of 10 tablets,Fexofenadine (120mg),
4,5,Avil 25 Tablet,10.96,False,Sanofi India Ltd,allopathy,strip of 15 tablets,Pheniramine (25mg),


In [3]:
# Inspect the dimensions of df_med as a (number_of_rows, number_of_columns) tuple
df_med.shape

(253973, 9)

In [4]:
# Summarise the numeric columns of df_med: count, mean, std, min, quartiles and max
df_med.describe()

Unnamed: 0,id,price
count,253973.0,253973.0
mean,126987.0,270.530844
std,73315.834296,3029.584134
min,1.0,0.0
25%,63494.0,48.0
50%,126987.0,79.0
75%,190480.0,140.0
max,253973.0,436000.0


In [7]:
# List the column labels of df_med as a plain Python list
df_med.columns.tolist()

['id',
 'name',
 'price',
 'Is_discontinued',
 'manufacturer_name',
 'type',
 'pack_size_label',
 'short_composition1',
 'short_composition2']

In [8]:
class DataCleaner:
    """
    Load a CSV file, validate its layout against a fixed benchmark,
    normalise its column names and write the result to a new CSV file.

    ``load_data()`` must be called before ``validation_fn``,
    ``clean_column_names`` or ``write_csv``: until then ``self.data`` is None.
    """

    def __init__(self, file_path: str) -> None:
        """
        Initialize the DataCleaner object.

        Parameters:
        - file_path (str): The file path of the CSV file.
        """
        self.file_path = file_path
        # Filled in by load_data(); None means nothing has been loaded yet.
        self.data = None

    def load_data(self) -> None:
        """
        Read the CSV file at ``self.file_path`` into ``self.data``.
        """
        self.data = pd.read_csv(self.file_path)

    def benchmark(self):
        """
        Provide the benchmark values the dataset is validated against.

        Returns: no_of_columns, list_of_headers
        - no_of_columns (int): The expected number of columns.
        - list_of_headers (list): The expected header names, in order.
        """
        no_of_columns = 9
        list_of_headers = ['id', 'name', 'price', 'Is_discontinued',
                           'manufacturer_name', 'type', 'pack_size_label',
                           'short_composition1', 'short_composition2']
        return no_of_columns, list_of_headers

    def validation_fn(self, no_of_columns: int, list_of_headers: list):
        """
        Check three properties of the loaded data file:
        i)   delimiter check      - more than one column was parsed (a file
                                    read with the wrong delimiter would
                                    typically end up as a single column)
        ii)  header content check - the headers equal list_of_headers,
                                    including their order
        iii) no. of columns check - the column count equals no_of_columns

        Parameters:
        - no_of_columns (int): The expected number of columns in the dataset.
        - list_of_headers (list): A list of expected headers in the dataset.

        Returns: delimiter_check, header_content_check, no_of_columns_check
        (three booleans, True when the corresponding check passes)
        """
        delimiter_check = len(self.data.columns) > 1
        header_content_check = list_of_headers == list(self.data)
        no_of_columns_check = len(self.data.columns) == no_of_columns
        return delimiter_check, header_content_check, no_of_columns_check

    def clean_column_names(self) -> None:
        """
        Normalise the column names of ``self.data`` in place:
        i)   lower-case every column name
        ii)  replace spaces inside column names with underscores

        Labels that are not strings (e.g. integer headers) are left unchanged
        instead of raising a TypeError.
        """
        # One pass with a concrete list: avoids handing pandas a one-shot
        # iterator and uses a literal (non-regex) space replacement.
        self.data.columns = [
            label.lower().replace(" ", "_") if isinstance(label, str) else label
            for label in self.data.columns
        ]

    def write_csv(self, output_file) -> None:
        """
        Save the cleaned data to a CSV file (without the index column).

        The parent directory of ``output_file`` is created when it does not
        exist yet, so the export also works on a fresh checkout.

        Parameters:
        - output_file (str): The file path of the output CSV file.
        """
        import os  # local import: only this method needs it

        # Only path-like targets have a directory to prepare; anything else
        # is passed straight through to pandas unchanged.
        if isinstance(output_file, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(output_file))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        self.data.to_csv(output_file, index=False)
        print(f'Saved cleaned data to {output_file}')


In [9]:
if __name__ == '__main__':

    # Point the cleaner at the raw medicines dataset and read it into memory.
    cleaner = DataCleaner('../data/data_files/A_Z_medicines_dataset_of_India.csv')
    cleaner.load_data()

    # Expected schema (column count and header names) to validate against.
    no_of_columns, list_of_headers = cleaner.benchmark()

    # Compare the loaded file with the expected schema.
    delimiter_check, header_content_check, no_of_columns_check = cleaner.validation_fn(
        no_of_columns, list_of_headers
    )

    # The validation gate lives inside the __main__ guard because it reads
    # names that are only bound above; at module level it would raise a
    # NameError whenever this code is imported instead of run directly.
    if delimiter_check and header_content_check and no_of_columns_check:
        # Normalise the column names, then persist the cleaned dataset.
        cleaner.clean_column_names()
        cleaner.write_csv('../data/clean_files/clean_medicines_dataset.csv')
    else:
        # Report which checks failed rather than skipping the export silently.
        failed_checks = [
            check_name
            for check_name, passed in (
                ('delimiter', delimiter_check),
                ('header content', header_content_check),
                ('number of columns', no_of_columns_check),
            )
            if not passed
        ]
        print(f"Validation failed ({', '.join(failed_checks)}); cleaned file was not written.")
    

Saved cleaned data to ../data/clean_files/clean_medicines_dataset.csv
