In [2]:
import numpy as np
import pandas as pd
import glob as g
from datetime import datetime

In [3]:
x = np.ones((2,2))
x

array([[1., 1.],
       [1., 1.]])

# 1. Pipeline

In [1]:
import pandas as pd
from abc import ABC, abstractmethod

In [2]:
class DataProcessingStrategy(ABC):
    @abstractmethod
    def apply(self, data: pd.DataFrame) -> pd.DataFrame:
        pass

In [3]:
class DropMissingDataStrategy(DataProcessingStrategy):
    def apply(self, data: pd.DataFrame) -> pd.DataFrame:
        return data.dropna()

In [4]:
class StandardizeDataStrategy(DataProcessingStrategy):
    def apply(self, data:pd.DataFrame)-> pd.DataFrame:
        return (data - data.mean()) / data.std()

In [5]:
class DataPipeline:
    def __init__(self) -> None:
        self.strategies = []

    def add_strategy(self, strategy:DataProcessingStrategy):
        self.strategies.append(strategy)

    def process(self, data: pd.DataFrame) -> pd.DataFrame:
        for strategy in self.strategies:
            data = strategy.apply(data)

        return data

In [6]:
pipeline = DataPipeline()
pipeline.add_strategy(DropMissingDataStrategy())
pipeline.add_strategy(StandardizeDataStrategy())

In [7]:
data = pd.DataFrame({
    "A":[1, 2, 3, None, 5],
    "B":[5, 4, 2, 1, 3]
})
pipeline.process(data)

Unnamed: 0,A,B
0,-1.024695,1.161895
1,-0.439155,0.387298
2,0.146385,-1.161895
4,1.317465,-0.387298


# 2. Data Extraction

In [None]:
def extractFromCsv(filePath):
    df = pd.read_csv(filePath)

    return df


def extractFromJson(filePath):
    df = pd.read_json(filePath)

    return df

def extractFromExcel(filePath):
    df = pd.read_excel(filePath)

    return df

In [4]:
datetime.today()
datetime.today().year

2024

In [None]:
def extract():
    extracted_data = pd.DataFrame(columns=['col1', 'col2'])

    for i in g.glob("Raw_Data/*.csv"):
        extracted_data = extracted_data.append(extractFromCsv(i), ignore_index = True)

    for i in g.glob("Raw_Data/*.csv"):
        extracted_data = extracted_data.append(extractFromCsv(i), ignore_index = True)

    for i in g.glob("Raw_Data/*.csv"):
        extracted_data = extracted_data.append(extractFromCsv(i), ignore_index = True)

    return extracted_data

# 3. Data Transformation

In [None]:
def transform(data):
    data['col1'] = round(data['price'], 2)

    return data

# 4. Data load

In [None]:
def load(targetFile, data_to_load):
    data_to_load.to_csv(targetFile)

# 5. ETL Execution

In [None]:
extracted_data = extract()
transformed_data = transform(extracted_data)
load_data = load("transform_data/new_data.csv",transformed_data)

In [11]:
# Import libraries
import os
import time

# Insert the directory path in here
path = './'

# Extracting all the contents in the directory corresponding to path
l_files = os.listdir(path)

# Iterating over all the files
for file in l_files:

# Instantiating the path of the file
	file_path = f'{path}\\{file}'
	print(f'check file {file}')
	print(f'check {file_path}')

	# Checking whether the given file is a directory or not
	if os.path.isfile(file_path):
		try:
			# Printing the file pertaining to file_path
			os.startfile(file_path, 'print')
			print(f'Printing {file}')

			time.sleep(5)
		except:
			# Catching if any error occurs and alerting the user
			print(f'ALERT: {file} could not be printed! Please check\
			the associated softwares, or the file type.')
	else:
		print(f'ALERT: {file} is not a file, so can not be printed!')

print('Task finished!')


check file main.ipynb
check ./\main.ipynb
ALERT: main.ipynb is not a file, so can not be printed!
check file .DS_Store
check ./\.DS_Store
ALERT: .DS_Store is not a file, so can not be printed!
check file DL-Foundations
check ./\DL-Foundations
ALERT: DL-Foundations is not a file, so can not be printed!
check file file2.json
check ./\file2.json
ALERT: file2.json is not a file, so can not be printed!
check file file1.csv
check ./\file1.csv
ALERT: file1.csv is not a file, so can not be printed!
check file Data-Engineering
check ./\Data-Engineering
ALERT: Data-Engineering is not a file, so can not be printed!
check file ML-Foundations
check ./\ML-Foundations
ALERT: ML-Foundations is not a file, so can not be printed!
Task finished!
