<a href="https://colab.research.google.com/github/Akshayus29/Assignments/blob/main/data_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **batch process**

In [None]:
import os
import time
from datetime import datetime
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
sns.get_dataset_names()  # list the names of seaborn's example datasets

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [None]:
def extract():
  """Load seaborn's 'tips' example dataset, log when it was read, and return it.

  Returns:
    The DataFrame returned by ``sns.load_dataset('tips')``.
  """
  tips = sns.load_dataset('tips')
  extracted_at = datetime.now()
  print(f"data extract at {extracted_at} . it's shape is {tips.shape}")
  return tips



def transform (df):
  """Return a cleaned copy of ``df`` and log the resulting shape.

  Rows containing any missing value are removed first, then exact duplicate
  rows are removed. The frame passed in is not modified.

  Args:
    df: DataFrame to clean.

  Returns:
    The cleaned DataFrame.
  """
  cleaned = df.dropna().drop_duplicates()
  transformed_at = datetime.now()
  print(f"data transformed at {transformed_at} . it's shape is {cleaned.shape}")
  return cleaned

def load(df,out_dir = "output_dir",run_number = None):
  """Summarise the cleaned frame and write the summary to a one-row CSV file.

  Args:
    df: DataFrame providing the "total_bill", "tip" and "size" columns.
    out_dir: Directory for the summary file; created if it does not exist.
    run_number: Optional run identifier. When given (0 included) it is stored
      in the summary and embedded in the file name.

  Returns:
    A ``(summary_path, summary)`` tuple: the path of the CSV that was written
    and the dict of metrics stored in it.
  """
  os.makedirs(out_dir,exist_ok=True)
  total_revenue = float(df["total_bill"].sum())
  avg_tip = float(df['tip'].mean())
  avg_party_size = float(df['size'].mean())

  now_to = datetime.now()
  name_parts = ["summary"]
  # Compare against None so that run number 0 is still tagged.
  if run_number is not None:
    name_parts.append(f"run_{run_number}")
  # Keep spaces and colons out of the name: colons are invalid in Windows file names.
  name_parts.append(now_to.strftime("%Y%m%d_%H%M%S_%f"))
  summary_file_name = "_".join(name_parts) + ".csv"
  summary_path = os.path.join(out_dir,summary_file_name)

  summary = {
      "run_time" : now_to,
      "run_number" : run_number,
      "total_revenue" : total_revenue,
      "avg_tip" : avg_tip,
      "avg_party_size" : avg_party_size,
      "rows_processed" : len(df)}

  # Single-row frame: the default integer index carries no information, so omit it.
  pd.DataFrame([summary]).to_csv(summary_path,index=False)
  print(f"data loaded at {summary_path}")
  return summary_path,summary


# Run the three batch stages once, end to end: extract -> transform -> load.
data_loaded = extract()
data_tranformed = transform(data_loaded)
# load() returns the path of the written summary file and the summary dict.
data_loaded_path,summary = load(data_tranformed)




data extract at 2025-12-10 07:35:06.600755 . it's shape is (244, 7)
data transformed at 2025-12-10 07:35:06.622381 . it's shape is (243, 7)
data loaded at output_dir/summary__2025-12-10 07:35:06.623170


In [None]:
def run_pipeline(run_number = None):
  """Run one extract -> transform -> load pass.

  Args:
    run_number: Optional identifier of this run; forwarded to ``load`` and
      echoed in the completion message.

  Returns:
    The ``(summary_path, summary)`` pair produced by ``load``.
  """
  data_loaded = extract()
  data_tranformed = transform(data_loaded)
  # Forward the run number; without it every summary is written as an untagged run.
  data_loaded_path , summary = load(data_tranformed, run_number = run_number)
  print(f"execution run - {run_number} has completed")
  return data_loaded_path, summary

def scheduled_pipeline(interval = 10, max_runs = 5):
  """Run the pipeline repeatedly, pausing between consecutive runs.

  Runs are numbered from 0 and the number is passed to ``run_pipeline``.

  Args:
    interval: Seconds to sleep between one run and the next.
    max_runs: Total number of runs to execute.
  """
  run_count = 0
  while run_count < max_runs:
    run_pipeline(run_count)
    run_count += 1
    # Pause only when another run follows; sleeping after the last run
    # would just delay completion.
    if run_count < max_runs:
      time.sleep(interval)
  print("all the numbers completed")

# Kick off five pipeline runs with a 10-second sleep interval.
scheduled_pipeline(interval = 10,max_runs = 5)


data extract at 2025-12-10 07:35:14.959559 . it's shape is (244, 7)
data transformed at 2025-12-10 07:35:14.963526 . it's shape is (243, 7)
data loaded at output_dir/summary__2025-12-10 07:35:14.963998
execution run - 0 has completed
data extract at 2025-12-10 07:35:24.971363 . it's shape is (244, 7)
data transformed at 2025-12-10 07:35:24.973801 . it's shape is (243, 7)
data loaded at output_dir/summary__2025-12-10 07:35:24.974515
execution run - 1 has completed
data extract at 2025-12-10 07:35:34.980618 . it's shape is (244, 7)
data transformed at 2025-12-10 07:35:34.983046 . it's shape is (243, 7)
data loaded at output_dir/summary__2025-12-10 07:35:34.983720
execution run - 2 has completed
data extract at 2025-12-10 07:35:44.989853 . it's shape is (244, 7)
data transformed at 2025-12-10 07:35:44.992388 . it's shape is (243, 7)
data loaded at output_dir/summary__2025-12-10 07:35:44.992742
execution run - 3 has completed
data extract at 2025-12-10 07:35:54.998712 . it's shape is (244,

**new data**

In [None]:
import requests

# Ask the randomuser.me API for 50 records (results=50) limited to the
# nationalities listed in the nat parameter.
url = "https://randomuser.me/api/?results=50&nat=us,gb,ca,au,fr,dk"
# A timeout stops the cell from hanging indefinitely if the service does not answer.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of later with a confusing JSON/KeyError.
response.raise_for_status()

data = response.json()
# Flatten the nested records under "results" into dotted columns (e.g. name.first).
df = pd.json_normalize(data["results"])
df


Unnamed: 0,gender,email,phone,cell,nat,name.title,name.first,name.last,location.street.number,location.street.name,...,login.sha256,dob.date,dob.age,registered.date,registered.age,id.name,id.value,picture.large,picture.medium,picture.thumbnail
0,female,victoria.chavez@example.com,01-9698-8125,0476-749-223,AU,Mrs,Victoria,Chavez,6536,Woodland St,...,9fd03e3b42175202aa1e36aff222fb89c6cb23b1cf9ba3...,1947-09-17T20:16:51.763Z,78,2010-07-14T17:26:39.480Z,15,TFN,679798220,https://randomuser.me/api/portraits/women/55.jpg,https://randomuser.me/api/portraits/med/women/...,https://randomuser.me/api/portraits/thumb/wome...
1,male,philippe.lo@example.com,R29 K17-0920,P52 U49-1076,CA,Mr,Philippe,Lo,8470,Bay Ave,...,03a1f396714e1c276607b50d8e6d88907da951b50383e6...,1993-01-29T14:15:53.541Z,32,2008-06-10T22:31:02.994Z,17,SIN,505154344,https://randomuser.me/api/portraits/men/38.jpg,https://randomuser.me/api/portraits/med/men/38...,https://randomuser.me/api/portraits/thumb/men/...
2,male,gustav.johansen@example.com,99124195,34754939,DK,Mr,Gustav,Johansen,4557,Mellemgade,...,8e7fb509bfef9066efc854748d9fb56f0d5d68abb055ab...,1981-10-19T23:14:42.973Z,44,2004-06-21T00:48:48.696Z,21,CPR,191081-3763,https://randomuser.me/api/portraits/men/44.jpg,https://randomuser.me/api/portraits/med/men/44...,https://randomuser.me/api/portraits/thumb/men/...
3,male,justin.rogers@example.com,016973 26455,07247 301754,GB,Mr,Justin,Rogers,8672,Brick Kiln Road,...,9883232f7944380d5f4a76a9ff3978b7445d7b95aa429d...,1986-06-16T14:24:05.996Z,39,2007-08-11T03:44:38.543Z,18,NINO,AS 35 04 05 L,https://randomuser.me/api/portraits/men/70.jpg,https://randomuser.me/api/portraits/med/men/70...,https://randomuser.me/api/portraits/thumb/men/...
4,female,juliette.tremblay@example.com,Z54 B03-5579,E51 F89-0385,CA,Ms,Juliette,Tremblay,8957,Bay Ave,...,4ab6fcd916eb12e1d522af0af63ea0918deed55b5e75c0...,1986-07-18T07:25:58.721Z,39,2005-09-09T02:04:42.032Z,20,SIN,195800651,https://randomuser.me/api/portraits/women/11.jpg,https://randomuser.me/api/portraits/med/women/...,https://randomuser.me/api/portraits/thumb/wome...
5,female,lola.roberts@example.com,015394 76885,07231 444829,GB,Miss,Lola,Roberts,2058,Kingsway,...,3c98250ce4598f01764f6c48b392e72aa2018d6ffdb13d...,1947-07-16T10:41:45.687Z,78,2006-04-09T03:50:19.208Z,19,NINO,EJ 90 54 14 I,https://randomuser.me/api/portraits/women/94.jpg,https://randomuser.me/api/portraits/med/women/...,https://randomuser.me/api/portraits/thumb/wome...
6,female,gertrude.russell@example.com,01-1043-9306,0438-703-899,AU,Miss,Gertrude,Russell,4576,Locust Rd,...,33b9245eb289584f16dbb7de38b1e444e44fb00efbaab6...,1983-09-01T01:50:26.934Z,42,2020-02-21T04:50:28.515Z,5,TFN,298491072,https://randomuser.me/api/portraits/women/21.jpg,https://randomuser.me/api/portraits/med/women/...,https://randomuser.me/api/portraits/thumb/wome...
7,male,ben.welch@example.com,(668) 929-7736,(952) 671-5205,US,Mr,Ben,Welch,902,Valley View Ln,...,f05f6375e3ad566a5010c7b778a08a621aa883084248e5...,1946-10-24T19:45:58.003Z,79,2014-12-04T09:09:44.663Z,11,SSN,375-64-0113,https://randomuser.me/api/portraits/men/84.jpg,https://randomuser.me/api/portraits/med/men/84...,https://randomuser.me/api/portraits/thumb/men/...
8,male,blake.fortin@example.com,W36 G71-6443,I79 C20-5623,CA,Mr,Blake,Fortin,6739,St. Lawrence Ave,...,65821d2b7d5c65de1d366e8fad4e535cf0a48878dd48e1...,1967-09-14T11:17:58.230Z,58,2016-12-20T21:43:17.950Z,8,SIN,873866123,https://randomuser.me/api/portraits/men/17.jpg,https://randomuser.me/api/portraits/med/men/17...,https://randomuser.me/api/portraits/thumb/men/...
9,male,kevin.holmes@example.com,09-7263-9886,0480-682-086,AU,Mr,Kevin,Holmes,3443,Wheeler Ridge Dr,...,78a1c08c18f58d6eb36f837b548ccdeba5e2c6ad43831a...,1993-05-19T13:13:48.279Z,32,2014-03-12T18:42:17.880Z,11,TFN,238703547,https://randomuser.me/api/portraits/men/19.jpg,https://randomuser.me/api/portraits/med/men/19...,https://randomuser.me/api/portraits/thumb/men/...


## **streaming process**

In [None]:
import pandas as pd

# Weather observations CSV, read directly from its raw GitHub URL.
WEATHER_CSV_URL = "https://raw.githubusercontent.com/velicki/Weather_Data_Analysis_Project/main/Weather_Data.csv"
df = pd.read_csv(WEATHER_CSV_URL)
df

Unnamed: 0,Date/Time,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa,Weather
0,1/1/2012 0:00,-1.8,-3.9,86,4,8.0,101.24,Fog
1,1/1/2012 1:00,-1.8,-3.7,87,4,8.0,101.24,Fog
2,1/1/2012 2:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"
3,1/1/2012 3:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog"
4,1/1/2012 4:00,-1.5,-3.3,88,7,4.8,101.23,Fog
...,...,...,...,...,...,...,...,...
8779,12/31/2012 19:00,0.1,-2.7,81,30,9.7,100.13,Snow
8780,12/31/2012 20:00,0.2,-2.4,83,24,9.7,100.03,Snow
8781,12/31/2012 21:00,-0.5,-1.5,93,28,4.8,99.95,Snow
8782,12/31/2012 22:00,-0.2,-1.8,89,28,9.7,99.91,Snow


In [None]:
df.shape  # (number of rows, number of columns)

(8784, 8)

In [None]:
df.columns  # inspect the column labels

Index(['Date/Time', 'Temp_C', 'Dew Point Temp_C', 'Rel Hum_%',
       'Wind Speed_km/h', 'Visibility_km', 'Press_kPa', 'Weather'],
      dtype='object')

In [None]:
df.info()  # per-column dtype and non-null count

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Date/Time         8784 non-null   object 
 1   Temp_C            8784 non-null   float64
 2   Dew Point Temp_C  8784 non-null   float64
 3   Rel Hum_%         8784 non-null   int64  
 4   Wind Speed_km/h   8784 non-null   int64  
 5   Visibility_km     8784 non-null   float64
 6   Press_kPa         8784 non-null   float64
 7   Weather           8784 non-null   object 
dtypes: float64(4), int64(2), object(2)
memory usage: 549.1+ KB


In [None]:
# Normalise the headers: trim surrounding whitespace, then turn every space
# and every slash into an underscore.
df.columns = [
    label.strip().replace(' ', '_').replace('/', '_')
    for label in df.columns
]
df.columns

Index(['Date_Time', 'Temp_C', 'Dew_Point_Temp_C', 'Rel_Hum_%',
       'Wind_Speed_km_h', 'Visibility_km', 'Press_kPa', 'Weather'],
      dtype='object')

In [None]:
df.Weather.unique()  # distinct values in the Weather column

array(['Fog', 'Freezing Drizzle,Fog', 'Mostly Cloudy', 'Cloudy', 'Rain',
       'Rain Showers', 'Mainly Clear', 'Snow Showers', 'Snow', 'Clear',
       'Freezing Rain,Fog', 'Freezing Rain', 'Freezing Drizzle',
       'Rain,Snow', 'Moderate Snow', 'Freezing Drizzle,Snow',
       'Freezing Rain,Snow Grains', 'Snow,Blowing Snow', 'Freezing Fog',
       'Haze', 'Rain,Fog', 'Drizzle,Fog', 'Drizzle',
       'Freezing Drizzle,Haze', 'Freezing Rain,Haze', 'Snow,Haze',
       'Snow,Fog', 'Snow,Ice Pellets', 'Rain,Haze', 'Thunderstorms,Rain',
       'Thunderstorms,Rain Showers', 'Thunderstorms,Heavy Rain Showers',
       'Thunderstorms,Rain Showers,Fog', 'Thunderstorms',
       'Thunderstorms,Rain,Fog',
       'Thunderstorms,Moderate Rain Showers,Fog', 'Rain Showers,Fog',
       'Rain Showers,Snow Showers', 'Snow Pellets', 'Rain,Snow,Fog',
       'Moderate Rain,Fog', 'Freezing Rain,Ice Pellets,Fog',
       'Drizzle,Ice Pellets,Fog', 'Drizzle,Snow', 'Rain,Ice Pellets',
       'Drizzle,Snow,Fog', 

In [None]:
# Weather labels that the categorisation step tags as 'moderate'.
# Matching is by exact string, so each combined label is listed explicitly.
moderate = [
    "Drizzle",
    "Drizzle,Fog",
    "Drizzle,Ice Pellets,Fog",
    "Drizzle,Snow",
    "Drizzle,Snow,Fog",
    "Rain",
    "Rain,Fog",
    "Rain,Haze",
    "Rain,Snow",
    "Rain,Snow,Fog",
    "Rain,Snow Grains",
    "Rain,Snow,Ice Pellets",
    "Rain,Ice Pellets",
    "Rain Showers",
    "Rain Showers,Fog",
    "Rain Showers,Snow Showers",
    "Moderate Rain,Fog",
    "Snow",
    "Snow,Fog",
    "Snow,Haze",
    "Snow Showers",
    "Snow Showers,Fog",
    "Snow,Ice Pellets",
    "Snow Pellets",
    "Moderate Snow",
    "Freezing Fog"
]

# Weather labels that the categorisation step tags as 'severe'.
severe = [
    "Freezing Rain",
    "Freezing Drizzle",
    "Freezing Drizzle,Fog",
    "Freezing Drizzle,Snow",
    "Freezing Drizzle,Haze",
    "Freezing Rain,Fog",
    "Freezing Rain,Haze",
    "Freezing Rain,Snow Grains",
    "Freezing Rain,Ice Pellets,Fog",
    "Snow,Blowing Snow",
    "Moderate Snow,Blowing Snow",
    "Thunderstorms",
    "Thunderstorms,Rain",
    "Thunderstorms,Rain Showers",
    "Thunderstorms,Heavy Rain Showers",
    "Thunderstorms,Rain Showers,Fog",
    "Thunderstorms,Rain,Fog",
    "Thunderstorms,Moderate Rain Showers,Fog"
]

# Weather labels that the categorisation step tags as 'mild'.
mild = [
    "Clear",
    "Mainly Clear",
    "Mostly Cloudy",
    "Cloudy",
    "Fog",
    "Haze"
]

In [None]:
def _weather_category(label):
  """Return the bucket name for a Weather label, or 'unknown' if it is unlisted.

  The lists are checked in the order moderate, severe, mild.
  """
  if label in moderate:
    return 'moderate'
  if label in severe:
    return 'severe'
  if label in mild:
    return 'mild'
  return 'unknown'

# Tag every row with the bucket of its Weather label.
df['category'] = df['Weather'].apply(_weather_category)
df

Unnamed: 0,Date/Time,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa,Weather,category
0,1/1/2012 0:00,-1.8,-3.9,86,4,8.0,101.24,Fog,mild
1,1/1/2012 1:00,-1.8,-3.7,87,4,8.0,101.24,Fog,mild
2,1/1/2012 2:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog",severe
3,1/1/2012 3:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog",severe
4,1/1/2012 4:00,-1.5,-3.3,88,7,4.8,101.23,Fog,mild
...,...,...,...,...,...,...,...,...,...
8779,12/31/2012 19:00,0.1,-2.7,81,30,9.7,100.13,Snow,moderate
8780,12/31/2012 20:00,0.2,-2.4,83,24,9.7,100.03,Snow,moderate
8781,12/31/2012 21:00,-0.5,-1.5,93,28,4.8,99.95,Snow,moderate
8782,12/31/2012 22:00,-0.2,-1.8,89,28,9.7,99.91,Snow,moderate


In [None]:
import time

def alert_high_temp(row, threshold = 3):
  """Report one reading and say whether it breaches the temperature threshold.

  Args:
    row: Mapping-like record providing 'Temp_C'; 'category' is also read
      when no alert is raised.
    threshold: Temperature above which an alert is raised. Defaults to 3,
      the value that used to be hard-coded.

  Returns:
    True if ``row['Temp_C']`` is strictly greater than ``threshold``,
    otherwise False.
  """
  temperature = row['Temp_C']
  if temperature > threshold:
    print(f"alert!!! Temperature is high:{temperature}")
    return True
  print(f"Temperature is {temperature} and category is {row['category']}")
  return False


def stream_with_alert(df,delay = 1):
  """Feed the rows of ``df`` to ``alert_high_temp`` one at a time.

  Iteration stops at the first row for which ``alert_high_temp`` returns True.

  Args:
    df: DataFrame whose rows are replayed in order.
    delay: Seconds to wait after each row that does not raise an alert.
  """
  for _, row in df.iterrows():
    if alert_high_temp(row):
      print("Stopping stream due to high temperature!")
      break
    # The sleep belongs inside the loop so that rows are paced one by one;
    # at function level it ran only once, after the loop had finished.
    time.sleep(delay)
# Replay the frame through the alerting stream with the default delay.
stream_with_alert(df)


Temperature is -1.8 and category is mild
Temperature is -1.8 and category is mild
Temperature is -1.8 and category is severe
Temperature is -1.5 and category is severe
Temperature is -1.5 and category is mild
Temperature is -1.4 and category is mild
Temperature is -1.5 and category is mild
Temperature is -1.4 and category is mild
Temperature is -1.4 and category is mild
Temperature is -1.3 and category is mild
Temperature is -1.0 and category is mild
Temperature is -0.5 and category is mild
Temperature is -0.2 and category is mild
Temperature is 0.2 and category is mild
Temperature is 0.8 and category is mild
Temperature is 1.8 and category is mild
Temperature is 2.6 and category is mild
Temperature is 3.0 and category is mild
alert!!! Temperature is high:3.8
Stopping stream due to high temperature!
