<a href="https://colab.research.google.com/github/Christelleelkhoury/Data-Engineering/blob/main/Integrating_Weather_Data_Into_a_Sales_Dataset_Using_APIs_and_MongoDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Step 1: Extract Sales Data from CSV

In [23]:
import pandas as pd

url="https://raw.githubusercontent.com/DrManalJalloul/Introduction-to-Data-Engineering/refs/heads/main/sales_data.csv"
sales_data = pd.read_csv(url)

# Display the first few rows of the DataFrame
sales_data.head()

Unnamed: 0,date,product_id,sales_amount,store_location
0,2025-02-05,P001,150,New York
1,2025-02-05,P002,300,Los Angeles
2,2025-02-05,P003,450,Chicago
3,2025-02-05,P004,600,Houston
4,2025-02-05,P005,750,Seattle


##Step 2: Fetch Weather Data from the API

In [27]:
import requests
def fetch_weather_data(city, date, api_key):
    base_url = f"https://api.openweathermap.org/data/2.5/weather?q={city}&appid={api_key}"
    response = requests.get(base_url)
    #print(response)
    data = response.json()
    #print(data)
    # Extract temperature, humidity, and weather description
    temperature = data['main']['temp'] - 273.15 # Convert from Kelvin to Celsius
    humidity = data['main']['humidity']
    weather_description = data['weather'][0]['description']
    return temperature, humidity, weather_description
# Example usage:
api_key = 'a80d93b249212d6a569bced3f615ddbc'

temp, humidity, description = fetch_weather_data('New York', '2025-02-01', api_key)
print(f"Temp: {temp:.1f}°C, Humidity: {humidity}%, Weather: {description}")

Temp: 1.0°C, Humidity: 45%, Weather: overcast clouds


##Step 3: Combine Weather Data with Sales Data

In [28]:
# Loop through each row of the sales_data dataframe, call the function and update the dataframe with weather data
for index, row in sales_data.iterrows():
    temp, humidity, description = fetch_weather_data(row["store_location"], row["date"], api_key)

    # Store the values in the dataframe
    sales_data.at[index, "Temperature (°C)"] = temp
    sales_data.at[index, "Humidity (%)"] = humidity
    sales_data.at[index, "Weather Description"] = description

    # Print the formatted weather data for each iteration
    print(f"{temp:.1f}°C, Humidity: {humidity}%, Weather: {description}")


1.0°C, Humidity: 45%, Weather: overcast clouds
17.3°C, Humidity: 68%, Weather: clear sky
-1.1°C, Humidity: 85%, Weather: overcast clouds
29.0°C, Humidity: 65%, Weather: broken clouds
3.9°C, Humidity: 88%, Weather: broken clouds
1.0°C, Humidity: 45%, Weather: overcast clouds
17.3°C, Humidity: 68%, Weather: clear sky
1.0°C, Humidity: 45%, Weather: overcast clouds
17.3°C, Humidity: 68%, Weather: clear sky
-1.1°C, Humidity: 85%, Weather: overcast clouds
29.0°C, Humidity: 65%, Weather: broken clouds
3.9°C, Humidity: 88%, Weather: broken clouds
1.0°C, Humidity: 45%, Weather: overcast clouds
17.3°C, Humidity: 68%, Weather: clear sky
1.0°C, Humidity: 45%, Weather: overcast clouds
17.3°C, Humidity: 68%, Weather: clear sky
-1.1°C, Humidity: 85%, Weather: overcast clouds
29.0°C, Humidity: 65%, Weather: broken clouds
3.9°C, Humidity: 88%, Weather: broken clouds
1.0°C, Humidity: 45%, Weather: overcast clouds
17.3°C, Humidity: 68%, Weather: clear sky
1.0°C, Humidity: 45%, Weather: overcast clouds
17

In [30]:
sales_data.head()

Unnamed: 0,date,product_id,sales_amount,store_location,Temperature (°C),Humidity (%),Weather Description
0,2025-02-05,P001,150,New York,1.02,45.0,overcast clouds
1,2025-02-05,P002,300,Los Angeles,17.29,68.0,clear sky
2,2025-02-05,P003,450,Chicago,-1.08,85.0,overcast clouds
3,2025-02-05,P004,600,Houston,28.99,65.0,broken clouds
4,2025-02-05,P005,750,Seattle,3.85,88.0,broken clouds


##Step 4: Load the Integrated Data into MongoDB

In [16]:
pip install pymongo

Collecting pymongo
  Downloading pymongo-4.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.11


In [31]:
from pymongo import MongoClient
from datetime import datetime

connection_string="mongodb+srv://christelleelkhoury:wCjkSnPEBpqb4Riu@cluster0.jqola.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
# Connect to the MongoDB Atlas cluster
client = MongoClient(connection_string)

# Access a specific database
db = client['WeatherData']

In [32]:
# Access a the collection sales within the database
sales = db['SalesData']


sales_dict = sales_data.to_dict(orient="records")

# Insert the sales data into MongoDB
sales.insert_many(sales_dict)

InsertManyResult([ObjectId('67a7c63b926f49613ae26a14'), ObjectId('67a7c63b926f49613ae26a15'), ObjectId('67a7c63b926f49613ae26a16'), ObjectId('67a7c63b926f49613ae26a17'), ObjectId('67a7c63b926f49613ae26a18'), ObjectId('67a7c63b926f49613ae26a19'), ObjectId('67a7c63b926f49613ae26a1a'), ObjectId('67a7c63b926f49613ae26a1b'), ObjectId('67a7c63b926f49613ae26a1c'), ObjectId('67a7c63b926f49613ae26a1d'), ObjectId('67a7c63b926f49613ae26a1e'), ObjectId('67a7c63b926f49613ae26a1f'), ObjectId('67a7c63b926f49613ae26a20'), ObjectId('67a7c63b926f49613ae26a21'), ObjectId('67a7c63b926f49613ae26a22'), ObjectId('67a7c63b926f49613ae26a23'), ObjectId('67a7c63b926f49613ae26a24'), ObjectId('67a7c63b926f49613ae26a25'), ObjectId('67a7c63b926f49613ae26a26'), ObjectId('67a7c63b926f49613ae26a27'), ObjectId('67a7c63b926f49613ae26a28'), ObjectId('67a7c63b926f49613ae26a29'), ObjectId('67a7c63b926f49613ae26a2a'), ObjectId('67a7c63b926f49613ae26a2b'), ObjectId('67a7c63b926f49613ae26a2c'), ObjectId('67a7c63b926f49613ae26a