Scrapping

In [35]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

# Configure Chrome to run in headless mode
options = Options()
options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)

# Navigate to the BBC Weather page for Tel Aviv
url = "https://www.bbc.com/weather/293397"
driver.get(url)

# Wait for the weather elements to load
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'wr-date__long'))
)

# Get the page source and parse it with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Find all day elements
days = soup.find_all('li', class_='wr-day')

# Initialize lists to store data
dates = []
max_temps = []
min_temps = []
weather_types = []

# Extract data from each day with error handling
for day in days:
    date_elem = day.find('span', class_='wr-date__long')
    max_temp_elem = day.find('span', class_='wr-value--temperature--c')
    min_temp_elem = day.find('span', class_='wr-temperature__low')
    weather_type_elem = day.find('span', class_='wr-day__weather-type-description')

    # Only extract text if the element is found
    date = date_elem.text.strip() if date_elem else 'N/A'
    max_temp = max_temp_elem.text.strip() if max_temp_elem else 'N/A'
    min_temp = min_temp_elem.text.strip() if min_temp_elem else 'N/A'
    weather_type = weather_type_elem.text.strip() if weather_type_elem else 'N/A'
    
    dates.append(date)
    max_temps.append(max_temp)
    min_temps.append(min_temp)
    weather_types.append(weather_type)

# Create a DataFrame
df = pd.DataFrame({
    'Date': dates,
    'Max Temp (°C)': max_temps,
    'Min Temp (°C)': min_temps,
    'Weather Type': weather_types
})

# Clean the data (skip N/A rows)
df = df[df['Max Temp (°C)'] != 'N/A']

# Convert temperatures to integers if possible
df['Max Temp (°C)'] = df['Max Temp (°C)'].str.replace('°', '').astype(int, errors='ignore')
df['Min Temp (°C)'] = df['Min Temp (°C)'].str.replace('°', '').astype(int, errors='ignore')

# Save to CSV
df.to_csv('tel_aviv_weather.csv', index=False)

# Close the browser
driver.quit()

# Print the DataFrame
print(df)















                        Date  Max Temp (°C) Min Temp (°C) Weather Type
0                        N/A             29           N/A          N/A
1     Tuesday 24th September             29           N/A          N/A
2   Wednesday 25th September             30           N/A          N/A
3    Thursday 26th September             29           N/A          N/A
4      Friday 27th September             29           N/A          N/A
5    Saturday 28th September             29           N/A          N/A
6      Sunday 29th September             30           N/A          N/A
7      Monday 30th September             30           N/A          N/A
8        Tuesday 1st October             29           N/A          N/A
9      Wednesday 2nd October             29           N/A          N/A
10      Thursday 3rd October             28           N/A          N/A
11        Friday 4th October             28           N/A          N/A
12      Saturday 5th October             28           N/A          N/A
13    

In [36]:
df.head()

Unnamed: 0,Date,Max Temp (°C),Min Temp (°C),Weather Type
0,,29,,
1,Tuesday 24th September,29,,
2,Wednesday 25th September,30,,
3,Thursday 26th September,29,,
4,Friday 27th September,29,,


Visualizations

In [29]:
df.dropna(subset=['Date'], inplace=True)
df = df.dropna(subset=['Weather Type'])
df.dropna(subset=['Date'], inplace=True)

In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup

# Load the data
df = pd.read_csv('/Users/manuel/Desktop/DI-Bootcamp/Week7_WebScrapping/D2/DailyChallenge/tel_aviv_weather.csv')

# Convert 'Date' to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%A %d %B', errors='coerce')

df = df.dropna(subset=['Weather Type'])
df.dropna(subset=['Date'], inplace=True)

weather_mode = df['Weather Type'].mode()
print(weather_mode)

# Calculate average temperature
df['Avg Temp (°C)'] = (df['Max Temp (°C)'] + df['Min Temp (°C)']) / 2

# Basic analysis
avg_max_temp = df['Max Temp (°C)'].mean()
avg_min_temp = df['Min Temp (°C)'].mean()
most_common_weather = df['Weather Type'].mode().values[0]

print(f"Average Max Temperature: {avg_max_temp:.1f}°C")
print(f"Average Min Temperature: {avg_min_temp:.1f}°C")
print(f"Most common weather type: {most_common_weather}")

# Visualization 1: Temperature trends
plt.figure(figsize=(12, 6))
plt.plot(df['Date'], df['Max Temp (°C)'], label='Max Temp')
plt.plot(df['Date'], df['Min Temp (°C)'], label='Min Temp')
plt.plot(df['Date'], df['Avg Temp (°C)'], label='Avg Temp')
plt.title('Temperature Trends in Tel Aviv')
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('temperature_trends.png')
plt.close()

# Visualization 2: Weather type distribution
plt.figure(figsize=(10, 6))
sns.countplot(y='Weather Type', data=df, order=df['Weather Type'].value_counts().index)
plt.title('Distribution of Weather Types')
plt.xlabel('Count')
plt.ylabel('Weather Type')
plt.tight_layout()
plt.savefig('weather_type_distribution.png')
plt.close()

# Visualization 3: Temperature range
plt.figure(figsize=(12, 6))
sns.boxplot(x='Date', y='value', hue='variable', 
            data=pd.melt(df[['Date', 'Max Temp (°C)', 'Min Temp (°C)']], ['Date']))
plt.title('Temperature Range in Tel Aviv')
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.xticks(rotation=45)
plt.legend(title='')
plt.tight_layout()
plt.savefig('temperature_range.png')
plt.close()

print("Analysis complete. Visualizations saved as PNG files.")

Series([], Name: Weather Type, dtype: float64)


IndexError: index 0 is out of bounds for axis 0 with size 0