In [11]:
# Section 1: Importing and Loading Cleaned City Data into a DataFrame

import pandas as pd  # Importing the pandas library to handle data frames

# Step 1: Load the cleaned city data from a CSV file into a pandas DataFrame
# The file 'uscities.csv' contains city-level data.
city_df = pd.read_csv('C:/Users/Krish Patel/Desktop/Project/uscities_cleaned.csv')

#structure and content of the DataFrame.
print(city_df.head())


          city   city_ascii state_id  state_name  county_fips  county_name  \
0     New York     New York       NY    New York        36081       Queens   
1  Los Angeles  Los Angeles       CA  California         6037  Los Angeles   
2      Chicago      Chicago       IL    Illinois        17031         Cook   
3        Miami        Miami       FL     Florida        12086   Miami-Dade   
4      Houston      Houston       TX       Texas        48201       Harris   

       lat       lng  population  density  \
0  40.6943  -73.9249    18908608  11080.3   
1  34.1141 -118.4068    11922389   3184.7   
2  41.8375  -87.6866     8497759   4614.5   
3  25.7840  -80.2101     6080145   4758.9   
4  29.7860  -95.3885     5970127   1384.0   

                                                zips          id  
0  11229 11228 11226 11225 11224 11222 11221 1122...  1840034016  
1  91367 90291 90293 90292 91316 91311 90035 9003...  1840020491  
2  60018 60649 60641 60640 60643 60642 60645 6064...  18400

In [12]:
# Section 2: Filtering Massachusetts Cities from the City Data

# Step 1: Filter for cities located in Massachusetts
massachusetts_city_df = city_df[city_df['state_name'] == 'Massachusetts']

# Step 2: Rename for easier reference in analysis
mass_cities = massachusetts_city_df

# Display the first few rows to confirm filtering
print(mass_cities.head())

            city   city_ascii state_id     state_name  county_fips  \
9         Boston       Boston       MA  Massachusetts        25025   
89     Worcester    Worcester       MA  Massachusetts        25027   
100  Springfield  Springfield       MA  Massachusetts        25013   
289  New Bedford  New Bedford       MA  Massachusetts        25005   
379    Cambridge    Cambridge       MA  Massachusetts        25017   

    county_name      lat      lng  population  density  \
9       Suffolk  42.3188 -71.0852     4328315   5319.0   
89    Worcester  42.2705 -71.8079      486486   2110.2   
100     Hampden  42.1155 -72.5395      437752   1881.5   
289     Bristol  41.6697 -70.9428      153701   1942.4   
379   Middlesex  42.3759 -71.1185      117962   7119.9   

                                                  zips          id  
9    02120 02121 02122 02124 02125 02126 02127 0212...  1840000455  
89   01608 01610 01609 01602 01603 01604 01605 0160...  1840000434  
100  01151 01144 01118 

In [13]:
# Section 3: Fetching Monthly Average Weather Data for a city function 

from datetime import datetime
from meteostat import Point, Daily
import pandas as pd

def fetch_monthly_avg_weather_data(lat, lon, start_date, end_date):
    """
    Fetch monthly average weather data for a given latitude, longitude, and date range.
    
    Parameters:
        lat (float): Latitude of the location
        lon (float): Longitude of the location
        start_date (str): Start date in the format 'YYYY-MM-DD'
        end_date (str): End date in the format 'YYYY-MM-DD'
        
    Returns:
        pandas.DataFrame: DataFrame containing the monthly average weather data
    """
    # Convert start and end dates to datetime objects
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    
    # Create a Point object for the specified location
    location = Point(lat, lon)
    
    # Fetch daily weather data for the location and date range
    data = Daily(location, start, end)
    daily_data = data.fetch()
    
    # Handle case where no data is available
    if daily_data.empty:
        print(f"No weather data available for lat: {lat}, lon: {lon} in the given range.")
        return pd.DataFrame()
    
    # Resample to monthly averages and reset the index to include 'time'
    monthly_avg_data = daily_data.resample('ME').mean().reset_index()
    
    return monthly_avg_data

In [14]:
# Example check for Boston (latitude and longitude of Boston, MA)
lat = 42.3601
lon = -71.0589
start_date = '2015-01-01'
end_date = '2024-12-31'

# Fetch monthly average weather data
monthly_weather_data = fetch_monthly_avg_weather_data(lat, lon, start_date, end_date)

# Display the first few rows of the monthly average data
print(monthly_weather_data.head())

        time       tavg       tmin       tmax      prcp        snow  \
0 2015-01-31  -3.151613  -7.132258   0.670968  2.922581   84.838710   
1 2015-02-28  -6.942857 -12.039286  -2.253571  3.064286  864.642857   
2 2015-03-31   0.483871  -3.512903   4.961290  2.500000  569.032258   
3 2015-04-30   8.783333   4.580000  13.206667  1.933333   15.333333   
4 2015-05-31  16.487097  10.770968  22.880645  1.000000    0.000000   

         wdir       wspd  wpgt         pres        tsun  
0  262.103448  19.883871   NaN  1017.719355  264.290323  
1  262.600000  19.225000   NaN  1016.753571  274.821429  
2  275.181818  17.787097   NaN  1017.396774  358.290323  
3  180.076923  19.230000   NaN  1014.710000  412.866667  
4  180.107143  17.729032   NaN  1019.367742  565.612903  


In [15]:
# Section 4: Fetching Monthly Average Weather Data for  all cities in Massachusetts 

def fetch_monthly_avg_weather_for_mass_cities(mass_cities_df, start_date, end_date):
    """
    Fetch monthly average weather data for all cities in the mass_cities DataFrame.
    
    Parameters:
        mass_cities_df (pandas.DataFrame): DataFrame containing city information with 'lat' and 'lng' columns
        start_date (str): Start date in the format 'YYYY-MM-DD'
        end_date (str): End date in the format 'YYYY-MM-DD'
        
    Returns:
        pandas.DataFrame: Combined DataFrame containing monthly average weather data for all cities
    """
    all_weather_data = []  # List to store each city's monthly average weather data

    # Iterate over each city in the DataFrame
    for index, row in mass_cities_df.iterrows():
        city_name = row['city']
        lat = row['lat']
        lon = row['lng']
        
        print(f"Fetching monthly average weather data for {city_name} (Lat: {lat}, Lon: {lon})")
        
        # Fetch monthly average weather data for the current city
        weather_data = fetch_monthly_avg_weather_data(lat, lon, start_date, end_date)
        
        # Check if any data is returned; if empty, continue to the next city
        if weather_data.empty:
            print(f"No weather data available for {city_name}")
            continue
        
        # Add the city name to the weather data for identification
        weather_data['city'] = city_name
        
        # Append the city's data to the list
        all_weather_data.append(weather_data)
    
    # Concatenate all city weather data into a single DataFrame
    if all_weather_data:
        combined_weather_df = pd.concat(all_weather_data, axis=0)
    else:
        combined_weather_df = pd.DataFrame()  # Return an empty DataFrame if no data is fetched
    
    return combined_weather_df

In [16]:
# Section 5: Getting all cities Data from 2015 to 2024

# Start and end date for the weather data
start_date = '2015-01-01'
end_date = '2024-12-31'

# Assuming 'mass_cities' is your DataFrame containing the cities data
all_cities_weather = fetch_monthly_avg_weather_for_mass_cities(mass_cities, start_date, end_date)

print(all_cities_weather.head())



print("Weather data for all cities in Massachusetts saved")

Fetching monthly average weather data for Boston (Lat: 42.3188, Lon: -71.0852)
Fetching monthly average weather data for Worcester (Lat: 42.2705, Lon: -71.8079)
Fetching monthly average weather data for Springfield (Lat: 42.1155, Lon: -72.5395)
Fetching monthly average weather data for New Bedford (Lat: 41.6697, Lon: -70.9428)
Fetching monthly average weather data for Cambridge (Lat: 42.3759, Lon: -71.1185)
Fetching monthly average weather data for Lowell (Lat: 42.6389, Lon: -71.3217)
Fetching monthly average weather data for Leominster (Lat: 42.5209, Lon: -71.7717)
Fetching monthly average weather data for Brockton (Lat: 42.0821, Lon: -71.0242)
Fetching monthly average weather data for Quincy (Lat: 42.2506, Lon: -71.0187)
Fetching monthly average weather data for Lynn (Lat: 42.4781, Lon: -70.9664)
Fetching monthly average weather data for Fall River (Lat: 41.7136, Lon: -71.1015)
Fetching monthly average weather data for Newton (Lat: 42.3316, Lon: -71.2085)
Fetching monthly average wea

In [17]:
#Section 6: Rearranging the all_cities_weather Dataframe

all_cities_weather = all_cities_weather.reset_index(drop=True)

# Drop the 'level_0' column if it still exists
if 'level_0' in all_cities_weather.columns:
    all_cities_weather = all_cities_weather.drop(columns=['level_0'])

# Rename the 'index' column to 'time' if present
if 'index' in all_cities_weather.columns:
    all_cities_weather = all_cities_weather.rename(columns={'index': 'time'})

# Reorder the columns, ensuring 'city' is first and 'time' is second
columns_order = ['city', 'time', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun']
all_cities_weather = all_cities_weather[columns_order]

# Sort by 'city' and 'time' to ensure proper ordering
all_cities_weather = all_cities_weather.sort_values(by=['city', 'time'])

print(all_cities_weather)

print("Weather data saved with 'time' as a column and 'city' as the first column.")

                 city       time       tavg       tmin       tmax      prcp  \
8518  Acushnet Center 2015-01-31  -2.532258  -7.096774   1.438710  2.361905   
8519  Acushnet Center 2015-02-28  -6.432143 -12.496429  -1.453571  0.628571   
8520  Acushnet Center 2015-03-31   0.212903  -5.058065   4.945161  2.690909   
8521  Acushnet Center 2015-04-30   8.600000   3.396667  13.786667  0.855000   
8522  Acushnet Center 2015-05-31  15.706452   9.906452  21.829032  1.363333   
...               ...        ...        ...        ...        ...       ...   
6152    Yarmouth Port 2024-06-30  19.546667  15.670000  23.060000  2.440000   
6153    Yarmouth Port 2024-07-31  22.877419  20.012903  26.054839  2.087097   
6154    Yarmouth Port 2024-08-31  21.006452  16.719355  24.858065  1.674194   
6155    Yarmouth Port 2024-09-30  17.070000  12.813333  20.906667  5.000000   
6156    Yarmouth Port 2024-10-31  12.807692   7.876923  17.184615  2.100000   

      snow        wdir       wspd  wpgt         pre

In [18]:
# Section 7: Average of Cities' Weather as a Whole for Massachusetts

# Step 1: Group by 'time' column (monthly) and calculate the average for each weather metric across all cities
# This calculates the average weather data for each month across all Massachusetts cities.
massachusetts_monthly_avg = monthly_weather_data.groupby('time').mean().reset_index()

# Step 2: Display the first few rows to check the resulting DataFrame
# The result provides a view of the overall monthly weather trends for Massachusetts as a whole.
print(massachusetts_monthly_avg.head())

        time       tavg       tmin       tmax      prcp        snow  \
0 2015-01-31  -3.151613  -7.132258   0.670968  2.922581   84.838710   
1 2015-02-28  -6.942857 -12.039286  -2.253571  3.064286  864.642857   
2 2015-03-31   0.483871  -3.512903   4.961290  2.500000  569.032258   
3 2015-04-30   8.783333   4.580000  13.206667  1.933333   15.333333   
4 2015-05-31  16.487097  10.770968  22.880645  1.000000    0.000000   

         wdir       wspd  wpgt         pres        tsun  
0  262.103448  19.883871   NaN  1017.719355  264.290323  
1  262.600000  19.225000   NaN  1016.753571  274.821429  
2  275.181818  17.787097   NaN  1017.396774  358.290323  
3  180.076923  19.230000   NaN  1014.710000  412.866667  
4  180.107143  17.729032   NaN  1019.367742  565.612903  


In [19]:
# Section 8: Data Cleaning and Preparation for Massachusetts Monthly Weather Data

# Import the calendar module to convert month numbers to names
import calendar

# Step 1: Extract 'year' and 'month' from the 'time' column in the weather DataFrame
# This will help us analyze the data by year and month more easily.
massachusetts_monthly_avg['year'] = pd.DatetimeIndex(massachusetts_monthly_avg['time']).year
massachusetts_monthly_avg['month'] = pd.DatetimeIndex(massachusetts_monthly_avg['time']).month

# Step 2: Remove the 'time' column as it's no longer needed
# After extracting the 'year' and 'month', we can drop 'time' to clean up the DataFrame.
massachusetts_monthly_avg = massachusetts_monthly_avg.drop(columns=['time'])

# Step 3: Create a new column 'month_name' based on the 'month' column
# This converts the month numbers (1-12) into their corresponding full month names (e.g., January, February).
massachusetts_monthly_avg['month_name'] = massachusetts_monthly_avg['month'].apply(lambda x: calendar.month_name[x])

# Step 4: Reorder the columns to put 'year', 'month', and 'month_name' at the front
# This makes it easier to view and analyze the data by year and month.
columns_order = ['year', 'month', 'month_name', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun']
massachusetts_monthly_avg = massachusetts_monthly_avg[columns_order]

# Step 5: Convert the temperature values from Celsius to Fahrenheit
# Applying the conversion formula (°F = °C * 9/5 + 32) to all temperature columns ('tavg', 'tmin', 'tmax').
massachusetts_monthly_avg['tavg'] = massachusetts_monthly_avg['tavg'] * 9/5 + 32
massachusetts_monthly_avg['tmin'] = massachusetts_monthly_avg['tmin'] * 9/5 + 32
massachusetts_monthly_avg['tmax'] = massachusetts_monthly_avg['tmax'] * 9/5 + 32

# Step 6: Display the modified DataFrame to check the results
# We print the first few rows to verify that the changes (removing 'time', adding 'month_name', and converting temperatures) were applied correctly.
print(massachusetts_monthly_avg.head())



   year  month month_name       tavg       tmin       tmax      prcp  \
0  2015      1    January  26.327097  19.161935  33.207742  2.922581   
1  2015      2   February  19.502857  10.329286  27.943571  3.064286   
2  2015      3      March  32.870968  25.676774  40.930323  2.500000   
3  2015      4      April  47.810000  40.244000  55.772000  1.933333   
4  2015      5        May  61.676774  51.387742  73.185161  1.000000   

         snow        wdir       wspd  wpgt         pres        tsun  
0   84.838710  262.103448  19.883871   NaN  1017.719355  264.290323  
1  864.642857  262.600000  19.225000   NaN  1016.753571  274.821429  
2  569.032258  275.181818  17.787097   NaN  1017.396774  358.290323  
3   15.333333  180.076923  19.230000   NaN  1014.710000  412.866667  
4    0.000000  180.107143  17.729032   NaN  1019.367742  565.612903  


In [20]:
# Section 9: Saving the Cleaned Massachusetts Monthly Weather Data to CSV

# Define the path to save the file
file_path = 'C:/Users/Krish Patel/Desktop/Project/massachusetts_monthly_weather_data.csv'

# Save the DataFrame to a CSV file
# This will save the cleaned and prepared monthly weather data with year, month, and converted temperatures.
massachusetts_monthly_avg.to_csv(file_path, index=False)

# Print confirmation message
print(f"Data saved successfully to {file_path}")


Data saved successfully to C:/Users/Krish Patel/Desktop/Project/massachusetts_monthly_weather_data.csv
