In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from scipy.stats import linregress

# Import API key
from config import weather_api_key

# Incorporated citipy to determine city based on latitude and longitude
from citipy import citipy

# Output file (CSV) path, relative to the notebook's working directory.
# NOTE(review): this name is not referenced by any later cell in this notebook;
# the CSV export cell writes to the literal "../output_data/Cities.csv", which
# differs from this path in letter case — confirm which one is intended.
output_data_file = "../output_data/cities.csv"

# (min, max) bounds for the random latitude and longitude samples drawn below
lat_range = (-90, 90)
lng_range = (-180, 180)

In [None]:
 # List for holding lat_lngs and cities
# Create a set of random lat and lng combinations.
# NOTE(review): no random seed is set, so every run samples different
# coordinates and therefore a different list of cities.
lats = np.random.uniform(lat_range[0], lat_range[1], size=1500)
lngs = np.random.uniform(lng_range[0], lng_range[1], size=1500)
lat_lngs = zip(lats, lngs)

# Identify the nearest city for each lat, lng combination and keep each city
# name once. dict.fromkeys removes duplicates while preserving first-seen
# order, so the result matches an "append if not already present" loop
# without rescanning the whole list for every coordinate pair.
cities = list(dict.fromkeys(
    citipy.nearest_city(lat_lng[0], lat_lng[1]).city_name
    for lat_lng in lat_lngs
))

# Print the city count to confirm sufficient count
len(cities)

In [None]:
# Performing API call to retrieve the current weather for every city,
# printing the number and name of each city while it is being processed.
url="https://api.openweathermap.org/data/2.5/weather?"
lat=[]
lng=[]
temp=[]
humidity=[]
clouds=[]
wind=[]
country=[]
date=[]
# Result lists, in the same order as the values extracted for each city below
result_lists=(lat, lng, temp, humidity, clouds, wind, country, date)
for city_number, city in enumerate(cities, start=1):
    try:
        # Let requests build the query string so that city names containing
        # spaces or reserved characters (&, #, ...) are encoded correctly.
        # The timeout keeps one stalled request from hanging the whole run.
        response = requests.get(
            url,
            params={"q": city, "units": "imperial", "appid": weather_api_key},
            timeout=10,
        ).json()
        # Read every field BEFORE appending anything: if a later key is
        # missing, no list has been touched yet, so all lists stay the same
        # length as `cities` (required to build the DataFrame afterwards).
        city_values=(
            response['coord']['lat'],
            response['coord']['lon'],
            response['main']['temp_max'],
            response['main']['humidity'],
            response['clouds']['all'],
            response['wind']['speed'],
            response['sys']['country'],
            response['dt'],
        )
    except (requests.exceptions.RequestException, KeyError, TypeError, ValueError):
        # Network failure, non-JSON body, or a payload without the expected
        # keys: record a placeholder row, which the later
        # dropna step removes, and continue with the next city.
        city_values=(pd.NaT,)*len(result_lists)
        print("City not found. Processing further...")
    else:
        print(f"Processing city number {city_number}, name: {city}")
    for result_list, value in zip(result_lists, city_values):
        result_list.append(value)

print("Data retrieval completed")

In [None]:
# Assemble the per-city result lists into one table, one column per measurement
column_names=["City","Lat","Lng","Max temp","Humidity","Cloudiness","Wind Speed","Country","Date"]
column_values=[cities,lat,lng,temp,humidity,clouds,wind,country,date]
cities_data=pd.DataFrame(dict(zip(column_names,column_values)))
cities_data

In [None]:
# Keep only the rows in which every column holds a value
# (dropna defaults: drop rows, drop when any value is missing)
cleaned=cities_data.dropna()
cleaned.head()

In [None]:
# Saving the cleaned data (rows with missing values already dropped) as CSV.
# The DataFrame index is written as the first, unnamed column (to_csv default).
# NOTE(review): the setup cell defines output_data_file = "../output_data/cities.csv",
# which differs from this literal path only in letter case — confirm which is intended.
cleaned.to_csv("../output_data/Cities.csv")

In [None]:
# Statistical summary (count, mean, std, quartiles, ...) of the numerical columns
numeric_columns=["Lat","Lng","Max temp","Humidity","Cloudiness","Wind Speed"]
stats_summary=cleaned.loc[:,numeric_columns].astype(float).describe()
stats_summary

In [None]:
# Checking if there are cities with humidity greater than 100%:
# collect the index labels of every such row (empty list when there are none).
# A boolean mask on the frame's index yields the row labels directly.
humidity_index=cleaned.index[cleaned["Humidity"].astype(float)>100].tolist()
humidity_index

<h1>Plotting the data</h1>

<h2>Temperature (F) vs. Latitude Plot</h2>

In [None]:
# Scatter plot of max temperature (F) against latitude for all cleaned cities.
# NOTE(review): the date in the title is hard-coded while the data is fetched
# at run time — confirm it still matches the retrieval date.
x_axis=cleaned["Lat"]
max_temp=cleaned["Max temp"]
plt.title("City Latitude vs Max Temperature (05/12/21)")
plt.xlabel("Latitude")
plt.ylabel("Max Temperature (F)")
plt.scatter(x_axis, max_temp, marker="o",facecolors="red",edgecolors="black",alpha=0.75)
# Save before showing: the figure is written to disk while it still holds the plot
plt.savefig("../output_data/Latitude_vs_temp.png")
# Call show() — a bare `plt.show` only references the function and echoes its repr
plt.show()

The plot is analysing the correlation between the latitude and the maximum temperature in the cities. The temperature is displayed in Fahrenheit.

------------------------------------------------------------------------------------

<h2>Humidity (%) vs. Latitude Plot</h2>

In [None]:
# Scatter plot of humidity (%) against latitude for all cleaned cities
x_axis=cleaned["Lat"]
humidity_pct=cleaned["Humidity"]
plt.title("City Latitude vs Humidity (05/12/21)")
plt.xlabel("Latitude")
plt.ylabel("Humidity (%)")
plt.scatter(x_axis, humidity_pct, marker="o",facecolors="red",edgecolors="black",alpha=0.75)
# Save before showing: the figure is written to disk while it still holds the plot
plt.savefig("../output_data/Latitude_vs_humidity.png")
# Call show() — a bare `plt.show` only references the function and echoes its repr
plt.show()

The plot is analysing the correlation between the latitude and the percentage of humidity in the cities. 

---------------------------------------------------------------------------

<h2>Cloudiness (%) vs. Latitude Plot</h2>

In [None]:
# Scatter plot of cloudiness (%) against latitude for all cleaned cities
x_axis=cleaned["Lat"]
cloudiness_pct=cleaned["Cloudiness"]
plt.title("City Latitude vs Cloudiness (05/12/21)")
plt.xlabel("Latitude")
plt.ylabel("Cloudiness (%)")
plt.scatter(x_axis, cloudiness_pct, marker="o",facecolors="red",edgecolors="black",alpha=0.75)
# Save before showing: the figure is written to disk while it still holds the plot
plt.savefig("../output_data/Latitude_vs_cloudiness.png")
# Call show() — a bare `plt.show` only references the function and echoes its repr
plt.show()

The plot is analysing the correlation between the latitude and the percentage of cloudiness in the cities.

------------------------------------------------------------------------------------

<h2>Wind Speed (mph) vs. Latitude Plot</h2>

In [None]:
# Scatter plot of wind speed (mph) against latitude for all cleaned cities
x_axis=cleaned["Lat"]
wind_speed=cleaned["Wind Speed"]
plt.title("City Latitude vs Wind Speed (05/12/21)")
plt.xlabel("Latitude")
plt.ylabel("Wind Speed (mph)")
plt.scatter(x_axis, wind_speed, marker="o",facecolors="red",edgecolors="black",alpha=0.75)
# Save before showing: the figure is written to disk while it still holds the plot
plt.savefig("../output_data/Latitude_vs_wind_speed.png")
# Call show() — a bare `plt.show` only references the function and echoes its repr
plt.show()

The plot is analysing the correlation between the latitude and the wind speed in the cities. The speed of the wind is displayed in miles per hour.

--------------------------------------------------------------------

<h1>Linear Regression</h1>

In [None]:
#spliting the data frame into northern and southern depending on the latitude
# Rows at or above the equator (Lat >= 0) go to `northern`, the rest to `southern`
is_northern=cleaned["Lat"]>=0
is_southern=cleaned["Lat"]<0
northern=cleaned[is_northern]
southern=cleaned[is_southern]

<h2>Temperature (F) vs. Latitude Plot</h2>

<h3>Northern Hemisphere</h3>

In [None]:
# Northern Hemisphere: max temperature (F) vs. latitude with a fitted regression line.
# Cast to float once so the regression and the plotted line use the same numeric data.
x_axis=northern["Lat"].astype(float)
max_temp=northern["Max temp"].astype(float)
plt.title("Northern Hemisphere - Temperature (F) vs. Latitude (05/12/21)")
plt.xlabel("Latitude")
plt.ylabel("Max Temperature (F)")
# Performing a linear regression
(slope, intercept,rvalue,pvalue,stderr) = linregress(x_axis, max_temp)
# Fitted y value for every latitude, used to draw the regression line
regress_values=x_axis*slope+intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_axis, max_temp, marker="o",facecolors="red",edgecolors="black",alpha=0.75)
plt.plot(x_axis,regress_values,"b-")
# Annotation position is given in data coordinates (latitude, temperature)
plt.annotate(line_eq,(0,-40),fontsize=15,color="blue")
print(f"The r-squared is: {rvalue**2}")
plt.savefig("../output_data/northern_Latitude_vs_max_temp.png")
# Call show() — a bare `plt.show` only references the function and echoes its repr
plt.show()

<h3>Southern Hemisphere</h3>

In [None]:
# Southern Hemisphere: max temperature (F) vs. latitude with a fitted regression line.
# Cast to float once so the regression and the plotted line use the same numeric data.
x_axis=southern["Lat"].astype(float)
max_temp=southern["Max temp"].astype(float)
plt.title("Southern Hemisphere - Temperature (F) vs. Latitude (05/12/21)")
plt.xlabel("Latitude")
plt.ylabel("Max Temperature (F)")
# Performing a linear regression
(slope, intercept,rvalue,pvalue,stderr) = linregress(x_axis, max_temp)
# Fitted y value for every latitude, used to draw the regression line
regress_values=x_axis*slope+intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_axis, max_temp, marker="o",facecolors="red",edgecolors="black",alpha=0.75)
plt.plot(x_axis,regress_values,"b-")
# Annotation position is given in data coordinates (latitude, temperature)
plt.annotate(line_eq,(-30,45),fontsize=15,color="blue")
print(f"The r-squared is: {rvalue**2}")
plt.savefig("../output_data/southern_Latitude_vs_max_temp.png")
# Call show() — a bare `plt.show` only references the function and echoes its repr
plt.show()

The graphs above present the correlation between city latitude and maximum temperature for the Northern Hemisphere and the Southern Hemisphere. The linear regression of the first plot displays a negative coefficient, meaning that as the latitude increases the temperature drops. The opposite correlation is displayed on the second graph, where the positive coefficient suggests that as the latitude grows the temperature also tends to increase. However, the lower r-squared value for the Southern Hemisphere indicates that latitude explains less of the variation in temperature there, i.e. the linear fit is weaker.

<h2>Humidity (%) vs. Latitude</h2>

<h3>Northern Hemisphere</h3>

In [None]:
# Northern Hemisphere: humidity (%) vs. latitude with a fitted regression line.
# Cast to float once so the regression and the plotted line use the same numeric data.
x_axis=northern["Lat"].astype(float)
humidity=northern["Humidity"].astype(float)
plt.title("Northern Hemisphere - Humidity(%) vs. Latitude (05/12/21)")
plt.xlabel("Latitude")
plt.ylabel("Humidity (%)")
# Performing a linear regression
(slope, intercept,rvalue,pvalue,stderr) = linregress(x_axis, humidity)
# Fitted y value for every latitude, used to draw the regression line
regress_values=x_axis*slope+intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_axis, humidity, marker="o",facecolors="red",edgecolors="black",alpha=0.75)
plt.plot(x_axis,regress_values,"b-")
# Annotation position is given in data coordinates (latitude, humidity)
plt.annotate(line_eq,(45,20),fontsize=15,color="blue")
print(f"The r-squared is: {rvalue**2}")
plt.savefig("../output_data/northern_Latitude_vs_humidity.png")
# Call show() — a bare `plt.show` only references the function and echoes its repr
plt.show()

<h3> Southern Hemisphere</h3>

In [None]:
# Southern Hemisphere: humidity (%) vs. latitude with a fitted regression line.
# Cast to float once so the regression and the plotted line use the same numeric data.
x_axis=southern["Lat"].astype(float)
humidity=southern["Humidity"].astype(float)
plt.title("Southern Hemisphere - Humidity(%) vs. Latitude (05/12/21)")
plt.xlabel("Latitude")
plt.ylabel("Humidity (%)")
# Performing a linear regression
(slope, intercept,rvalue,pvalue,stderr) = linregress(x_axis, humidity)
# Fitted y value for every latitude, used to draw the regression line
regress_values=x_axis*slope+intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_axis, humidity, marker="o",facecolors="red",edgecolors="black",alpha=0.75)
plt.plot(x_axis,regress_values,"b-")
# Annotation position is given in data coordinates (latitude, humidity)
plt.annotate(line_eq,(-23,0),fontsize=15,color="blue")
print(f"The r-squared is: {rvalue**2}")
plt.savefig("../output_data/southern_Latitude_vs_humidity.png")
# Call show() — a bare `plt.show` only references the function and echoes its repr
plt.show()

The graphs above analyse the correlation between city latitude and humidity percentage for the Northern Hemisphere and the Southern Hemisphere. Both graphs present a positive correlation. However, the r-squared values for both linear regressions are low, meaning that latitude explains only a small share of the variation in humidity.

<h2>Cloudiness (%) vs. Latitude</h2>

<h3> Northern Hemisphere</h3>

In [None]:
# Northern Hemisphere: cloudiness (%) vs. latitude with a fitted regression line.
# Cast to float once so the regression and the plotted line use the same numeric data.
x_axis=northern["Lat"].astype(float)
cloudiness=northern["Cloudiness"].astype(float)
plt.title("Northern Hemisphere - Cloudiness(%) vs. Latitude (05/12/21)")
plt.xlabel("Latitude")
plt.ylabel("Cloudiness (%)")
# Performing a linear regression
(slope, intercept,rvalue,pvalue,stderr) = linregress(x_axis, cloudiness)
# Fitted y value for every latitude, used to draw the regression line
regress_values=x_axis*slope+intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_axis, cloudiness, marker="o",facecolors="red",edgecolors="black",alpha=0.75)
plt.plot(x_axis,regress_values,"b-")
# Annotation position is given in data coordinates (latitude, cloudiness)
plt.annotate(line_eq,(45,20),fontsize=15,color="blue")
print(f"The r-squared is: {rvalue**2}")
plt.savefig("../output_data/northern_Latitude_vs_cloudiness.png")
# Call show() — a bare `plt.show` only references the function and echoes its repr
plt.show()

<h3>Southern Hemisphere</h3>

In [None]:
# Southern Hemisphere: cloudiness (%) vs. latitude with a fitted regression line.
# Cast to float once so the regression and the plotted line use the same numeric data.
x_axis=southern["Lat"].astype(float)
cloudiness=southern["Cloudiness"].astype(float)
plt.title("Southern Hemisphere - Cloudiness(%) vs. Latitude (05/12/21)")
plt.xlabel("Latitude")
plt.ylabel("Cloudiness (%)")
# Performing a linear regression
(slope, intercept,rvalue,pvalue,stderr) = linregress(x_axis, cloudiness)
# Fitted y value for every latitude, used to draw the regression line
regress_values=x_axis*slope+intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_axis, cloudiness, marker="o",facecolors="red",edgecolors="black",alpha=0.75)
plt.plot(x_axis,regress_values,"b-")
# Annotation position is given in data coordinates (latitude, cloudiness)
plt.annotate(line_eq,(-55,65),fontsize=15,color="blue")
print(f"The r-squared is: {rvalue**2}")
plt.savefig("../output_data/southern_Latitude_vs_cloudiness.png")
# Call show() — a bare `plt.show` only references the function and echoes its repr
plt.show()

The scatter plots above analyse the correlation between latitude and cloudiness percentage for both Northern and Southern Hemispheres. Both of the graphs identify a positive trend. However, similar to the previous plots, the r-squared values remain low, meaning the sample data does not provide enough evidence to apply the identified trend to the whole population. 

<h2>Wind Speed (mph) vs. Latitude</h2>

<h3>Northern Hemisphere</h3>

In [None]:
# Northern Hemisphere: wind speed (mph) vs. latitude with a fitted regression line.
# Cast to float once so the regression and the plotted line use the same numeric data.
x_axis=northern["Lat"].astype(float)
wind_speed=northern["Wind Speed"].astype(float)
plt.title("Northern Hemisphere - Wind Speed(mph) vs. Latitude (05/12/21)")
plt.xlabel("Latitude")
plt.ylabel("Wind Speed (mph)")
# Performing a linear regression
(slope, intercept,rvalue,pvalue,stderr) = linregress(x_axis, wind_speed)
# Fitted y value for every latitude, used to draw the regression line
regress_values=x_axis*slope+intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_axis, wind_speed, marker="o",facecolors="red",edgecolors="black",alpha=0.75)
plt.plot(x_axis,regress_values,"b-")
# Annotation position is given in data coordinates (latitude, wind speed)
plt.annotate(line_eq,(0,30),fontsize=15,color="blue")
print(f"The r-squared is: {rvalue**2}")
plt.savefig("../output_data/northern_Latitude_vs_wind_speed.png")
# Call show() — a bare `plt.show` only references the function and echoes its repr
plt.show()

<h3>Southern Hemisphere</h3>

In [None]:
# Southern Hemisphere: wind speed (mph) vs. latitude with a fitted regression line.
# Cast to float once so the regression and the plotted line use the same numeric data.
x_axis=southern["Lat"].astype(float)
wind_speed=southern["Wind Speed"].astype(float)
plt.title("Southern Hemisphere - Wind Speed(mph) vs. Latitude (05/12/21)")
plt.xlabel("Latitude")
plt.ylabel("Wind Speed (mph)")
# Performing a linear regression
(slope, intercept,rvalue,pvalue,stderr) = linregress(x_axis, wind_speed)
# Fitted y value for every latitude, used to draw the regression line
regress_values=x_axis*slope+intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_axis, wind_speed, marker="o",facecolors="red",edgecolors="black",alpha=0.75)
plt.plot(x_axis,regress_values,"b-")
# Annotation position is given in data coordinates (latitude, wind speed)
plt.annotate(line_eq,(-25,25),fontsize=15,color="blue")
print(f"The r-squared is: {rvalue**2}")
plt.savefig("../output_data/southern_Latitude_vs_wind_speed.png")
# Call show() — a bare `plt.show` only references the function and echoes its repr
plt.show()

The linear regression is modelling the correlation between the latitude and wind speed, displayed in miles per hour. The coefficient is positive for the Northern Hemisphere and negative for the Southern Hemisphere. The r-squared values for both linear regressions are close to 0, which indicates that the relationships observed above may not exist in the larger population.