## England weather dataset

In [None]:
import numpy as np  # Importing the NumPy library for array operations and mathematical functions

import matplotlib.pyplot as plt  # Importing the Matplotlib library for creating visualizations

import pandas as pd  # Importing the Pandas library for data manipulation and analysis

import seaborn as sns  # Importing the Seaborn library for statistical data visualization

from datetime import datetime  # Importing the datetime module from the datetime library for handling date and time data

In [None]:
# Load the raw weather observations from the Kaggle input directory into a DataFrame.
csv_path = "/kaggle/input/england-weather-dataset/EnglandWeather_.csv"
data = pd.read_csv(csv_path)
data

In [None]:
# `data` is already a DataFrame (returned by read_csv); wrap it again under the working name `df`.
df = pd.DataFrame(data=data)
df

In [None]:
# DataFrame.shape is a (rows, columns) tuple; keep each part in its own variable.
df_rows_count = df.shape[0]
df_columns_count = df.shape[1]

# Report both dimensions, one per line.
print(
    f"number of rows: {df_rows_count}",
    f"number of columns: {df_columns_count}",
    sep="\n",
)

In [None]:
# Print a concise summary of df: column names, data types, and the count of non-null values in each column.
df.info()

In [None]:
# Keep only the float64 columns of df in a separate frame, df_num.
df_num = df.select_dtypes(include=["float64"])

In [None]:
# Descriptive statistics for the float64 columns collected in df_num.
df_num.describe()

In [None]:
# Keep only the object-typed columns of df in a separate frame, df_obj.
df_obj = df.select_dtypes(include=["object"])

In [None]:
# Descriptive statistics for the object-typed columns collected in df_obj.

df_obj.describe()

In [None]:
# Boolean mask that is True for every row repeating an earlier row of df.
duplicate_mask = df.duplicated()

# Number of flagged rows, followed by the rows themselves.
print(f'number of duplicated data: {duplicate_mask.sum()}')

df[duplicate_mask]

In [None]:
# df1: df without duplicated rows (drop_duplicates returns a new frame, df itself is untouched).
df1 = df.drop_duplicates()
df1

In [None]:
# df2: df1 without any row that contains a NaN (dropna returns a new frame, df1 is untouched).
df2 = df1.dropna()
df2

In [None]:
# Sanity check: count the NaN values left in each column of df2 after dropna().
df2.isna().sum()

In [None]:
# A pressure reading of exactly 0 is treated as noise in this dataset; keep every other row.
pressure_is_valid = df2["PRESSURE (millibars)"].ne(0)
df3 = df2[pressure_is_valid]
df3

In [None]:
# df4: df3 with 'Formatted Date' parsed into timezone-aware datetimes.
# utc=True converts every timestamp to UTC.
parsed_dates = pd.to_datetime(df3['Formatted Date'], utc=True)
df4 = df3.assign(**{'Formatted Date': parsed_dates})

# Show the resulting dtype of the parsed column.
df4['Formatted Date'].dtype

In [None]:
# df5: df4 in chronological order with a fresh 0..n-1 index (the old index is discarded).
df5 = df4.sort_values(by='Formatted Date').reset_index(drop=True)

df5

In [None]:
# Remove the observations that fall in 2005.
# 'Formatted Date' was converted to UTC earlier, which can move a local-time timestamp from the
# first hour of 1 January into 31 December 2005.
# Filtering on the year (instead of slicing off the first sorted row with iloc[1:]) stays correct
# however many 2005 rows survive the earlier cleaning steps, including none at all.
df6 = df5[df5['Formatted Date'].dt.year != 2005]
df6

In [None]:
# Descriptive statistics for every column of df6 (include='all' covers non-numeric columns as well).
df6.describe(include='all')

## Visualization

## Scatter plots (Target VS Features)

In [None]:
# Scatter plot of humidity (y) against temperature (x), one small magenta marker per row of df6.
axis_label_kw = dict(fontsize=20, color="blue")  # shared styling for both axis labels

plt.figure(figsize=(15, 10))
plt.scatter(df6["TEMPERATURE (C)"], df6["HUMIDITY"], s=2, color='magenta')

plt.title("HUMIDITY VS Temperature", fontsize=20, color="red")
plt.xlabel("TEMPERATURE (C)", **axis_label_kw)
plt.ylabel("HUMIDITY", **axis_label_kw)

# x ticks every 5 degrees from -20 to 40; y ticks keep their automatic positions.
plt.xticks(range(-20, 42, 5), fontsize=15)
plt.yticks(fontsize=15)

plt.grid()
plt.show()

#### According to the plot above, a few points have humidity equal to 0. They are rare in this dataset and need further analysis; it may be worth asking the owner of the dataset whether there were any errors in collecting the data.
 -------------------------------------------------------------------------------------------------------------------------
#### The densest part of the plot lies at temperatures between 10 and 20 C. The highest humidity is 1, and the average humidity is around 0.6-0.8.
 -------------------------------------------------------------------------------------------------------------------------
#### A decreasing trend can be seen in the plot, as the temperature increases, the average humidity decreases.

In [None]:
# Scatter plot of humidity (y) against wind speed (x), one small indigo marker per row of df6.
axis_label_kw = dict(fontsize=20, color="Blue")  # shared styling for both axis labels

plt.figure(figsize=(15, 10))
plt.scatter(df6["WIND SPEED (km/h)"], df6["HUMIDITY"], s=2, color='indigo')

plt.title("HUMIDITY VS Wind Speed", fontsize=20, color="red")
plt.xlabel("WIND SPEED (km/h)", **axis_label_kw)
plt.ylabel("HUMIDITY", **axis_label_kw)

# x ticks every 5 km/h from 0 to 65; y ticks keep their automatic positions.
plt.xticks(range(0, 70, 5), fontsize=15)
plt.yticks(fontsize=15)

plt.grid()
plt.show()

#### As you can see in the plot above, one point lies far from the rest of the data, with a wind speed of 63-65 km/h. It may correspond to a rare weather condition, or it may be noise that needs further analysis.
-------------------------------------------------------------------------------------------------------------------------

#### We can see that most of our data are in the range with 0-15 km/h for wind speed and 0.6-0.9 for humidity.
-------------------------------------------------------------------------------------------------------------------------

#### Again we can see some points with humidity equal to zero that are far from other data. 

In [None]:
# Scatter plot of humidity (y) against pressure (x), one small teal marker per row of df6.
axis_label_kw = dict(fontsize=20, color="Blue")  # shared styling for both axis labels

plt.figure(figsize=(18, 12))
plt.scatter(df6["PRESSURE (millibars)"], df6["HUMIDITY"], s=2, color='teal')

plt.title("HUMIDITY VS Pressure", fontsize=20, color="red")
plt.xlabel("PRESSURE (millibars)", **axis_label_kw)
plt.ylabel("HUMIDITY", **axis_label_kw)

# x ticks every 10 millibars from 960 to 1060; y ticks keep their automatic positions.
plt.xticks(range(960, 1070, 10), fontsize=15)
plt.yticks(fontsize=15)

plt.grid()
plt.show()

 #### According to the plot, most of the recorded data have a pressure between 1010 and 1025 millibars and a humidity between 0.6 and 0.9. The density of points in certain areas can provide insights into how often specific combinations of pressure and humidity occur.
 -------------------------------------------------------------------------------------------------------------------------
 #### At the left part of the plot, we can see a decreasing trend that as the pressure increases from 975 to 1010 millibars, the humidity decreases. There is also an increasing trend in the right part of the plot that as the pressure increases, the humidity increases too. 

In [None]:
# Scatter plot of pressure (y) against temperature (x), one small deep-pink marker per row of df6.
axis_label_kw = dict(fontsize=20, color="Blue")  # shared styling for both axis labels

plt.figure(figsize=(15, 10))
plt.scatter(df6["TEMPERATURE (C)"], df6["PRESSURE (millibars)"], s=2, color='deeppink')

plt.title("Pressure VS Temperature", fontsize=20, color="red")
plt.xlabel("TEMPERATURE (C)", **axis_label_kw)
plt.ylabel("PRESSURE (millibars)", **axis_label_kw)

# x ticks keep their automatic positions; y ticks every 10 millibars from 970 to 1060.
plt.xticks(fontsize=15)
plt.yticks(range(970, 1070, 10), fontsize=15)

plt.grid()
plt.show()

 #### According to the plot, the highest pressures occur when the temperature is lower than 10 C. As the temperature increases, the average pressure decreases. Some points with lower pressure occur between 0 and 20 C. The densest part of the data has a pressure between 1010 and 1022 millibars.
 -------------------------------------------------------------------------------------------------------------------------

  #### It can be seen that when the temperature is lower than 10 C, the trend is decreasing: as the temperature increases, the mean pressure decreases. But above 10 C there is no specific trend; as the temperature increases, the pressure can be higher or lower.


In [None]:
# Scatter plot of wind speed (y) against temperature (x), one small blue marker per row of df6.
axis_label_kw = dict(fontsize=20, color="Blue")  # shared styling for both axis labels

plt.figure(figsize=(15, 10))
plt.scatter(df6["TEMPERATURE (C)"], df6["WIND SPEED (km/h)"], s=2, color='dodgerblue')

plt.title("Wind Speed VS Temperature", fontsize=20, color="red")
plt.xlabel("TEMPERATURE (C)", **axis_label_kw)
plt.ylabel("WIND SPEED (km/h)", **axis_label_kw)

# Both axes keep their automatic tick positions; only the label size is changed.
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

plt.grid()
plt.show()

  #### The plot trend shows that by increasing the temperature from -22 to 8 C, the wind speed increases. But after 8C, if the temperature increases, the wind speed decreases.
 -------------------------------------------------------------------------------------------------------------------------

  #### The temperature range of 0-20 C and the wind speed range of 5-18 km/h have the highest frequency. As you can see, there is a point with wind speed higher than 60 km/h that it was seen in the preceding plots too. 

In [None]:
# Scatter plot of wind speed (y) against pressure (x), one small brown marker per row of df6.

plt.figure(figsize = (15,10))

plt.scatter(df6["PRESSURE (millibars)"], df6["WIND SPEED (km/h)"], s = 2 , color = 'brown')

# The axis labels must follow the data passed to scatter(): pressure on x, wind speed on y.
# (They were previously swapped, so each axis was described with the other variable's name.)
plt.xlabel ("PRESSURE (millibars)", fontsize = 20 , color = "Blue")
plt.ylabel ("WIND SPEED (km/h)", fontsize = 20 , color = "Blue")
plt.title ("Wind Speed VS Pressure", fontsize = 20 , color = "red")

# Both axes keep their automatic tick positions; only the label size is changed.
plt.xticks  (fontsize = 15)
plt.yticks (fontsize = 15)


plt.grid()

plt.show()

  #### At the left part of the plot, there is no specific trend, but after the pressure of 1010 millibars, as the pressure increases, the wind speed decreases. Pressures between 1010 and 1022 millibars and wind speeds between 0 and 20 km/h have higher frequency than other values.

In [None]:
# Histogram of the HUMIDITY column of df6 (default bin count), light pink bars with black edges.
hist_text_kw = dict(fontsize=15, color="red")  # shared styling for the axis labels and the title

plt.figure(figsize=(8, 10))
plt.hist(df6["HUMIDITY"], ec="black", color="lightpink")

plt.xlabel("HUMIDITY", **hist_text_kw)
plt.ylabel("Frequency", **hist_text_kw)
plt.title("HUMIDITY Histogram", **hist_text_kw)

plt.grid()

# x ticks every 0.1 across the 0..1 humidity range; y ticks every 1000 counts up to 24000.
plt.xticks(np.arange(0, 1.1, 0.1), fontsize=13)
plt.yticks(range(0, 24500, 1000), fontsize=13)

plt.show()

 #### The highest humidity frequency is related to 0.8-1 and the lowest one related to 0-0.3
 -------------------------------------------------------------------------------------------------------------------------


In [None]:
# Histogram of the TEMPERATURE (C) column of df6 in 14 bins, sky-blue bars with black edges.
axis_label_kw = dict(fontsize=15, color="Red")  # shared styling for both axis labels

plt.figure(figsize=(8, 10))
plt.hist(df6["TEMPERATURE (C)"], ec="black", color="deepskyblue", bins=14)

plt.xlabel("TEMPERATURE (C)", **axis_label_kw)
plt.ylabel("Frequency", **axis_label_kw)
plt.title("Temperature Histogram", fontsize=15, color="red")

plt.grid()

# x ticks every 5 degrees from -20 to 40; y ticks keep their automatic positions.
plt.xticks(range(-20, 45, 5), fontsize=13)
plt.yticks(fontsize=13)

plt.show()

  #### Based on the histogram above, we can see which range of temperature has the highest frequency. The temperature range of 5-22 C has the highest frequency. And the temperature ranges of -22-5 C and 30-40 C have the lowest frequencies.
 -------------------------------------------------------------------------------------------------------------------------


In [None]:
# Histogram of the PRESSURE (millibars) column of df6 (default bin count), purple bars with black edges.
axis_label_kw = dict(fontsize=15, color="Red")  # shared styling for both axis labels

plt.figure(figsize=(8, 9))
plt.hist(df6["PRESSURE (millibars)"], ec="black", color="purple")

plt.xlabel("Pressure", **axis_label_kw)
plt.ylabel("Frequency", **axis_label_kw)
plt.title("Pressure Histogram", fontsize=15, color="red")

plt.grid()

# x ticks every 10 millibars from 980 to 1040; y ticks every 5000 counts up to 40000.
plt.xticks(range(980, 1050, 10), fontsize=13)
plt.yticks(range(0, 45000, 5000), fontsize=13)

plt.show()

  #### Based on the plot above, the highest frequency of pressure is related to 1015-1025 millibars. And the lowest frequencies are for the pressure range of 985-1005 and 1035-1045 millibars.
 -------------------------------------------------------------------------------------------------------------------------


In [None]:
# Histogram of the WIND SPEED (km/h) column of df6 in 13 bins, yellow bars with black edges.
axis_label_kw = dict(fontsize=15, color="Red")  # shared styling for both axis labels

plt.figure(figsize=(8, 9))
plt.hist(df6["WIND SPEED (km/h)"], ec="black", color="yellow", bins=13)

plt.xlabel("WIND SPEED (km/h)", **axis_label_kw)
plt.ylabel("Frequency", **axis_label_kw)
plt.title("Wind Speed Histogram", fontsize=15, color="red")

plt.grid()

# x ticks every 5 km/h from 0 to 65; y ticks every 5000 counts up to 40000.
plt.xticks(range(0, 70, 5), fontsize=13)
plt.yticks(range(0, 45000, 5000), fontsize=13)

plt.show()

 #### It’s clear that wind speeds between 10-15 km/h have higher frequency than other values. And the lowest frequencies are for wind speed of higher than 35 km/h.
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Histogram of the PRECIP TYPE column of df6 in 13 bins, green-yellow bars with black edges.
axis_label_kw = dict(fontsize=15, color="Red")  # shared styling for both axis labels

plt.figure(figsize=(10, 8))
plt.hist(df6["PRECIP TYPE"], ec="black", color="greenyellow", bins=13)

plt.xlabel("PRECIP TYPE", **axis_label_kw)
plt.ylabel("Frequency", **axis_label_kw)
plt.title("PRECIP TYPE Histogram", fontsize=15, color="red")

plt.grid()

# x ticks keep their automatic positions; y ticks every 5000 counts up to 85000.
plt.xticks(fontsize=13)
plt.yticks(range(0, 90000, 5000), fontsize=13)

plt.show()

 #### We have approximately 94000 records. According to the plots above, the weather is rainy far more often than snowy, and this is clear from the histogram too: nearly 85000 records are related to the rainy condition, while the snowy condition accounts for fewer than 12000.
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Bar chart of how many rows of df6 fall in each calendar year.
# 'Formatted Date' already holds datetimes, so the year comes straight from the .dt accessor.
Year = df6["Formatted Date"].dt.year
axis_label_kw = dict(color='red', fontsize=16)  # shared styling for both axis labels

plt.figure(figsize=(8, 10))
plt.title("Number of samples per year", fontsize=18, color='red')

sns.countplot(x=Year)

# y ticks every 300 counts up to 9000.
plt.yticks(range(0, 9200, 300))

# Set after countplot() so these labels replace the ones seaborn derives from the Series.
plt.xlabel("Year", **axis_label_kw)
plt.ylabel("Count", **axis_label_kw)

plt.grid()
plt.show()

 #### As you can see in the plot above, for each year, we have approximately 8400 to 8700 data. The years 2008 and 2011 have the highest data in contrast to others.
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Bar chart of how many rows of df6 fall in each month of the year (1-12).
# 'Formatted Date' already holds datetimes, so the month comes straight from the .dt accessor.
Month = df6["Formatted Date"].dt.month
axis_label_kw = dict(color='red', fontsize=16)  # shared styling for both axis labels

plt.figure(figsize=(8, 10))
plt.title("Number of samples per month", fontsize=18, color='red')

sns.countplot(x=Month)

# y ticks every 300 counts up to 8400.
plt.yticks(range(0, 8500, 300))

# Set after countplot() so these labels replace the ones seaborn derives from the Series.
plt.xlabel("Month", **axis_label_kw)
plt.ylabel("Count", **axis_label_kw)

plt.grid()
plt.show()

 #### As you can see in the plot above, most of the data are for months 1, 3, 7 and 8, which are January, March, July and August respectively. It means that we have more data for the winter and summer seasons. But every month has more than approximately 7300 records.
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Bar chart of how many rows of df6 fall on each day of the month (1-31).
# 'Formatted Date' already holds datetimes, so the day comes straight from the .dt accessor.
Day = df6["Formatted Date"].dt.day
axis_label_kw = dict(color='red', fontsize=16)  # shared styling for both axis labels

plt.figure(figsize=(11, 10))
plt.title("Number of samples per day", fontsize=18, color="red")

sns.countplot(x=Day)

# y ticks every 100 counts up to 3200.
plt.yticks(range(0, 3300, 100))

# Set after countplot() so these labels replace the ones seaborn derives from the Series.
plt.xlabel("Day", **axis_label_kw)
plt.ylabel("Count", **axis_label_kw)

plt.grid()
plt.show()

 #### The number of records for each day of the month is roughly the same, around 3000-3200. Three days have lower frequencies: days 29, 30 and 31. Among these, day 31 has the lowest frequency because not all months have 31 days, while most of them have at least 30 days.
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Bar chart of how many rows of df6 carry each PRECIP TYPE value.
# (The day-of-month Series that used to be recomputed here was never used by this plot and has been removed.)

plt.figure(figsize = (11, 10))

plt.title ("Number of PRECIP TYPE repetitions", fontsize = 18, color = "red")

# One bar per precipitation type, coloured cyan and pink in plotting order.
sns.countplot(x = df6["PRECIP TYPE"], palette= ["cyan",'pink'])

# Set after countplot() so these labels replace the ones seaborn derives from the Series.
plt.xlabel("PRECIP TYPE",color = 'red', fontsize = 16)
plt.ylabel("Count",color = 'red', fontsize = 16)

# y ticks every 10000 counts up to 90000; x ticks keep their positions, only the label size changes.
plt.yticks(range(0,100000,10000), fontsize =14 )
plt.xticks(fontsize = 16)
plt.grid()

plt.show()

 #### As you can see in the plot above, rain is more probable than snow
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Count the rows of df6 for each PRECIP TYPE value.
# value_counts() returns the counts in descending order, so unpacking assigns the larger count to
# `rain` and the smaller to `snow`.
# NOTE(review): this assumes the column holds exactly two categories and that rain is the more
# frequent one, as the plots in this notebook indicate - confirm against the data.
rain, snow = df6["PRECIP TYPE"].value_counts()

# Print each count under its own label (the two labels were previously swapped).
print(f'Rain: {rain}')
print(f'Snow: {snow}')

In [None]:
# Pie chart of the rain and snow counts computed in the previous cell.
labels = ['Rain', 'Snow']
sizes = [rain, snow]  # same order as `labels`

fig, ax = plt.subplots()

# autopct prints each wedge's share of the total with one decimal place.
ax.pie(x=sizes, labels=labels, colors=["cyan", "pink"], autopct='%1.1f%%')
ax.set_title("PRECIP TYPE percent", color='red', fontsize=16)

plt.show()

 #### As you can see in the plot above, rain is more probable than snow
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Number of rows of df6 for each distinct SUMMARY value.
# value_counts() already returns the counts sorted from most to least frequent.
counts = df6["SUMMARY"].value_counts()

# Two-column table ("SUMMARY", "Count") in that same descending order.
# The next cell relies on this order when it takes the first five rows as the most frequent types.
# (A former sort_values() call here discarded its result and therefore had no effect; it has been removed.)
df_count = pd.DataFrame({"SUMMARY": counts.index, "Count": counts.values})

df_count

In [None]:
# Collapse df_count into its first five rows plus a single "Others" row.
df_count_max = df_count.iloc[:5]              # first five rows of df_count
others = df_count["Count"].iloc[5:].sum()     # combined count of every remaining row

# One-row frame for the aggregated remainder, appended below the first five rows.
new_row = {"SUMMARY": "Others", "Count": others}
others_frame = pd.DataFrame([new_row])

# ignore_index renumbers the combined rows 0..5.
counter = pd.concat([df_count_max, others_frame], ignore_index=True)

In [None]:
# this code will solve the problem of not showing the plots created by plotly when you open my notebook
import plotly
from plotly.offline import iplot

In [None]:
# Prepare plotly's offline mode for use inside the notebook before any figure is shown with iplot.
# NOTE(review): per the plotly docs, connected=True loads plotly.js from a CDN instead of embedding it in the notebook - confirm for the installed plotly version.
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)

In [None]:
# Interactive pie chart (plotly) of the SUMMARY counts held in `counter`.
# No matplotlib figure is created here: plotly renders on its own, so a plt.figure() call would
# only leave an empty, unused matplotlib figure behind.

sizes = counter ["Count"]      # wedge sizes
labels = counter["SUMMARY"]    # wedge labels, row-aligned with `sizes`

# Figure specification as a plain dict: one pie trace plus the layout.
fig = {
  "data": [
    {
      "values": sizes,
      "labels": labels,
      "domain": {"x": [0, .5]},              # the pie occupies the left half of the figure width
      "name": "SUMMARY",
      "hoverinfo":"label+percent+name",
      "hole": .1,                            # small hole in the centre (donut style)
      "type": "pie"
    },],
  "layout": {
        "title":"SUMMARY rates",
        "annotations": [
            { "font": { "size": 20},
              "showarrow": False,
              "text": "",                    # empty annotation text: nothing is drawn
                "x": .2,
                "y": 1,

            },
        ]
    }
}

iplot(fig)


 #### For a better visualization, I plotted only the summary types with the highest frequencies, which are partly cloudy, mostly cloudy, overcast, clear and foggy. All other summary types are put in a separate group named "Others".
 -------------------------------------------------------------------------------------------------------------------------
  #### As you can see the highest rates are related to the “Partly Cloudy” with 33.3 % and “Mostly Cloudy” with 29.4 %. The lowest rates are related to the “Foggy” and “Others” summary types.
  -------------------------------------------------------------------------------------------------------------------------
  #### So it can be concluded that in England, the weather is cloudy most of the time. 

-------------------------------------------------------------------------------------------------------------------------

In [None]:
# df7: df6 plus two columns derived from the 'Formatted Date' timestamps.
timestamps = df6['Formatted Date']

df7 = df6.assign(
    # Calendar date of each timestamp, stored as datetime64[ns] (time of day dropped).
    Date=pd.to_datetime(timestamps).dt.date.astype('datetime64[ns]'),
    # Time of day of each timestamp, as datetime.time objects.
    Time=timestamps.dt.time,
)

df7

In [None]:
# df7 (df6 plus the 'Date' and 'Time' columns) is already built by the preceding cell.
# This cell used to repeat that construction statement for statement; the redundant rebuild has
# been removed and the cell now only displays the frame.
df7

In [None]:
# df8: df7 plus the calendar parts of 'Date' and the hour of 'Time' as separate columns.
df8 = df7.assign(
    month=df7['Date'].dt.month,
    day=df7['Date'].dt.day,
    year=df7['Date'].dt.year,
    # 'Time' holds datetime.time objects, so the hour is read from each object in turn.
    hour=[clock.hour for clock in df7["Time"].values],
)
df8

In [None]:
# Mean HUMIDITY for each SUMMARY value, as a one-column frame indexed by SUMMARY.
mean_humidity = df8.groupby("SUMMARY")['HUMIDITY'].mean().to_frame()

# Order the summary types from the lowest to the highest mean humidity.
df12 = mean_humidity.sort_values(by="HUMIDITY")

df12


In [None]:
# Count table with one row per hour and one column per SUMMARY value;
# hour/summary combinations that never occur are filled with 0.
hourly_summary_counts = df8.groupby("hour")['SUMMARY'].value_counts().unstack(fill_value=0)

# idxmax() works column by column, so for every SUMMARY type it returns the hour (row label)
# at which that type was recorded most often. The result is stored in a "Time" column,
# indexed by SUMMARY and ordered by that hour.
df11 = hourly_summary_counts.idxmax().to_frame(name="Time").sort_values(by="Time")
df11

In [None]:
# Scatter plot with one red marker per SUMMARY type (x) at the value of df11's "Time" column (y).
# This cell needs df11 as built by the preceding cell; a later cell rebinds df11 to a frame
# without a "Time" column, so re-running this cell out of order would fail.
plt.figure(figsize=(15, 14))  # 15 x 14 inch figure.

plt.scatter(df11.index, df11["Time"], s=60, color='red')  # x: df11 index (SUMMARY values), y: the "Time" column.

plt.xticks(rotation=90, fontsize=13, color="blue")  # Vertical x tick labels so the summary names do not overlap.
plt.yticks(fontsize=13, color="blue")  # Style the y tick labels (positions unchanged by this call).

plt.xlabel("SUMMARY", fontsize=20, color="green")  # x-axis label.
plt.ylabel("Time", fontsize=20, color="green")  # y-axis label.
plt.title("The Most Probable SUMMARY Type Based on Time", fontsize=20, color="green")  # Plot title.
plt.yticks(range(0, 25, 1))  # One y tick per integer from 0 to 24.

plt.grid(True, color='black')  # Black grid lines.

plt.subplots_adjust(bottom=0.3)  # Extra bottom margin for the rotated x tick labels.

plt.show()  # Render the figure.

 #### According to the plot, rain has occurred mostly at 00:00, clear at 01:00, drizzle at 02:00, and so on. As you can see, 10:00 is the most probable time for 5 conditions; most of them are related to breezy and dry conditions and their combinations.
  -------------------------------------------------------------------------------------------------------------------------
 #### It can be seen that over cast condition has occurred mostly at 22:00 and 23:00. So from this plot, you can obtain useful information that which summary type is the most probable one at each hour.
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# df9: mean WIND SPEED (km/h) for each SUMMARY value, as a one-column frame indexed by SUMMARY.
df9 = df8.groupby("SUMMARY")['WIND SPEED (km/h)'].mean().to_frame()

# df10: the same table ordered from the lowest to the highest mean wind speed.
df10 = df9.sort_values(by="WIND SPEED (km/h)")
df10

In [None]:
# Scatter plot with one violet marker per SUMMARY type (x) at its mean wind speed from df10 (y).
plt.figure (figsize = (14,12))

plt.scatter(df10.index, df10['WIND SPEED (km/h)'], s = 60, color = 'darkviolet')

# Vertical x tick labels so the summary names do not overlap.
plt.xticks(rotation =90, fontsize = 12, color = "darkBlue")
plt.yticks(fontsize = 12, color = "darkBlue")

plt.xlabel ("SUMMARY",fontsize = 18, color = "red" )
# The y axis shows mean wind speed, so it is labelled with that column's name
# (it previously read "Time", carried over from the hour plot).
plt.ylabel ("WIND SPEED (km/h)",fontsize = 18, color = "red" )
plt.title ("The average wind speed for each SUMMARY type",fontsize = 18, color = "red" )

# y ticks every 4 km/h from 0 to 64.
plt.yticks(range(0,65,4))
plt.grid(True, color = 'black')

# Extra bottom margin for the rotated x tick labels.
plt.subplots_adjust(bottom=0.3)


plt.show()

  #### We can see the average value for wind speed for each summary type. We can see that the highest mean speed is related to the “Dangerously windy and partly cloudy” that is equal to 64 km/h. and the lowest one is related to the “Rain” that is equal to approximately 6 km/h.
  -------------------------------------------------------------------------------------------------------------------------
  #### According to the preceding plots in the previous pages, we found out that the most probable speeds are for the values under 15 km/h that in this plot we can see summary types with average speed of 15 km/h and less, that they are located in “Rain” to “Dry” in the horizontal axis.
  -------------------------------------------------------------------------------------------------------------------------
 #### If you remember, we saw a rare condition in our last scatter plots, with speed more than 60 km/h. So from this plot, we can understand that the rare condition that we mentioned, is related to “Dangerously windy and partly cloudy”.
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Mean TEMPERATURE (C) for each SUMMARY value, as a one-column frame indexed by SUMMARY.
# Note: this rebinds df11, which earlier held the hour table with a "Time" column.
mean_temperature = df8.groupby("SUMMARY")['TEMPERATURE (C)'].mean().to_frame()

# Order the summary types from the lowest to the highest mean temperature.
df11 = mean_temperature.sort_values(by="TEMPERATURE (C)")
df11

In [None]:
# Scatter plot with one magenta marker per SUMMARY type (x) at its mean temperature from df11 (y).
# Needs df11 as built by the preceding cell (the temperature table, not the earlier hour table).

plt.figure (figsize = (14,14))

plt.scatter(df11.index, df11['TEMPERATURE (C)'], s = 60, color = 'magenta')

# Vertical x tick labels so the summary names do not overlap; y tick labels styled the same way.
plt.xticks(rotation =90, fontsize = 12, color = "darkblue")
plt.yticks(fontsize = 12, color = "darkBlue")

plt.xlabel ("SUMMARY",fontsize =18, color = "red" )
plt.ylabel ("TEMPERATURE (C)",fontsize = 18, color = "red" )
plt.title ("The average Temperature for each SUMMARY type",fontsize = 18, color = "red" )

# y ticks every 2 degrees from -4 to 32.
plt.yticks(range(-4,34,2))
plt.grid(True, color = 'black')

# Extra bottom margin for the rotated x tick labels.
plt.subplots_adjust(bottom=0.3)


plt.show()

 #### For this plot, we can have a same analysis like the preceding one. We can see the mean temperature for each summary type in the plot above. The highest average temperature is related to the “Dry” type. And the lowest average temperature is related to “Breezy and Foggy” type which is under 0. All other types have temperature above zero.
  -------------------------------------------------------------------------------------------------------------------------
  #### From the histogram in the previous pages, we understood that the highest frequency is related to temperatures between 5 and 9 C. From the plot above we can see that this temperature range corresponds to the “Windy”, “Breezy and Over cast”, “Over cast” and “Windy and Over cast” types.
   -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Average pressure of every SUMMARY category, ordered from the lowest
# mean to the highest. The result keeps SUMMARY as its index.
df30 = (
    df8.groupby("SUMMARY")['PRESSURE (millibars)']
    .mean()
    .to_frame()
    .sort_values(by="PRESSURE (millibars)")
)
df30

In [None]:
# Scatter plot of the mean pressure (y) for each SUMMARY category (x),
# using the aggregated frame df30 whose index holds the category names.
plt.figure(figsize=(14, 14))

plt.scatter(df30.index, df30['PRESSURE (millibars)'], s=60, color='deeppink')

# Rotate the category names by 90 degrees so the long labels do not overlap.
plt.xticks(rotation=90, fontsize=12, color="darkblue")
# Tick positions and tick-label styling are set in one call so the font
# settings are applied to the final set of ticks (previously the positions
# were replaced by a second call after the styling had been applied).
plt.yticks(range(1004, 1026, 2), fontsize=12, color="darkBlue")

plt.xlabel("SUMMARY", fontsize=18, color="red")
plt.ylabel("PRESSURE (millibars)", fontsize=18, color="red")
plt.title("The average Pressure for each SUMMARY type", fontsize=18, color="red")

plt.grid(True, color='black')

# Reserve space at the bottom of the figure for the rotated x tick labels.
plt.subplots_adjust(bottom=0.3)

plt.show()

 ####  In the plot above, we can see the average pressure for each summary type. The lowest one is related to the “Windy and over cast” which is equal to nearly 1007 millibars. And the highest pressures are related to the “Foggy” and “Breezy and Dry” that are near to 1022.
  -------------------------------------------------------------------------------------------------------------------------
 #### In the scatter plots related to pressure, we saw that most of our data have a pressure in the range of 1010 to 1022 millibars. In the plot above, this range of average pressure covers all summary types except “Windy and over cast”, “Breezy and over cast”, “Windy and partly cloudy”, “Breezy and foggy”, and “Dangerously windy and partly cloudy”.
   -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Average humidity of every SUMMARY category, ordered from the lowest
# mean to the highest. The result keeps SUMMARY as its index.
df12 = (
    df8.groupby("SUMMARY")['HUMIDITY']
    .mean()
    .to_frame()
    .sort_values(by="HUMIDITY")
)
df12

In [None]:
# Scatter plot of the mean humidity (y) for each SUMMARY category (x),
# using the aggregated frame df12 whose index holds the category names.
plt.figure(figsize=(14, 14))

plt.scatter(df12.index, df12['HUMIDITY'], s=60, color='darkred')

# Rotate the category names by 90 degrees so the long labels do not overlap.
plt.xticks(rotation=90, fontsize=12, color="darkblue")
# Tick positions and tick-label styling are set in one call so the font
# settings are applied to the final set of ticks (previously the positions
# were replaced by a second call after the styling had been applied).
plt.yticks(np.arange(0, 1.05, 0.05), fontsize=12, color="darkBlue")

plt.xlabel("SUMMARY", fontsize=15, color="red")
plt.ylabel("HUMIDITY", fontsize=15, color="red")
plt.title("The average HUMIDITY for each SUMMARY type", fontsize=15, color="red")

plt.grid(True)

# Reserve space at the bottom of the figure for the rotated x tick labels.
plt.subplots_adjust(bottom=0.3)

plt.show()

 #### In the preceding plots, we found a small number of points with humidity under 0.3. From the plot above, we can see that those points are related to these summary types: “Dry”, “Windy and Dry”, “Dry and partly cloudy”, “Dry and mostly cloudy”, and “Breezy and dry”.
  -------------------------------------------------------------------------------------------------------------------------
 #### We can see that the lowest average humidity is related to the “Dry” condition and the highest are related to “Breezy and foggy”, “Rain”, “Foggy”.
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Display df8, the frame the following line plots read their columns from.
df8

In [None]:
# Wind speed over time: 'Date' on the x axis, 'WIND SPEED (km/h)' on the y axis.
plt.figure(figsize=(25, 14))

# lineplot returns the Axes it drew on; errorbar=None switches the error band off.
line_ax = sns.lineplot(
    x=df8['Date'],
    y=df8["WIND SPEED (km/h)"],
    color='darkblue',
    errorbar=None,
)

# Title and axis labels, all in red at font size 25.
line_ax.set_title('Feature: {Wind Speed}', fontsize=25, color='red')
line_ax.set_ylabel("WIND SPEED (km/h)", fontsize=25, color='red')
line_ax.set_xlabel("Year", fontsize=25, color='red')

plt.xticks(fontsize=17)
# Y ticks every 2 km/h from 0 up to 44 (the range stops before 46).
plt.yticks(range(0, 46, 2), fontsize=17)

line_ax.grid()

plt.show()

In [None]:
# Humidity over time: 'Date' on the x axis, 'HUMIDITY' on the y axis.
plt.figure(figsize=(25, 14))

# lineplot returns the Axes it drew on; errorbar=None switches the error band off.
line_ax = sns.lineplot(
    x=df8['Date'],
    y=df8["HUMIDITY"],
    color='deeppink',
    errorbar=None,
)

# Title and axis labels, all in red at font size 25.
line_ax.set_title('Feature: {HUMIDITY}', fontsize=25, color='red')
line_ax.set_ylabel("HUMIDITY", fontsize=25, color='red')
line_ax.set_xlabel("Year", fontsize=25, color='red')

plt.xticks(fontsize=17)
# Y ticks in steps of 0.05 starting at 0.25; the stop value 1.05 is excluded by arange.
plt.yticks(np.arange(0.25, 1.05, 0.05), fontsize=17)

line_ax.grid()

plt.show()

In [None]:
# Pressure over time: 'Date' on the x axis, 'PRESSURE (millibars)' on the y axis.
plt.figure(figsize=(25, 14))
# errorbar=None switches the error band off.
sns.lineplot(x=df8['Date'], y=df8["PRESSURE (millibars)"], color='gold', errorbar=None)

plt.title('Feature: {Pressure}', fontsize=25, color='red')
# Label fixed: the original text ended with a stray extra ")".
plt.ylabel("PRESSURE (millibars)", fontsize=25, color='red')
plt.xlabel("Year", fontsize=25, color='red')

plt.xticks(fontsize=17)
# Y ticks every 5 millibars from 975 up to 1050 (the range stops before 1055).
plt.yticks(range(975, 1055, 5), fontsize=17)

plt.grid()

plt.show()

In [None]:
# Temperature over time: 'Date' on the x axis, 'TEMPERATURE (C)' on the y axis.
plt.figure(figsize=(25, 12))
# errorbar=None switches the error band off.
sns.lineplot(x=df8['Date'], y=df8["TEMPERATURE (C)"], color='deepskyblue', errorbar=None)

plt.title('Feature: {Temperature}', fontsize=25, color='red')
# Label fixed: the original text ended with a stray extra ")".
plt.ylabel("TEMPERATURE (C)", fontsize=25, color='red')
plt.xlabel("Year", fontsize=25, color='red')

plt.xticks(fontsize=17)
# Y ticks every 5 degrees from -15 up to 35 (the range stops before 40).
plt.yticks(range(-15, 40, 5), fontsize=17)

plt.grid()

plt.show()

 #### From the last 4 plots, you can see the average, maximum, minimum of each feature during each year. It’s not that much clear, but you can find some useful information based on your aims. For instance, in the humidity fluctuation plot, we can see that the lowest values are mostly related to the middle of 2012 and so on.
  -------------------------------------------------------------------------------------------------------------------------
  #### These plots are the output of “lineplot” in seaborn library. So it uses the average values. Due to this fact, we can’t see some values like that maximum value of wind speed near to 60 km/h in fluctuation plot.
   -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Display df8 again; the scatter grids below read its 'year', 'month', 'day' and 'hour' columns.
df8

In [None]:
# 2x2 grid of scatter plots: each weather feature of df8 against the 'year' column.
# Every panel now receives its own title, x ticks and axis labels; previously only
# the top-left panel was configured and the other three were left unlabeled.

# One entry per panel, in row-major order: (df8 column, panel title, marker color).
year_panels = [
    ("TEMPERATURE (C)", 'Temperature VS Year', 'cyan'),
    ("WIND SPEED (km/h)", 'Wind Speed VS Year', 'magenta'),
    ("PRESSURE (millibars)", 'Pressure VS Year', 'darkcyan'),
    ("HUMIDITY", 'HUMIDITY VS Year', 'deeppink'),
]

# Create the figure (15 x 10 inches) with a 2x2 array of Axes.
fig, ax = plt.subplots(figsize=(15, 10), nrows=2, ncols=2)

# ax.flatten() walks the grid top-left, top-right, bottom-left, bottom-right.
for a, (column, title, color) in zip(ax.flatten(), year_panels):
    a.scatter(x=df8['year'], y=df8[column], color=color)

    a.set_title(title, fontsize=16, color='darkblue')
    # One tick per year from 2006 to 2016.
    a.set_xticks(range(2006, 2017, 1))

    a.set_ylabel(column, fontsize=14, color="red")
    a.set_xlabel('Year', fontsize=14, color="red")

    a.grid(True)

# Shared heading above all four panels.
fig.suptitle("Scatter plots", color="red", fontsize=20, fontweight='bold')

# Vertical (hspace) and horizontal (wspace) spacing between the panels.
fig.subplots_adjust(hspace=0.5, wspace=0.3)

plt.show()

 #### **Temperature VS Year:** as you can see, the highest temperature is related to the year 2007 and the lowest one is related to the year 2012 that is lower than -20 C. In the year 2012, we have a larger range for temperature values than other years.
 -------------------------------------------------------------------------------------------------------------------------
 #### **Wind Speed VS Year:** as the plot represents, the highest value of wind speed is some value near to 65 km/h that is related to the year 2007 and from our last analysis, we understood that this wind speed is related to the rare condition which is “Dangerously windy and partly cloudy”. And the lowest value of wind speed is equal to zero for all years. 
 -------------------------------------------------------------------------------------------------------------------------
  #### **Pressure VS Year:** as you can see, the year 2015, has the largest range for the pressure values. And in the preceding plots, we saw some points with pressure under 985 millibars. Now we can understand that these points are for the year 2015. The highest pressure has occurred in the year 2008 which is near to 1050 millibars.
 -------------------------------------------------------------------------------------------------------------------------
  #### **Humidity VS Year:** as the plot represents, the lowest humidity is equal to zero that has occurred in year 2008, 2009, 2012. And the highest values is equal to 1 that it has happened in all the years.


In [None]:
# 2x2 grid of scatter plots: each weather feature of df8 against the 'month' column.
# Fixes: the extra plt.figure(...) call that opened a second, empty figure before
# plt.subplots has been removed, and the pressure y label now uses font size 14
# like the other three panels.

# One entry per panel, in row-major order: (df8 column, panel title, marker color).
month_panels = [
    ("TEMPERATURE (C)", 'Temperature VS Month', 'cyan'),
    ("WIND SPEED (km/h)", 'Wind Speed VS Month', 'magenta'),
    ("PRESSURE (millibars)", 'Pressure VS Month', 'darkcyan'),
    ("HUMIDITY", 'HUMIDITY VS Month', 'deeppink'),
]

fig, ax = plt.subplots(figsize=(14, 10), nrows=2, ncols=2)

# ax.flatten() walks the grid top-left, top-right, bottom-left, bottom-right.
for a, (column, title, color) in zip(ax.flatten(), month_panels):
    a.scatter(x=df8['month'], y=df8[column], color=color)

    a.set_title(title, fontsize=16, color='darkblue')
    # One tick per month, 1 through 12.
    a.set_xticks(range(1, 13, 1))

    a.set_ylabel(column, fontsize=14, color="Red")
    a.set_xlabel('Month', fontsize=14, color="Red")

    a.grid(True)

# Shared heading above all four panels.
fig.suptitle("Scatter plots", color="red", fontsize=20, fontweight='bold')

# Vertical (hspace) and horizontal (wspace) spacing between the panels.
fig.subplots_adjust(hspace=0.5, wspace=0.3)

plt.show()


 #### **Temperature VS Month:** according to the plot, month 2 that is February, has the largest range for temperature values. The lowest temperature has occurred in this month too. And the highest one has occurred in month 7 that is July. It’s clear that by passing from month 1 to 7, the average temperature increases and by passing from month 7 to 12, the average temperature decreases.
 -------------------------------------------------------------------------------------------------------------------------

 #### **Wind Speed VS Month:** based on the plot, the highest wind speed that is near to 65 km/h, is related to the month 1 that is January. In the last page, we understood that this value has recorded in the year 2007. So we are completing our information about the dataset step by step. The lowest value is equal to 0 that is related to the condition that there is no movement of wind at a particular location. It’s clear that the month January, has the largest range of wind speed.
  -------------------------------------------------------------------------------------------------------------------------
 #### **Pressure VS Month:** as you can see in the middle of the plot, as we pass from winter to spring, the range of pressure values decreases: the largest range is in January and the smallest is in months 7 and 8, which are July and August. So it can be concluded that in summer we have the smallest range of pressure values. And when we pass from summer to autumn, this range increases. The lowest value of pressure has occurred in month 1 and is under 980 millibars, while the pressures in the other months are higher than this value. And the maximum value for pressure has occurred in month 2, which is February.
 -------------------------------------------------------------------------------------------------------------------------
 #### **Humidity VS Month:** as you can see, for month 12 which is December, all the Humidity values are higher than 0.4 but there are some points equal to zero. So maybe something rare has occurred in that month that made the condition to have such a low humidity. It may need your further analysis. Or you can ask from the dataset owner. Month 3 and 7 which are March and July respectively, have the highest ranges of humidity changes.
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# 2x2 grid of scatter plots: each weather feature of df8 against the 'day' column.

# One entry per panel, in row-major order: (df8 column, panel title, marker color).
day_panels = [
    ("TEMPERATURE (C)", 'Temperature VS Day', 'cyan'),
    ("WIND SPEED (km/h)", 'Wind Speed VS Day', 'magenta'),
    ("PRESSURE (millibars)", 'Pressure VS Day', 'darkcyan'),
    ("HUMIDITY", 'HUMIDITY VS Day', 'deeppink'),
]

fig, ax = plt.subplots(figsize=(26, 12), nrows=2, ncols=2)

# ax.flatten() walks the grid top-left, top-right, bottom-left, bottom-right.
for a, (column, title, color) in zip(ax.flatten(), day_panels):
    a.scatter(x=df8['day'], y=df8[column], color=color)

    a.set_title(title, fontsize=16, color='darkblue')
    # One tick per day of the month, 1 through 31.
    a.set_xticks(range(1, 32, 1))

    a.grid(True)
    a.set_ylabel(column, fontsize=14, color="Red")
    a.set_xlabel('Day', fontsize=14, color="Red")

# Shared heading above all four panels.
fig.suptitle("Scatter plots", color="red", fontsize=20, fontweight='bold')

# Vertical (hspace) and horizontal (wspace) spacing between the panels.
fig.subplots_adjust(hspace=0.4, wspace=0.1)

plt.show()

 #### **Temperature VS Day:** according to the plot, most of the temperatures are higher than 0 but there are some records in each day that the temperature is lower than 0. It’s clear that the average temperature of each day is a value in a range of 0-20
  -------------------------------------------------------------------------------------------------------------------------
 #### **Wind Speed VS Day:** the highest value of wind speed is higher than 60km/h which is related to the day 29. In general, the highest value of wind speed for each day is in range of 40-50 km/h and the lowest one is equal to 0. And the range for wind speed changes is approximately the same in each day.
  -------------------------------------------------------------------------------------------------------------------------
 #### **Pressure VS Day:** based on the plot, the highest ranges of pressure changes are related to the days 30 and 31 and the lowest ranges are related to the days 3 and 4. In the previous plots, there were some points with pressure lower than 985 millibars, and from this plot it can be seen that these pressures are related to the days 30 and 31. 
  -------------------------------------------------------------------------------------------------------------------------
  #### **Humidity VS Day:** the highest range for humidity changes is related to the day 17. We can see that the maximum humidity for each day is equal to 1, there are some points with the value of 0 that they can be seen in the days 2,3,5,8,9,10,11,17,20,21,25.
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# 2x2 grid of scatter plots: each weather feature of df8 against the 'hour' column.

# One entry per panel, in row-major order: (df8 column, panel title, marker color).
hour_panels = [
    ("TEMPERATURE (C)", 'Temperature vs Hour', 'cyan'),
    ("WIND SPEED (km/h)", 'Wind Speed vs Hour', 'magenta'),
    ("PRESSURE (millibars)", 'Pressure vs Hour', 'darkcyan'),
    ("HUMIDITY", 'HUMIDITY vs Hour', 'deeppink'),
]

fig, ax = plt.subplots(figsize=(20, 12), nrows=2, ncols=2)

# ax.flatten() walks the grid top-left, top-right, bottom-left, bottom-right.
for a, (column, title, color) in zip(ax.flatten(), hour_panels):
    a.scatter(x=df8['hour'], y=df8[column], color=color)

    a.set_title(title, fontsize=12, color='darkblue')
    # One tick per hour of the day, 0 through 23.
    a.set_xticks(range(0, 24, 1))

    a.grid(True)
    a.set_ylabel(column, fontsize=12, color="Red")
    a.set_xlabel('Hour', fontsize=12, color="Red")

# Shared heading above all four panels.
fig.suptitle("Scatter plots", color="red", fontsize=20, fontweight='bold')

# Vertical (hspace) and horizontal (wspace) spacing between the panels.
fig.subplots_adjust(hspace=0.5, wspace=0.2)

plt.show()

 #### **Temperature VS Hour:** according to the plot, the mean temperature for each hour is in the range of 5-20 C. We can see a large range of temperature changes in each hour. So, depending on the month and season, we can have different values for the temperature.
 -------------------------------------------------------------------------------------------------------------------------
 #### **Wind Speed VS Hour:** as the plot represents, we can see that the highest value of wind speed is related to the hour 12. In our last analysis we understood that this value is related to the year 2007 and the month January and the day 29. So now we have a complete information about that record. 
 -------------------------------------------------------------------------------------------------------------------------
 #### **Pressure VS Hour:** based on the plot, there are some points with pressure lower than 985 millibars. These points can be seen in almost every hour. But the average pressure of each hour is something near to 1020 millibars. The maximum value of each hour is less than 1050 millibars. 
 -------------------------------------------------------------------------------------------------------------------------
 #### **Humidity VS Hour:** from the plot above, it’s clear that the hours 0-3, have the lowest range of humidity changes. And the hour 11 and 13, have the largest one. In this plot, we can see some records with humidity equal to zero which they can be seen in 50 percent of hours. 
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Average temperature for each value of the 'year' column, as a one-column
# DataFrame indexed by year.
df14 = (
    df8.groupby("year")['TEMPERATURE (C)']
    .mean()
    .to_frame()
)
df14

In [None]:
# Display df8 once more before the time-indexed copy (df13) is derived from it.
df8

In [None]:
# Build df13 from a copy of df8 (df8 itself is left untouched):
#   1. add a 'Time' column holding the time-of-day part of 'Formatted Date',
#   2. move 'Formatted Date' into the index,
#   3. drop the helper columns 'Date', 'Time', 'month', 'day', 'year' and 'hour'.
# NOTE(review): 'Time' is created and then dropped again in the same cell, so it
# has no effect on the result unless df8 already carries a 'Time' column - confirm.
df13 = (
    df8.copy()
    .assign(Time=lambda frame: frame['Formatted Date'].dt.time)
    .set_index('Formatted Date')
    .drop(columns=["Date", "Time", "month", "day", "year", "hour"])
)

## Line Plots

In [None]:
# Display df13, the copy of df8 indexed by 'Formatted Date' with the date-part columns removed.
df13

## Yearly Mean DataFrame

In [None]:
# Coerce every column of df13 to a numeric dtype; values that cannot be parsed
# become NaN.
df13_numeric = df13.apply(lambda column: pd.to_numeric(column, errors='coerce'))

# Average each column per calendar year, then discard the 'SUMMARY' and
# 'PRECIP TYPE' columns from the yearly result.
# NOTE(review): newer pandas releases deprecate the 'y' resample alias in favour
# of 'YE' - confirm against the installed pandas version.
df_yearly = (
    df13_numeric
    .resample('y')
    .mean()
    .drop(columns=["SUMMARY", "PRECIP TYPE"])
)

# Show the yearly means.
df_yearly

In [None]:
# Yearly mean temperature: the year taken from df_yearly's index on the x axis.
plt.figure(figsize=(10, 6))

# lineplot returns the Axes it drew on; errorbar=None switches the error band off.
yearly_ax = sns.lineplot(
    x=df_yearly.index.year,
    y=df_yearly["TEMPERATURE (C)"],
    color='darkblue',
    errorbar=None,
)

# Title and axis labels, all in dark red at font size 15.
yearly_ax.set_title('Feature: {Temperature}', fontsize=15, color='darkred')
yearly_ax.set_ylabel("TEMPERATURE (C)", fontsize=15, color='darkred')
yearly_ax.set_xlabel("Year", fontsize=15, color='darkred')

yearly_ax.grid()

plt.show()

 #### According to the plot, the maximum average temperature is related to the year 2014, which is higher than 12.75 C, and the minimum averages are related to the years 2006 and 2010, with values under 11.25 C.
 -------------------------------------------------------------------------------------------------------------------------
 #### A significant change in average temperature can be seen while passing from 2006 to 2007 and from 2009 to 2010.
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Yearly mean wind speed: the year taken from df_yearly's index on the x axis.
plt.figure(figsize=(10, 6))

# lineplot returns the Axes it drew on; errorbar=None switches the error band off.
yearly_ax = sns.lineplot(
    x=df_yearly.index.year,
    y=df_yearly["WIND SPEED (km/h)"],
    color='darkblue',
    errorbar=None,
)

# Title and axis labels, all in dark red at font size 15.
yearly_ax.set_title('Feature: {Wind Speed}', fontsize=15, color='darkred')
yearly_ax.set_ylabel("WIND SPEED (km/h)", fontsize=15, color='darkred')
yearly_ax.set_xlabel("Year", fontsize=15, color='darkred')

yearly_ax.grid()

plt.show()

 #### According to the plot, the lowest level of wind speed average is for the year 2011 and the highest one is related to the year 2009. There is a notable variation in the mean wind speed between 2010 and 2011, as well as between 2011 and 2012.
 -------------------------------------------------------------------------------------------------------------------------
 #### There is an increasing trend in the average wind speed as we pass from 2006 to 2009. The trend changes to the decreasing mode when we pass from 2009 to 2011. 
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Yearly mean pressure: the year taken from df_yearly's index on the x axis.
plt.figure(figsize=(10, 6))

# lineplot returns the Axes it drew on; errorbar=None switches the error band off.
yearly_ax = sns.lineplot(
    x=df_yearly.index.year,
    y=df_yearly["PRESSURE (millibars)"],
    color='darkblue',
    errorbar=None,
)

# Title and axis labels, all in dark red at font size 15.
yearly_ax.set_title('Feature: {Pressure}', fontsize=15, color='darkred')
yearly_ax.set_ylabel("PRESSURE (millibars)", fontsize=15, color='darkred')
yearly_ax.set_xlabel("Year", fontsize=15, color='darkred')

yearly_ax.grid()

plt.show()

 #### As shown in the plot, the highest average pressure is related to the year 2011, which is near 1019 millibars, and the lowest one is related to the year 2010, which is near 1014 millibars. A significant difference in the average pressure level is observed between 2010 and 2011.
 -------------------------------------------------------------------------------------------------------------------------
 #### There is also an increasing trend for the average of pressure level by passing from 2006 to 2010.


In [None]:
# Yearly mean humidity: the year taken from df_yearly's index on the x axis.
plt.figure(figsize=(10, 6))
# The humidity values are multiplied by 100 so the y axis reads as a percentage;
# errorbar=None switches the error band off.
sns.lineplot(x=df_yearly.index.year, y=df_yearly["HUMIDITY"]*100, color='darkblue', errorbar=None)

plt.title('Feature: {HUMIDITY}', fontsize=15, color='darkred')
# The label now states the unit, since the plotted values are scaled by 100.
plt.ylabel("HUMIDITY (%)", fontsize=15, color='darkred')
plt.xlabel("Year", fontsize=15, color='darkred')

plt.grid()

plt.show()

 #### As the plot shows, the highest average humidity is related to the year 2010, which is near 80%, and the lowest one is related to the year 2007, which is under 65%. A significant difference in the average humidity level is observed between 2006-2007 and 2009-2010.
  -------------------------------------------------------------------------------------------------------------------------

## Monthly Mean DataFrame

In [None]:
# Coerce every column of df13 to a numeric dtype; values that cannot be parsed
# become NaN.
df13_numeric2 = df13.apply(lambda column: pd.to_numeric(column, errors='coerce'))

# Average each column per month ('MS' labels each bin with the month start),
# then discard the 'SUMMARY' and 'PRECIP TYPE' columns from the monthly result.
df_monthly = (
    df13_numeric2
    .resample('MS')
    .mean()
    .drop(columns=["SUMMARY", "PRECIP TYPE"])
)

# Show the monthly means.
df_monthly

In [None]:
# Mean temperature by month number: the month taken from df_monthly's index on the x axis.
plt.figure(figsize=(10, 6))

# lineplot returns the Axes it drew on; errorbar=None switches the error band off.
monthly_ax = sns.lineplot(
    x=df_monthly.index.month,
    y=df_monthly["TEMPERATURE (C)"],
    color='darkblue',
    errorbar=None,
)

# Title and axis labels, all in red at font size 15.
monthly_ax.set_title('Feature: {Temperature}', fontsize=15, color='red')
monthly_ax.set_ylabel("TEMPERATURE (C)", fontsize=15, color='red')
monthly_ax.set_xlabel("Month", fontsize=15, color='red')
monthly_ax.grid()

plt.show()

 #### The average temperature reaches its maximum value in month 7, which is in summer, and its minimum value in month 1, which is in winter. The plot has a parabolic shape. As we pass from winter to the end of spring, the average temperature increases, but when we pass from early summer to the end of autumn, the average temperature decreases.
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Monthly mean temperature drawn as a time series over df_monthly's index.
monthly_temperature = df_monthly["TEMPERATURE (C)"]

plt.figure(figsize=(8, 5))

# x: the index of the monthly means, y: the mean temperature of each month.
plt.plot(monthly_temperature.index, monthly_temperature.values, color='darkblue')

# Axis labels and title, all in red at font size 13.
plt.xlabel("Year", fontsize=13, color='red')
plt.ylabel("TEMPERATURE (C)", fontsize=13, color='red')
plt.title("Average Temperature of years during their months", fontsize=13, color='red')

plt.xticks(fontsize=11)
# Y ticks every 2 degrees from -8 up to 24 (the range stops before 26).
plt.yticks(range(-8, 26, 2), fontsize=11)

plt.grid()

plt.show()

  #### And you can see the same trend with more details in the plot above.
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Mean wind speed by month number: the month taken from df_monthly's index on the x axis.
plt.figure(figsize=(10, 6))

# lineplot returns the Axes it drew on; errorbar=None switches the error band off.
monthly_ax = sns.lineplot(
    x=df_monthly.index.month,
    y=df_monthly["WIND SPEED (km/h)"],
    color='darkblue',
    errorbar=None,
)

# Title and axis labels, all in red at font size 15.
monthly_ax.set_title('Feature: {Wind Speed}', fontsize=15, color='red')
monthly_ax.set_ylabel("WIND SPEED (km/h)", fontsize=15, color='red')
monthly_ax.set_xlabel("Month", fontsize=15, color='red')

monthly_ax.grid()

plt.show()

 #### The average wind speed reaches its maximum value in the month 3. And it reaches its minimum value in the month 8. When we pass from January to March in winter, the average wind speed increases, but when we pass from winter to spring, the average wind speed decreases. The wind speed approximately remains constant from the month 6-7 (June-July). By passing from summer to autumn, the average wind speed increases. 
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Monthly mean wind speed drawn as a time series over df_monthly's index.
monthly_wind_speed = df_monthly["WIND SPEED (km/h)"]

plt.figure(figsize=(8, 5))

# x: the index of the monthly means, y: the mean wind speed of each month.
plt.plot(monthly_wind_speed.index, monthly_wind_speed.values, color='darkblue')

# Axis labels and title, all in red at font size 13.
plt.xlabel("Year", fontsize=13, color='red')
plt.ylabel("WIND SPEED (km/h)", fontsize=13, color='red')
plt.title("Average Wind Speed of years during their months", fontsize=13, color='red')

plt.xticks(fontsize=10)
# Y ticks every 1 km/h from 6 up to 18 (the range stops before 19).
plt.yticks(range(6, 19, 1), fontsize=10)

plt.grid()

plt.show()

 #### And you can see the same trend with more details in the plot above.
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Seasonal profile of pressure. The rows of df_monthly (built earlier in the
# notebook) are grouped by the calendar month of their index; seaborn draws the
# mean of every group as a single line.
label_style = dict(fontsize=15, color='red')

plt.figure(figsize=(10, 6))

# errorbar=None turns off the shaded confidence band around the mean line.
sns.lineplot(
    x=df_monthly.index.month,
    y=df_monthly["PRESSURE (millibars)"],
    color='darkblue',
    errorbar=None,
)

# Explicit labels replace the ones seaborn derives from the input names.
plt.xlabel("Month", **label_style)
plt.ylabel("PRESSURE (millibars)", **label_style)
plt.title('Feature: {Pressure}', **label_style)

plt.grid()
plt.show()

 #### The highest average pressure occurs in month 12, where it is higher than 1021 millibars, and the lowest occurs in month 5, with 1014 millibars. From month 1 to month 5 the average pressure decreases, and from month 5 to month 10 it increases. So it can be concluded that the average pressure is lower in spring than in the other seasons and higher in autumn than in the other seasons.
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Pressure as a time series: one point per row of df_monthly, positioned at the
# row's index value (x) against its pressure value (y).
pressure_monthly = df_monthly["PRESSURE (millibars)"]
axis_text = dict(fontsize=13, color='red')

plt.figure(figsize=(8, 5))
plt.plot(pressure_monthly.index, pressure_monthly.values, color='darkblue')

plt.title("Average Pressure of years during their months", **axis_text)
plt.xlabel("Year", **axis_text)
plt.ylabel("PRESSURE (millibars)", **axis_text)

# Keep the automatic x tick positions; pin the y ticks to 1005, 1007, ..., 1035.
plt.xticks(fontsize=10)
plt.yticks(range(1005, 1036, 2), fontsize=10)

plt.grid()
plt.show()

   #### And you can see the same trend with more details in the plot above.
     -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Seasonal profile of humidity. The rows of df_monthly (built earlier in the
# notebook) are grouped by the calendar month of their index; seaborn draws the
# mean of every group as a single line.
label_style = dict(fontsize=15, color='red')

plt.figure(figsize=(10, 6))

# errorbar=None turns off the shaded confidence band around the mean line.
sns.lineplot(
    x=df_monthly.index.month,
    y=df_monthly["HUMIDITY"],
    color='darkblue',
    errorbar=None,
)

# Explicit labels replace the ones seaborn derives from the input names.
plt.xlabel("Month", **label_style)
plt.ylabel("HUMIDITY", **label_style)
plt.title('Feature: {HUMIDITY}', **label_style)

plt.grid()
plt.show()

 #### As the plot shows, the maximum humidity average is related to the month 12 which is December. And the lowest values are related to the months 4, 7, 8 with the values under 0.65. It’s clear in the plot that the average humidity is higher in winter and autumn and lower in summer and spring.
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Humidity as a time series: one point per row of df_monthly, positioned at the
# row's index value (x) against its humidity value (y).
humidity_monthly = df_monthly["HUMIDITY"]
axis_text = dict(fontsize=13, color='red')

plt.figure(figsize=(8, 5))
plt.plot(humidity_monthly.index, humidity_monthly.values, color='darkblue')

plt.title("Average HUMIDITY of years during their months", **axis_text)
plt.xlabel("Year", **axis_text)
plt.ylabel("HUMIDITY", **axis_text)

# Keep the automatic x tick positions; y ticks start at 0.45 and advance in
# steps of 0.05 up to the 1.02 stop value of np.arange.
plt.xticks(fontsize=10)
plt.yticks(np.arange(0.45, 1.02, 0.05), fontsize=10)

plt.grid()
plt.show()

 #### And you can see the same trend with more details in the plot above.
     -------------------------------------------------------------------------------------------------------------------------

## Daily Mean DataFrame

In [None]:
# Coerce every column of df13 to numbers; values that cannot be parsed become NaN.
df13_numeric3 = df13.apply(pd.to_numeric, errors='coerce')

# Average the readings of each calendar day, then discard the two text columns
# (SUMMARY, PRECIP TYPE), which carry no numeric information after coercion.
df_daily = (
    df13_numeric3
    .resample('D')
    .mean()
    .drop(["SUMMARY", "PRECIP TYPE"], axis=1)
)
df_daily

In [None]:
# Display df_daily with repeated rows removed (the first occurrence is kept).
# The result is not assigned, so df_daily itself is left unchanged.
df_daily.drop_duplicates(keep="first")

In [None]:
# Day-of-month profile of temperature. df_daily holds one row per calendar day;
# the rows are grouped by the day number of their index and seaborn draws the
# mean of every group as a single line.

plt.figure(figsize=(12,6))

# errorbar=None turns off the shaded confidence band around the mean line.
sns.lineplot(x=df_daily.index.day, y=df_daily["TEMPERATURE (C)"], color='darkgreen',errorbar=None)

plt.title('Feature: {Temperature}', fontsize=15, color = 'darkblue')
plt.ylabel("TEMPERATURE (C)", fontsize=15, color = 'darkblue')
plt.xlabel("Day", fontsize=15, color = 'darkblue')

# Day-of-month values run from 1 to 31, so the ticks start at 1. A tick at 0
# would only stretch the x-axis to a day that does not exist.
plt.xticks(range(1, 32))

plt.grid()


plt.show()

 #### In the plot above, the highest value of temperature is related to the day 23 and the lowest one is related to the day 31. But most of the data are near to 11.8-12.2 C. 
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Temperature as a daily time series: one point per row of df_daily, positioned
# at the row's index value (x) against its temperature value (y).
temperature_daily = df_daily["TEMPERATURE (C)"]
axis_text = dict(fontsize=13, color='red')

plt.figure(figsize=(8, 5))
plt.plot(temperature_daily.index, temperature_daily.values, color='darkgreen')

plt.title("Average Temperature of years during their days", **axis_text)
plt.xlabel("Year", **axis_text)
plt.ylabel("TEMPERATURE (C)", **axis_text)

# Only the tick label size changes; tick positions stay automatic.
plt.xticks(fontsize=11)
plt.yticks(fontsize=11)

plt.grid()
plt.show()

 #### The lowest daily average temperature belongs to a day in the year 2012, which was also concluded in the previous analysis, but there are not many differences among the maximum values.
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Day-of-month profile of wind speed. df_daily holds one row per calendar day;
# the rows are grouped by the day number of their index and seaborn draws the
# mean of every group as a single line.

plt.figure(figsize=(12,6))

# errorbar=None turns off the shaded confidence band around the mean line.
sns.lineplot(x=df_daily.index.day, y=df_daily["WIND SPEED (km/h)"], color='darkgreen', errorbar=None)

plt.title('Feature: {Wind Speed}', fontsize=15, color = 'darkblue')
plt.ylabel("WIND SPEED (km/h)", fontsize=15, color = 'darkblue')
plt.xlabel("Day", fontsize=15, color = 'darkblue')

# Day-of-month values run from 1 to 31, so the ticks start at 1. A tick at 0
# would only stretch the x-axis to a day that does not exist.
plt.xticks(range(1, 32))

plt.grid()


plt.show()

 #### As indicated by the plot, the lowest values of the average wind speed belong to days 2 and 3 and are close to 10 km/h. The highest value belongs to day 5 and is higher than 12 km/h. Overall, the average wind speed is approximately the same on all days, and there are not many differences between them.
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Wind speed as a daily time series: one point per row of df_daily, positioned
# at the row's index value (x) against its wind-speed value (y).
wind_daily = df_daily["WIND SPEED (km/h)"]
axis_text = dict(fontsize=13, color='red')

plt.figure(figsize=(8, 5))
plt.plot(wind_daily.index, wind_daily.values, color='darkgreen')

plt.title("Average Wind Speed of years during their days", **axis_text)
plt.xlabel("Year", **axis_text)
plt.ylabel("WIND SPEED (km/h)", **axis_text)

# Only the tick label size changes; tick positions stay automatic.
plt.xticks(fontsize=11)
plt.yticks(fontsize=11)

plt.grid()
plt.show()

  #### From this plot we can understand that the lowest average of wind speed is related to a day in the year 2016 and the highest one is related to a day in the year 2008.
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Day-of-month profile of pressure. df_daily holds one row per calendar day;
# the rows are grouped by the day number of their index and seaborn draws the
# mean of every group as a single line.

plt.figure(figsize=(12,6))

# errorbar=None turns off the shaded confidence band around the mean line.
sns.lineplot(x=df_daily.index.day, y=df_daily["PRESSURE (millibars)"], color='darkgreen', errorbar=None)

plt.title('Feature: {Pressure}', fontsize=15, color = 'darkblue')
plt.ylabel("PRESSURE (millibars)", fontsize=15, color = 'darkblue')
plt.xlabel("Day", fontsize=15, color = 'darkblue')

# Day-of-month values run from 1 to 31, so the ticks start at 1. A tick at 0
# would only stretch the x-axis to a day that does not exist.
plt.xticks(range(1, 32))

plt.grid()


plt.show()

 #### The plot above shows the average pressure for each day. As you can see, the highest value belongs to day 26 and the lowest to day 5. There is not much difference between the lowest and the highest pressure values on the vertical axis, so it can be concluded that all days have approximately the same average pressure throughout the year.
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Pressure as a daily time series: one point per row of df_daily, positioned
# at the row's index value (x) against its pressure value (y).
pressure_daily = df_daily["PRESSURE (millibars)"]
axis_text = dict(fontsize=13, color='red')

plt.figure(figsize=(8, 5))
plt.plot(pressure_daily.index, pressure_daily.values, color='darkgreen')

plt.title("Average PRESSURE (millibars) of years during their days", **axis_text)
plt.xlabel("Year", **axis_text)
plt.ylabel("PRESSURE (millibars)", **axis_text)

# Only the tick label size changes; tick positions stay automatic.
plt.xticks(fontsize=11)
plt.yticks(fontsize=11)

plt.grid()
plt.show()

 #### It’s clear from the plot above that the lowest level of average pressure has occurred in the year 2015 and the highest one in 2006. 
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Day-of-month profile of humidity. df_daily holds one row per calendar day;
# the rows are grouped by the day number of their index and seaborn draws the
# mean of every group as a single line.

plt.figure(figsize=(12,6))

# errorbar=None turns off the shaded confidence band around the mean line.
sns.lineplot(x=df_daily.index.day, y=df_daily["HUMIDITY"], color='darkgreen', errorbar=None)

plt.title('Feature: {HUMIDITY}', fontsize=15, color = 'darkblue')
plt.ylabel("HUMIDITY", fontsize=15, color = 'darkblue')
plt.xlabel("Day", fontsize=15, color = 'darkblue')

# Day-of-month values run from 1 to 31, so the ticks start at 1. A tick at 0
# would only stretch the x-axis to a day that does not exist.
plt.xticks(range(1, 32))

plt.grid()


plt.show()

  #### As the plot represents, the highest humidity average is related to the day 11 and the lowest one is related to the day 8. 
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Humidity as a daily time series: one point per row of df_daily, positioned
# at the row's index value (x) against its humidity value (y).
humidity_daily = df_daily["HUMIDITY"]
axis_text = dict(fontsize=13, color='red')

plt.figure(figsize=(8, 5))
plt.plot(humidity_daily.index, humidity_daily.values, color='darkgreen')

plt.title("Average HUMIDITY of years during their days", **axis_text)
plt.xlabel("Year", **axis_text)
plt.ylabel("HUMIDITY", **axis_text)

# Only the tick label size changes; tick positions stay automatic.
plt.xticks(fontsize=11)
plt.yticks(fontsize=11)

plt.grid()
plt.show()

 #### The last plot has no point with humidity near to 1 because the line plot just takes the average of averages. But in this plot, you can see a larger range of humidity changes. The lowest humidity average has occurred in the year 2012. You can obtain some other information based on your aim.
  -------------------------------------------------------------------------------------------------------------------------

## Box plots 

## How to explain box plots?
 #### Boxplots are a graphical representation of data that summarize the distribution of a dataset. They are useful for identifying outliers, skewness, and the spread of the data.
 #### The box in a boxplot represents the interquartile range (IQR), which is the range between the 25th and 75th percentiles of the data. The line inside the box represents the median. The whiskers extend from the box to the minimum and maximum values that are not considered outliers. Outliers are plotted as individual points beyond the whiskers.
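 #### To analyze a boxplot, you should first examine the shape of the box. If the box is symmetrical (the median line sits near the middle of the box), then the data is distributed roughly symmetrically around the median. If the box is skewed to one side or the other (the median line sits close to one edge), then the data is not symmetrically distributed.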
 #### Next, you should examine the length of the whiskers. If the whiskers are short, then the data is tightly clustered around the median. If the whiskers are long, then the data is more spread out.
 #### Finally, you should look for outliers. Outliers are data points that fall outside of the whiskers. They may represent errors in the data or extreme values that need to be further investigated.
 #### Overall, boxplots are a useful tool for analyzing data and identifying patterns and trends. By examining the shape, length of whiskers, and outliers, you can gain insight into the distribution of your data and make informed decisions based on your findings.


In [None]:
# Number of rows per SUMMARY category, most frequent first.
counts = df8.loc[:, "SUMMARY"].value_counts()
counts

In [None]:
# Shared text styles: dark-blue size-12 tick labels, red size-18 axis labels.
tick_style = dict(fontsize=12, color='darkblue')
label_style = dict(fontsize=18, color='red')

# Tall 12 x 16 inch canvas.
plt.figure(figsize=(12, 16))

# One box per SUMMARY category showing the distribution of HUMIDITY.
sns.boxplot(x=df8['SUMMARY'], y=df8['HUMIDITY'])

# Category names are rotated 90 degrees so they do not overlap.
plt.xticks(rotation=90, **tick_style)

# y ticks start at 0 and advance in steps of 0.05 (np.arange stop value 1.02).
plt.yticks(np.arange(0, 1.02, 0.05), **tick_style)

plt.xlabel("SUMMARY", **label_style)
plt.ylabel("HUMIDITY", **label_style)

# The title is slightly smaller (16) than the axis labels.
plt.title("HUMIDITY vs SUMMARY", fontsize=16, color='red')

plt.grid()

# Reserve the bottom 30% of the figure for the rotated category names.
plt.subplots_adjust(bottom=0.3)

plt.show()

  #### The highest range of humidity changes is related to the “Partly cloudy” type. And the lowest one is related to the “Rain”.
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Distribution of temperature within every SUMMARY category, one box each.
tick_style = dict(fontsize=12, color='darkblue')
label_style = dict(fontsize=18, color='red')

plt.figure(figsize=(14, 12))
sns.boxplot(x=df8['SUMMARY'], y=df8['TEMPERATURE (C)'])

# Category names are rotated 90 degrees so they do not overlap.
plt.xticks(rotation=90, **tick_style)
plt.yticks(**tick_style)

plt.title("Temperature vs SUMMARY", fontsize=16, color='red')
plt.xlabel("SUMMARY", **label_style)
plt.ylabel("TEMPERATURE (C)", **label_style)

plt.grid()

# Reserve the bottom half of the figure for the rotated category names.
plt.subplots_adjust(bottom=0.5)

plt.show()

  #### The highest ranges of temperature changes are related to the “mostly cloudy”, “partly cloudy”, “clear”, “Breezy and partly cloudy”, and “dry”.
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Distribution of pressure within every SUMMARY category, one box each.
tick_style = dict(fontsize=12, color='darkblue')
label_style = dict(fontsize=18, color='red')

plt.figure(figsize=(12, 12))
sns.boxplot(x=df8['SUMMARY'], y=df8['PRESSURE (millibars)'])

# Category names are rotated 90 degrees so they do not overlap.
plt.xticks(rotation=90, **tick_style)
plt.yticks(**tick_style)

plt.title("Pressure vs SUMMARY", fontsize=16, color='red')
plt.xlabel("SUMMARY", **label_style)
plt.ylabel("PRESSURE (millibars)", **label_style)

plt.grid()

# Reserve the bottom half of the figure for the rotated category names.
plt.subplots_adjust(bottom=0.5)

plt.show()

 #### The types “over cast”, “foggy”, “Breezy and mostly cloudy”, “Breezy” have the highest range of pressure changes. And the “windy and foggy” has the lowest range of pressure changes. 
  -------------------------------------------------------------------------------------------------------------------------
 #### In the “Dry and mostly cloudy” type, the median line is close to the bottom of the box, which suggests that most of the data is concentrated at lower values, with a longer tail towards higher values.
 -------------------------------------------------------------------------------------------------------------------------

In [None]:
# Distribution of wind speed within every SUMMARY category, one box each.
tick_style = dict(fontsize=12, color='darkblue')
label_style = dict(fontsize=18, color='red')

plt.figure(figsize=(12, 12))
sns.boxplot(x=df8['SUMMARY'], y=df8['WIND SPEED (km/h)'])

# Category names are rotated 90 degrees so they do not overlap.
plt.xticks(rotation=90, **tick_style)
plt.yticks(**tick_style)

plt.title("Wind Speed vs SUMMARY", fontsize=16, color='red')
plt.xlabel("SUMMARY", **label_style)
plt.ylabel("WIND SPEED (km/h)", **label_style)

plt.grid()

# Reserve the bottom half of the figure for the rotated category names.
plt.subplots_adjust(bottom=0.5)

plt.show()

  #### Some summary types are drawn as a single line; these are the types that have only one data point. The types “Dry”, “Dry and mostly cloudy” and “over cast” have the highest range of wind speed changes.
  -------------------------------------------------------------------------------------------------------------------------

In [None]:
# 2 x 2 grid of boxplots: each panel shows how one numeric feature is
# distributed within every PRECIP TYPE category.
# Each entry is (df8 column, which is also the y-axis label, panel title),
# listed in row-major panel order.
panels = [
    ('TEMPERATURE (C)', 'PRECIP TYPE vs Temperature'),
    ('WIND SPEED (km/h)', 'PRECIP TYPE vs Wind Speed'),
    ('PRESSURE (millibars)', 'PRECIP TYPE vs Pressure'),
    ('HUMIDITY', 'PRECIP TYPE vs HUMIDITY'),
]

fig, ax = plt.subplots(figsize=(12, 10), nrows=2, ncols=2)

# ax.flatten() walks the grid row by row, matching the order of `panels`.
for i, a in enumerate(ax.flatten()):
    column, panel_title = panels[i]
    sns.boxplot(x=df8['PRECIP TYPE'], y=df8[column], ax=a, palette=["cyan", "seashell"])
    a.set_title(panel_title, fontsize=12, color='darkblue')
    a.grid(True)
    # Explicit labels replace the ones seaborn derives from the input names.
    a.set_ylabel(column, fontsize=12, color="Red")
    a.set_xlabel('PRECIP TYPE', fontsize=12, color="Red")

fig.suptitle("PRECIP TYPE Boxplots", color="red", fontsize=20, fontweight='bold')

# Extra spacing so panel titles and axis labels do not collide.
fig.subplots_adjust(hspace=0.5, wspace=0.3)

plt.show()

  #### **Temperature VS Precip Type:** as you can see, “Snow” has only occurred when the temperature is lower than 0 and “Rain” has only occurred when the temperature is higher than 0. The median temperature (the line inside the box) for “Rain” is close to 15 C and for “Snow” is close to -5 C. We can see some points with a higher temperature than the maximum of the rain boxplot; these are outliers. Similarly, there are some outliers with a temperature under -10 C. As shown in the figure, the “Snow” boxplot median line is close to the top of the box, so most of the data is concentrated at higher values, with a longer tail towards lower values.
 -------------------------------------------------------------------------------------------------------------------------
  #### **Wind Speed VS Precip Type:** as the plot shows, the median wind speed (the line inside the box) for “Rain” is close to 10 km/h and for “Snow” is close to 8 km/h. Again, we can see some outliers in this plot.
 -------------------------------------------------------------------------------------------------------------------------
 #### **Pressure VS Precip Type:** it can be concluded that the average pressure for “Rain” precipitation is in the range of 1010-1020 millibars, while for “Snow” it is in the range of 1020-1030 millibars. If you remember, in the pressure scatter plots there were some points with pressure lower than 985 millibars; in this plot, we can see these points as outliers.
 -------------------------------------------------------------------------------------------------------------------------
  #### **Humidity VS Precip Type:** mostly, the humidity for “Snow” and “Rain” is higher than 0.5. In general, “snow” occurs with higher humidity than “Rain”. As you can see, the “Snow” precipitation has more outliers for humidity than the “Rain” Precipitation. 
------------------------------------------------------------------------------------------------------------------------

## Heat map

In [None]:
# Pairwise correlation between the float64 columns of df6.
# NOTE(review): this cell reads df6 while the boxplot cells read df8 - confirm
# that df6 is the intended frame.
correlation_matrix = df6.select_dtypes(["float64"]).corr()
feature_names = correlation_matrix.columns

plt.figure(figsize=(10, 8))

# Every cell is annotated with its coefficient (two decimals); the colour bar is
# omitted because the numbers are already printed.
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 10},
    cmap="YlGnBu",
    linewidths=.5,
    cbar=False,
    xticklabels=feature_names,
    yticklabels=feature_names,
)

plt.title("Correlation Heatmap")

# Tilt the labels on both axes by 45 degrees and widen the bottom/left margins
# so the feature names fit.
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.subplots_adjust(bottom=0.3, left=0.3)

plt.show()


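  #### From the plot above, you can see the correlation between each pair of numerical features. The closer the number is to 1 or -1, the stronger the correlation (positive or negative, respectively), and the closer it is to 0, the weaker the correlation. Each feature has its highest correlation with itself, which is equal to one.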
  -------------------------------------------------------------------------------------------------------------------------