In [2]:
# Dependencies and Setup
import time
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

In [3]:
# Read the hourly weather observations from the CSV file.
# The path is handed straight to pandas, which opens and closes the file itself
# and decodes it as UTF-8 by default instead of using the platform locale.
path = "weather_resources/weatherData2014-2015.csv"
raw_data = pd.read_csv(path)

raw_data.head()

Unnamed: 0,dt,dt_iso,time,zero,type,lon,temp,visibility,dew_point,feels_like,...,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1388534400,1/1/2014,0:00:00,0,UTC,-74.005973,32.23,10000.0,16.66,20.43,...,32.43,1022,48,19.55,295,0,800,Clear,sky is clear,01n
1,1388538000,1/1/2014,1:00:00,0,UTC,-74.005973,30.78,10000.0,16.16,18.18,...,31.26,1023,50,23.0,285,0,800,Clear,sky is clear,01n
2,1388541600,1/1/2014,2:00:00,0,UTC,-74.005973,30.27,10000.0,14.85,18.57,...,30.94,1024,48,17.27,250,0,800,Clear,sky is clear,01n
3,1388545200,1/1/2014,3:00:00,0,UTC,-74.005973,28.78,10000.0,12.6,16.18,...,29.95,1025,46,20.69,285,0,800,Clear,sky is clear,01n
4,1388548800,1/1/2014,4:00:00,0,UTC,-74.005973,27.64,10000.0,10.63,15.04,...,28.94,1025,44,19.55,305,0,800,Clear,sky is clear,01n


# Data Cleaning

In [4]:
# Data Cleaning

# Columns removed from the frame before any further processing.
columns_to_drop = [
    "dt",
    "zero",
    "type",
    "lon",
    "visibility",
    "dew_point",
    "pressure",
    "wind_deg",
    "weather_main",
    "weather_icon",
    "weather_id",
]

# Old column name -> new column name.
column_renames = {
    "dt_iso": "Date",
    "time": "time_HMS",
    "clouds_all": "Cloudiness",
}

# Build the cleaned frame in one non-mutating chain:
#   1. drop the unwanted columns,
#   2. parse 'dt_iso' into datetime values (the 'time' column is left as text),
#   3. apply the new column names.
# NOTE: raw_data is rebound to the cleaned frame, so this cell must run exactly
# once after the load cell; a second run raises KeyError because the dropped
# columns are already gone.
raw_data = (
    raw_data
    .drop(columns=columns_to_drop)
    .assign(dt_iso=lambda frame: pd.to_datetime(frame["dt_iso"]))
    .rename(columns=column_renames)
)

raw_data.info()
raw_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18511 entries, 0 to 18510
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Date                 18511 non-null  datetime64[ns]
 1   time_HMS             18511 non-null  object        
 2   temp                 18511 non-null  float64       
 3   feels_like           18511 non-null  float64       
 4   temp_min             18511 non-null  float64       
 5   temp_max             18511 non-null  float64       
 6   humidity             18511 non-null  int64         
 7   wind_speed           18511 non-null  float64       
 8   Cloudiness           18511 non-null  int64         
 9   weather_description  18511 non-null  object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 1.4+ MB


Unnamed: 0,Date,time_HMS,temp,feels_like,temp_min,temp_max,humidity,wind_speed,Cloudiness,weather_description
0,2014-01-01,0:00:00,32.23,20.43,30.16,32.43,48,19.55,0,sky is clear
1,2014-01-01,1:00:00,30.78,18.18,30.15,31.26,50,23.0,0,sky is clear
2,2014-01-01,2:00:00,30.27,18.57,28.36,30.94,48,17.27,0,sky is clear
3,2014-01-01,3:00:00,28.78,16.18,26.56,29.95,46,20.69,0,sky is clear
4,2014-01-01,4:00:00,27.64,15.04,26.24,28.94,44,19.55,0,sky is clear


In [5]:
# Remove duplicate rows; the first occurrence of each repeated row is kept
non_dupe_data = raw_data.drop_duplicates(keep="first")

In [6]:
# Handling Missing Values
# Drop every row that contains at least one missing value. No column-wise pass
# is needed afterwards: once those rows are gone, no column can still hold a
# missing value. The de-duplicated frame itself is left untouched.
non_na_data = non_dupe_data.dropna(axis=0, how="any")

In [7]:
# Bind the cleaned frame to its final name (same object as non_na_data, not a copy)
weather_data = non_na_data
weather_data

Unnamed: 0,Date,time_HMS,temp,feels_like,temp_min,temp_max,humidity,wind_speed,Cloudiness,weather_description
0,2014-01-01,0:00:00,32.23,20.43,30.16,32.43,48,19.55,0,sky is clear
1,2014-01-01,1:00:00,30.78,18.18,30.15,31.26,50,23.00,0,sky is clear
2,2014-01-01,2:00:00,30.27,18.57,28.36,30.94,48,17.27,0,sky is clear
3,2014-01-01,3:00:00,28.78,16.18,26.56,29.95,46,20.69,0,sky is clear
4,2014-01-01,4:00:00,27.64,15.04,26.24,28.94,44,19.55,0,sky is clear
...,...,...,...,...,...,...,...,...,...,...
18506,2015-12-31,19:00:00,47.62,42.62,47.01,48.34,59,11.41,100,overcast clouds
18507,2015-12-31,20:00:00,47.79,41.81,47.19,48.45,57,14.99,100,overcast clouds
18508,2015-12-31,21:00:00,47.61,41.58,47.26,48.11,59,14.99,100,overcast clouds
18509,2015-12-31,22:00:00,46.98,42.19,46.47,47.66,55,10.29,100,overcast clouds



# Weather Summary

In [8]:
# Separate frame without the 'time_HMS' column; weather_data itself is left unchanged
weather_data_c = weather_data.drop(columns=["time_HMS"])
weather_data_c.columns

Index(['Date', 'temp', 'feels_like', 'temp_min', 'temp_max', 'humidity',
       'wind_speed', 'Cloudiness', 'weather_description'],
      dtype='object')

In [10]:
def mode(series):
    """Return the most frequent value of a Series.

    Counts come from ``Series.value_counts()``, which skips missing values by
    default and lists values from most to least frequent, so the first index
    label is the most common value.

    Parameters
    ----------
    series : pandas.Series
        Values to summarize.

    Returns
    -------
    object or None
        The most frequent non-missing value, or ``None`` when the Series is
        empty or holds only missing values.
    """
    counts = series.value_counts()
    # Without this guard an empty result would raise IndexError on index[0].
    if counts.empty:
        return None
    return counts.index[0]

# Column names used in the daily summary ('Date' maps to itself).
new_column_names = {
    "Date": "Date",
    "temp": "Mean_Temp",
    "feels_like": "Mean_Feels_Like",
    "temp_min": "Min_Temp",
    "temp_max": "Max_Temp",
    "humidity": "Mean_Humidity",
    "wind_speed": "Mean_Wind_Speed",
    "Cloudiness": "Mean_Cloudiness",
    "weather_description": "Weather_Description",
}

# Reducer applied to each column within a date group: means for most readings,
# the minimum of temp_min, the maximum of temp_max, and the most frequent
# weather description.
daily_aggregations = {
    "temp": "mean",
    "feels_like": "mean",
    "temp_min": "min",
    "temp_max": "max",
    "humidity": "mean",
    "wind_speed": "mean",
    "Cloudiness": "mean",
    "weather_description": mode,
}

weather_summary = (
    weather_data_c
    .groupby("Date")
    .agg(daily_aggregations)
    .reset_index()
    .rename(columns=new_column_names)
)

# weather_summary now holds one summarized row per date
weather_summary

Unnamed: 0,Date,Mean_Temp,Mean_Feels_Like,Min_Temp,Max_Temp,Mean_Humidity,Mean_Wind_Speed,Mean_Cloudiness,Weather_Description
0,2014-01-01,28.941667,19.074167,22.96,34.95,48.958333,12.807917,13.125000,sky is clear
1,2014-01-02,29.206538,18.154615,22.96,34.14,71.576923,16.800000,93.846154,overcast clouds
2,2014-01-03,16.499286,3.899286,10.04,26.55,75.642857,20.561786,75.714286,snow
3,2014-01-04,15.930417,5.985417,6.76,28.11,50.083333,8.940000,3.125000,sky is clear
4,2014-01-05,30.174138,22.742759,18.95,37.11,76.068966,8.063793,59.310345,sky is clear
...,...,...,...,...,...,...,...,...,...
725,2015-12-27,52.473333,51.013333,46.02,62.58,84.000000,7.580370,100.000000,mist
726,2015-12-28,43.968750,37.787500,39.29,61.29,56.500000,13.561250,31.666667,sky is clear
727,2015-12-29,42.106571,34.629714,33.96,49.96,82.914286,15.026571,100.000000,mist
728,2015-12-30,43.105417,39.732083,39.45,48.65,85.166667,5.863750,83.333333,mist


# Output results

In [11]:
# Write both frames to CSV under Data/. The folder is created first so the
# export also works when it does not exist yet.
output_dir = Path("Data")
output_dir.mkdir(parents=True, exist_ok=True)

weather_data.to_csv(output_dir / "weather_data_2014-2015.csv", index=False)

weather_summary.to_csv(output_dir / "weather_daily_summary_2014-2015.csv", index=False)