In [35]:
#@title Introduction
# This notebook explores weather data across different cities to identify patterns and trends.
# The dataset contains weather information including temperature, humidity, precipitation, and wind speed for various locations at different times.

In [36]:
#@title Data Loading and Initial Inspection
# The cells below download the weather dataset from Kaggle and load it into a pandas DataFrame.
# The 'Date_Time' column is converted to datetime objects to facilitate time-based analysis.
# The cells that follow show the initial structure and summary of the loaded data.

In [2]:
import kagglehub

# Fetch the weather dataset from Kaggle; the returned value is the local
# directory holding the downloaded files.
DATASET_HANDLE = "prasad22/weather-data"
path = kagglehub.dataset_download(DATASET_HANDLE)

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/prasad22/weather-data?dataset_version_number=1...


100%|██████████| 42.6M/42.6M [00:00<00:00, 104MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/prasad22/weather-data/versions/1


In [3]:
import pandas as pd
import os

# Locate the CSV inside the downloaded dataset directory.
# os.listdir returns entries in arbitrary order, so sort the matches to make
# the choice of file reproducible across runs and machines.
csv_files = sorted(f for f in os.listdir(path) if f.endswith('.csv'))

if not csv_files:
    # Every later cell needs `df`; stop here with a clear error instead of
    # letting downstream cells fail with a NameError.
    raise FileNotFoundError("No CSV files found in the downloaded dataset directory.")

# The dataset is expected to contain a single CSV; take the first by name.
csv_file_path = os.path.join(path, csv_files[0])
print("CSV file path:", csv_file_path)
df = pd.read_csv(csv_file_path)
# Convert timestamps once so later cells can use .dt accessors and date filters.
df['Date_Time'] = pd.to_datetime(df['Date_Time'])
print("DataFrame Head:")
display(df.head())

CSV file path: /root/.cache/kagglehub/datasets/prasad22/weather-data/versions/1/weather_data.csv
DataFrame Head:


Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh
0,San Diego,2024-01-14 21:12:46,10.683001,41.195754,4.020119,8.23354
1,San Diego,2024-05-17 15:22:10,8.73414,58.319107,9.111623,27.715161
2,San Diego,2024-05-11 09:30:59,11.632436,38.820175,4.607511,28.732951
3,Philadelphia,2024-02-26 17:32:39,-8.628976,54.074474,3.18372,26.367303
4,San Antonio,2024-04-29 13:23:51,39.808213,72.899908,9.598282,29.898622


In [37]:
#@title Data Inspection - Shape and Summary Statistics


In [4]:
# Size of the loaded frame as (rows, columns).
df.shape

(1000000, 6)

In [5]:
# Summary statistics (count, mean, min, quartiles, max, std) for the frame's columns.
df.describe()

Unnamed: 0,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh
count,1000000,1000000.0,1000000.0,1000000.0,1000000.0
mean,2024-03-10 10:40:58.896321792,14.779705,60.02183,5.109639,14.997598
min,2024-01-01 00:00:06,-19.969311,30.000009,9e-06,5.1e-05
25%,2024-02-04 16:28:23.750000128,2.269631,45.0085,2.580694,7.490101
50%,2024-03-10 11:43:28,14.778002,60.018708,5.109917,14.993777
75%,2024-04-14 03:51:32.500000,27.270489,75.043818,7.61375,22.51411
max,2024-05-18 19:44:10,39.999801,89.999977,14.971583,29.999973
std,,14.482558,17.324022,2.947997,8.663556


In [6]:
# Number of non-missing values in each column.
df.count()

Unnamed: 0,0
Location,1000000
Date_Time,1000000
Temperature_C,1000000
Humidity_pct,1000000
Precipitation_mm,1000000
Wind_Speed_kmh,1000000


In [7]:
# Per-city count of non-missing values in each of the other columns.
newdf = df.groupby('Location').count()

In [8]:
# (number of locations, number of counted columns).
newdf.shape

(10, 5)

In [9]:
# Show the per-location counts.
newdf

Unnamed: 0_level_0,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Chicago,100164,100164,100164,100164,100164
Dallas,99936,99936,99936,99936,99936
Houston,100076,100076,100076,100076,100076
Los Angeles,99922,99922,99922,99922,99922
New York,99972,99972,99972,99972,99972
Philadelphia,100122,100122,100122,100122,100122
Phoenix,100209,100209,100209,100209,100209
San Antonio,99962,99962,99962,99962,99962
San Diego,99774,99774,99774,99774,99774
San Jose,99863,99863,99863,99863,99863


In [10]:
# Order rows by city, then from coldest to warmest within each city.
df_sorted = df.sort_values(['Location', 'Temperature_C'], ascending=True)
display(df_sorted.head())

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh
371489,Chicago,2024-04-17 16:57:13,-9.999959,71.924975,1.964295,4.238524
696761,Chicago,2024-05-11 00:08:40,-9.999896,50.206267,8.088351,11.47018
341394,Chicago,2024-04-30 23:21:21,-9.998975,74.422793,3.429914,23.09226
567026,Chicago,2024-05-12 18:35:34,-9.998923,63.206223,0.086821,16.161789
278463,Chicago,2024-02-28 01:10:39,-9.998746,62.809237,6.699151,15.970875


In [11]:
# Chicago readings, then the same rows in chronological order.
Chicago_df = df.loc[df['Location'].eq('Chicago')]
df_sorted_Chikago = Chicago_df.sort_values('Date_Time')
display(df_sorted_Chikago.head())

# Dallas readings, kept in their original row order.
Dallas_df = df.loc[df['Location'].eq('Dallas')]
display(Dallas_df.head())

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh
996349,Chicago,2024-01-01 00:03:25,5.222404,46.395687,6.984159,1.219788
898802,Chicago,2024-01-01 00:07:14,-6.403267,42.400326,0.451659,0.979239
484085,Chicago,2024-01-01 00:09:36,8.227097,58.197003,9.686955,10.769515
989809,Chicago,2024-01-01 00:10:35,-2.955866,86.983992,0.783134,20.737256
576663,Chicago,2024-01-01 00:10:59,28.382266,35.75563,0.064871,3.50288


Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh
12,Dallas,2024-02-27 21:07:10,32.016898,53.194371,3.552671,3.050196
31,Dallas,2024-02-22 14:16:18,12.865779,44.725912,4.809865,5.56755
35,Dallas,2024-03-22 02:34:26,17.439495,56.210161,9.728971,9.497027
40,Dallas,2024-04-03 22:07:55,6.514284,42.006015,1.405197,18.385767
60,Dallas,2024-05-06 17:55:33,-5.378187,84.479654,7.589245,28.508273


In [12]:
# Readings with a temperature strictly above 30 °C.
is_hot = df['Temperature_C'].gt(30)
hot_days = df.loc[is_hot]
hot_days

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh
4,San Antonio,2024-04-29 13:23:51,39.808213,72.899908,9.598282,29.898622
10,New York,2024-03-04 13:47:15,35.145559,54.752866,8.349195,25.430310
12,Dallas,2024-02-27 21:07:10,32.016898,53.194371,3.552671,3.050196
13,Houston,2024-05-09 00:53:10,38.641269,85.952726,0.470782,20.779264
14,Houston,2024-05-12 15:57:55,39.666772,72.747026,1.263722,6.479492
...,...,...,...,...,...,...
999971,San Jose,2024-02-07 18:39:55,38.428298,42.200630,2.999543,10.208343
999972,San Antonio,2024-05-02 05:32:10,39.206090,66.627466,2.939854,1.334344
999980,Los Angeles,2024-03-27 12:25:39,34.641340,78.108101,8.131816,19.607961
999985,San Diego,2024-03-27 13:29:35,38.202771,83.619146,8.083618,7.834117


In [13]:
# Hot readings ordered by city, then by temperature (displayed only, not stored).
hot_days.sort_values(['Location', 'Temperature_C'])

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh
498616,Chicago,2024-02-13 23:34:10,30.000617,62.650852,6.633886,22.137326
932544,Chicago,2024-04-19 10:02:39,30.001282,81.875026,6.094817,27.967705
944311,Chicago,2024-03-30 00:33:23,30.001718,38.379020,9.709881,12.501967
170152,Chicago,2024-02-21 23:12:21,30.003339,82.866993,5.120783,2.984287
395023,Chicago,2024-01-04 03:31:51,30.005278,83.049402,3.489842,29.407188
...,...,...,...,...,...,...
852372,San Jose,2024-01-12 14:34:26,39.996586,53.617733,5.457450,2.326957
440085,San Jose,2024-02-29 03:44:50,39.997152,84.451862,8.940124,26.055338
387098,San Jose,2024-04-08 14:47:56,39.997461,47.011099,2.567605,5.178690
45079,San Jose,2024-02-17 10:32:21,39.997461,56.800032,0.505715,26.864884


In [14]:
# Mean temperature per city, warmest first.
# sort_values returns a new frame, so the whole pipeline is one expression
# rather than a separate inplace=True mutation of an already-bound name.
avg_temp_by_location = (
    df_sorted.groupby('Location')['Temperature_C']
    .mean()
    .reset_index()
    .sort_values(by='Temperature_C', ascending=False)
)
display(avg_temp_by_location)

Unnamed: 0,Location,Temperature_C
3,Los Angeles,15.081063
5,Philadelphia,15.035892
7,San Antonio,15.027761
4,New York,15.013389
0,Chicago,15.009208
1,Dallas,15.009097
9,San Jose,14.956108
2,Houston,14.942985
8,San Diego,14.933539
6,Phoenix,12.792479


In [15]:
import plotly.express as px

# Bar chart of the per-city mean temperatures held in avg_temp_by_location.
fig = px.bar(
    avg_temp_by_location,
    x='Location',
    y='Temperature_C',
    title='Average Temperature by Location',
).update_layout(
    xaxis_title='Location',
    yaxis_title='Average Temperature (°C)',
)

fig.show()

In [16]:
# One row per (city, month-end bucket) holding the mean temperature and mean humidity.
city_month_groups = df.groupby(['Location', pd.Grouper(key='Date_Time', freq='ME')])
monthly_avg_location = (
    city_month_groups[['Temperature_C', 'Humidity_pct']]
    .mean()
    .reset_index()
)

display(monthly_avg_location.head())

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct
0,Chicago,2024-01-31,14.85855,59.985599
1,Chicago,2024-02-29,15.115803,60.000772
2,Chicago,2024-03-31,15.076532,60.209846
3,Chicago,2024-04-30,15.08725,60.032718
4,Chicago,2024-05-31,14.852429,60.290724


In [17]:
# (rows, columns) of the monthly aggregate.
monthly_avg_location.shape

(50, 4)

In [18]:
# Summary statistics of the monthly aggregate.
monthly_avg_location.describe()

Unnamed: 0,Date_Time,Temperature_C,Humidity_pct
count,50,50.0,50.0
mean,2024-03-31 00:00:00,14.801895,60.023027
min,2024-01-31 00:00:00,9.990732,59.8035
25%,2024-02-29 00:00:00,14.926788,59.931711
50%,2024-03-31 00:00:00,14.987152,60.00125
75%,2024-04-30 00:00:00,15.059033,60.106264
max,2024-05-31 00:00:00,15.321841,60.290724
std,,0.996369,0.128224


In [19]:
# View ordered by month, then city; the sorted frame is displayed, not stored.
monthly_avg_location.sort_values(by=['Date_Time', 'Location'])

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct
0,Chicago,2024-01-31,14.85855,59.985599
5,Dallas,2024-01-31,14.937326,59.931464
10,Houston,2024-01-31,14.890978,60.171391
15,Los Angeles,2024-01-31,15.045351,60.029803
20,New York,2024-01-31,14.91393,60.095414
25,Philadelphia,2024-01-31,15.002061,59.955053
30,Phoenix,2024-01-31,9.99815,59.999574
35,San Antonio,2024-01-31,14.970648,59.838812
40,San Diego,2024-01-31,14.999351,60.202882
45,San Jose,2024-01-31,14.923275,59.973502


In [20]:
import plotly.express as px

# One line per city tracing its monthly mean temperature.
fig = px.line(
    monthly_avg_location,
    x='Date_Time',
    y='Temperature_C',
    color='Location',
    title='Monthly Average Temperature by Location Over Time',
).update_layout(
    xaxis_title='Date',
    yaxis_title='Average Temperature',
)
fig.show()

In [21]:
# Summary statistics of the monthly mean temperatures, pooled over all cities.
temp_stats = monthly_avg_location['Temperature_C'].describe()
print(temp_stats)

count    50.000000
mean     14.801895
std       0.996369
min       9.990732
25%      14.926788
50%      14.987152
75%      15.059033
max      15.321841
Name: Temperature_C, dtype: float64


In [22]:
# Flag monthly means outside the 1.5 × IQR fences, computed over the pooled
# monthly mean temperatures of all cities.
monthly_temps = monthly_avg_location['Temperature_C']
Q1, Q3 = (monthly_temps.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# A row is an outlier when it falls strictly below the lower or above the upper fence.
is_outlier = monthly_temps.lt(lower_bound) | monthly_temps.gt(upper_bound)
temperature_outliers = monthly_avg_location[is_outlier]
display(temperature_outliers)

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct
19,Los Angeles,2024-05-31,15.321841,59.865373
30,Phoenix,2024-01-31,9.99815,59.999574
31,Phoenix,2024-02-29,9.990732,59.994


In [23]:
# Distinct cities among the outlier rows, in order of first appearance.
outlier_cities = temperature_outliers['Location'].drop_duplicates().tolist()
print("Outlier cities based on monthly average temperature:", outlier_cities)

Outlier cities based on monthly average temperature: ['Los Angeles', 'Phoenix']


In [24]:
import plotly.express as px

# Box plot of the monthly mean temperatures, one box per city.
fig = px.box(
    monthly_avg_location,
    x='Location',
    y='Temperature_C',
    title='Distribution of Monthly Average Temperature by Location with Outliers',
).update_layout(
    xaxis_title='Location',
    yaxis_title='Average Temperature (°C)',
)
fig.show()

In [25]:
# Readings that are both hot (> 30 °C) and humid (> 70 % humidity).
is_hot = df['Temperature_C'] > 30
is_humid = df['Humidity_pct'] > 70
dfffff = df[is_hot & is_humid]
dfffff

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh
4,San Antonio,2024-04-29 13:23:51,39.808213,72.899908,9.598282,29.898622
13,Houston,2024-05-09 00:53:10,38.641269,85.952726,0.470782,20.779264
14,Houston,2024-05-12 15:57:55,39.666772,72.747026,1.263722,6.479492
22,San Antonio,2024-02-14 04:43:05,30.739684,85.603779,9.250559,24.375952
44,San Diego,2024-03-03 11:02:14,35.666565,74.060956,1.328726,24.161295
...,...,...,...,...,...,...
999920,Los Angeles,2024-05-11 15:37:43,33.572328,89.874507,0.664834,18.979422
999946,Dallas,2024-04-21 22:50:56,31.874522,83.823196,6.731808,23.113868
999954,Chicago,2024-01-22 01:18:42,31.870160,71.849950,0.842965,22.321245
999980,Los Angeles,2024-03-27 12:25:39,34.641340,78.108101,8.131816,19.607961


In [26]:
# Add a new column: temperature in Fahrenheit.
# Plain Series arithmetic is vectorized, so no per-element lambda is needed;
# the operations run in the same order as before (x * 9 / 5, then + 32).
df['temp_F'] = df['Temperature_C'] * 9 / 5 + 32

In [27]:
# Check the first rows, now including temp_F.
df.head()

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh,temp_F
0,San Diego,2024-01-14 21:12:46,10.683001,41.195754,4.020119,8.23354,51.229402
1,San Diego,2024-05-17 15:22:10,8.73414,58.319107,9.111623,27.715161,47.721452
2,San Diego,2024-05-11 09:30:59,11.632436,38.820175,4.607511,28.732951,52.938385
3,Philadelphia,2024-02-26 17:32:39,-8.628976,54.074474,3.18372,26.367303,16.467843
4,San Antonio,2024-04-29 13:23:51,39.808213,72.899908,9.598282,29.898622,103.654783


In [28]:
# Split the timestamp into calendar parts for later grouping and filtering.
date_parts = df['Date_Time'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['day'] = date_parts.day
df['weekday'] = date_parts.day_name()

In [29]:
# Check the first rows, now including year, month, day and weekday.
df.head()

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh,temp_F,year,month,day,weekday
0,San Diego,2024-01-14 21:12:46,10.683001,41.195754,4.020119,8.23354,51.229402,2024,1,14,Sunday
1,San Diego,2024-05-17 15:22:10,8.73414,58.319107,9.111623,27.715161,47.721452,2024,5,17,Friday
2,San Diego,2024-05-11 09:30:59,11.632436,38.820175,4.607511,28.732951,52.938385,2024,5,11,Saturday
3,Philadelphia,2024-02-26 17:32:39,-8.628976,54.074474,3.18372,26.367303,16.467843,2024,2,26,Monday
4,San Antonio,2024-04-29 13:23:51,39.808213,72.899908,9.598282,29.898622,103.654783,2024,4,29,Monday


In [30]:
# Humidity-adjusted temperature used as a simple comfort score.
def comfort_index(row):
    """Return the comfort index for one record.

    The temperature is pulled toward 14.5 by a fraction that shrinks as
    humidity rises: ``T - (0.55 - 0.0055 * H) * (T - 14.5)``.

    Args:
        row: Mapping-like object exposing ``'Temperature_C'`` and
            ``'Humidity_pct'`` (for example a DataFrame row or a dict).

    Returns:
        The computed index, in the same numeric type the inputs produce.
    """
    temperature = row['Temperature_C']
    humidity = row['Humidity_pct']
    dryness_weight = 0.55 - 0.0055 * humidity
    return temperature - dryness_weight * (temperature - 14.5)

# comfort_index uses only arithmetic on the two columns it reads, so pass it
# the whole frame: one vectorized evaluation replaces a Python-level call for
# every row and yields the same values.
df['comfort_index'] = comfort_index(df)

In [31]:
# Display the frame, now including comfort_index.
df

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh,temp_F,year,month,day,weekday,comfort_index
0,San Diego,2024-01-14 21:12:46,10.683001,41.195754,4.020119,8.233540,51.229402,2024,1,14,Sunday,11.917508
1,San Diego,2024-05-17 15:22:10,8.734140,58.319107,9.111623,27.715161,47.721452,2024,5,17,Friday,10.055934
2,San Diego,2024-05-11 09:30:59,11.632436,38.820175,4.607511,28.732951,52.938385,2024,5,11,Saturday,12.597340
3,Philadelphia,2024-02-26 17:32:39,-8.628976,54.074474,3.183720,26.367303,16.467843,2024,2,26,Monday,-2.786819
4,San Antonio,2024-04-29 13:23:51,39.808213,72.899908,9.598282,29.898622,103.654783,2024,4,29,Monday,36.036011
...,...,...,...,...,...,...,...,...,...,...,...,...
999995,Dallas,2024-01-01 20:29:48,23.416877,37.705024,3.819833,16.538119,74.150379,2024,1,1,Monday,20.361756
999996,San Antonio,2024-01-20 15:59:48,6.759080,40.731036,8.182785,29.005558,44.166344,2024,1,20,Saturday,9.282460
999997,New York,2024-04-14 08:30:09,15.664465,62.201884,3.987558,0.403909,60.196037,2024,4,14,Sunday,15.422385
999998,Chicago,2024-05-12 20:10:43,18.999994,63.703245,4.294325,6.326036,66.199989,2024,5,12,Sunday,18.101651


In [32]:
# Rows whose timestamp lies in the closed interval ['2024-02-01', '2024-03-03'].
# NOTE(review): a bare date is compared as midnight, so readings later on
# 3 March are excluded; confirm whether the end day was meant to be included.
in_window = df['Date_Time'].between('2024-02-01', '2024-03-03')
subset = df.loc[in_window]
subset

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh,temp_F,year,month,day,weekday,comfort_index
3,Philadelphia,2024-02-26 17:32:39,-8.628976,54.074474,3.183720,26.367303,16.467843,2024,2,26,Monday,-2.786819
12,Dallas,2024-02-27 21:07:10,32.016898,53.194371,3.552671,3.050196,89.630416,2024,2,27,Tuesday,27.507506
16,San Antonio,2024-02-10 15:05:28,16.349790,65.812607,0.109090,6.597039,61.429621,2024,2,10,Saturday,16.001972
21,New York,2024-02-19 12:26:07,-7.383811,54.089973,1.905731,6.637064,18.709140,2024,2,19,Monday,-1.858036
22,San Antonio,2024-02-14 04:43:05,30.739684,85.603779,9.250559,24.375952,87.331432,2024,2,14,Wednesday,29.453839
...,...,...,...,...,...,...,...,...,...,...,...,...
999983,Chicago,2024-03-02 07:20:55,-6.500893,49.713982,3.050012,7.596484,20.298393,2024,3,2,Saturday,-0.692611
999988,Dallas,2024-02-20 02:47:16,8.533033,55.741400,5.099943,0.162536,47.359459,2024,2,20,Tuesday,9.985526
999991,New York,2024-02-14 03:55:10,4.210758,45.683075,2.053384,22.351735,39.579365,2024,2,14,Wednesday,7.284598
999993,Los Angeles,2024-02-15 00:12:56,33.255060,57.785182,1.402973,28.194420,91.859108,2024,2,15,Thursday,28.900482


In [33]:
# Mean humidity per city, drawn as pie slices.
# NOTE(review): slices show each city's mean as a share of the sum of means;
# a bar chart may make the cities easier to compare.
avg_humidity_by_location = df.groupby('Location', as_index=False)['Humidity_pct'].mean()

fig = px.pie(
    avg_humidity_by_location,
    names='Location',
    values='Humidity_pct',
    title='Average Humidity by Location',
)
fig.show()

In [34]:
# Mean wind speed per city, drawn as pie slices.
# NOTE(review): slices show each city's mean as a share of the sum of means;
# a bar chart may make the cities easier to compare.
avg_windspeed_by_location = df.groupby('Location', as_index=False)['Wind_Speed_kmh'].mean()

fig = px.pie(
    avg_windspeed_by_location,
    names='Location',
    values='Wind_Speed_kmh',
    title='Average Wind Speed by Location',
)
fig.show()