In [2]:
import pandas as pd, numpy as np
from pathlib import Path
import fsspec

S3_STATIONS_TXT   = "s3://noaa-ghcn-pds/ghcnd-stations.txt"
S3_INVENTORY_TXT  = "s3://noaa-ghcn-pds/ghcnd-inventory.txt"
S3_BY_STATION     = "s3://noaa-ghcn-pds/csv/by_station/{id}.csv"
STOR = {"anon": True}

OUTDIR = Path('../data'); OUTDIR.mkdir(parents=True, exist_ok=True)
OUT_PARQUET = OUTDIR / 'ghcn_il_top4_daily.parquet'
OUT_CSV = OUTDIR / 'ghcn_il_top4_daily.csv'
print('Output:', OUT_PARQUET.resolve())

Output: /home/ek33/ATMS523/ATMS-523-Module-3-pandas-datetime-climate/data/ghcn_il_top4_daily.parquet


In [2]:
colspecs = [(0,11),(12,20),(21,30),(31,37),(38,40),(41,71),(72,75),(76,79),(80,85)]
names = ['ID','LATITUDE','LONGITUDE','ELEVATION','STATE','NAME','GSN_FLAG','HCN_CRN_FLAG','WMO_ID']

stations = pd.read_fwf(S3_STATIONS_TXT, colspecs=colspecs, names=names, dtype={'ID':str,'STATE':str,'WMO_ID':str}, storage_options=STOR)
stations['NAME'] = stations['NAME'].str.strip(); stations['STATE'] = stations['STATE'].fillna('').str.strip()

inventory = pd.read_csv(
    S3_INVENTORY_TXT, sep=r'\s+', names=['ID','LAT','LON','ELEMENT','FIRSTYEAR','LASTYEAR'],
    dtype={'ID':str,'ELEMENT':str,'FIRSTYEAR':int,'LASTYEAR':int}, engine='python', storage_options=STOR
)

stations.head(), inventory.head()

(            ID  LATITUDE  LONGITUDE  ELEVATION STATE                   NAME  \
 0  ACW00011604   17.1167   -61.7833       10.1        ST JOHNS COOLIDGE FLD   
 1  ACW00011647   17.1333   -61.7833       19.2                     ST JOHNS   
 2  AE000041196   25.3330    55.5170       34.0          SHARJAH INTER. AIRP   
 3  AEM00041194   25.2550    55.3640       10.4                   DUBAI INTL   
 4  AEM00041217   24.4330    54.6510       26.8               ABU DHABI INTL   
 
   GSN_FLAG HCN_CRN_FLAG WMO_ID  
 0      NaN          NaN    NaN  
 1      NaN          NaN    NaN  
 2      GSN          NaN  41196  
 3      NaN          NaN  41194  
 4      NaN          NaN  41217  ,
             ID      LAT      LON ELEMENT  FIRSTYEAR  LASTYEAR
 0  ACW00011604  17.1167 -61.7833    TMAX       1949      1949
 1  ACW00011604  17.1167 -61.7833    TMIN       1949      1949
 2  ACW00011604  17.1167 -61.7833    PRCP       1949      1949
 3  ACW00011604  17.1167 -61.7833    SNOW       1949      194

In [3]:
df = pd.read_csv(
...      "s3://noaa-ghcn-pds/csv/by_station/USC00087205.csv",
...      storage_options={"anon": True},  # passed to `s3fs.S3FileSystem`
         dtype={'Q_FLAG': 'object', 'M_FLAG': 'object'},
         parse_dates=['DATE']
... ).set_index('DATE')

  df = pd.read_csv(


In [4]:
df

Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1892-09-01,USC00087205,TMAX,322,,,6,
1892-09-02,USC00087205,TMAX,317,,,6,
1892-09-03,USC00087205,TMAX,317,,,6,
1892-09-04,USC00087205,TMAX,322,,,6,
1892-09-05,USC00087205,TMAX,333,,,6,
...,...,...,...,...,...,...,...
2025-12-05,USC00087205,PRCP,0,,,H,1600.0
2025-12-06,USC00087205,PRCP,0,,,H,1600.0
2025-12-07,USC00087205,PRCP,185,,,H,1600.0
2025-12-08,USC00087205,PRCP,406,,,H,1600.0


In [5]:
df_t = df.copy()

In [6]:
#make date the index
df_t.index = pd.to_datetime(df_t.index)

In [7]:
df_t

Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1892-09-01,USC00087205,TMAX,322,,,6,
1892-09-02,USC00087205,TMAX,317,,,6,
1892-09-03,USC00087205,TMAX,317,,,6,
1892-09-04,USC00087205,TMAX,322,,,6,
1892-09-05,USC00087205,TMAX,333,,,6,
...,...,...,...,...,...,...,...
2025-12-05,USC00087205,PRCP,0,,,H,1600.0
2025-12-06,USC00087205,PRCP,0,,,H,1600.0
2025-12-07,USC00087205,PRCP,185,,,H,1600.0
2025-12-08,USC00087205,PRCP,406,,,H,1600.0


In [8]:
df_recent = df_t.sort_index()
df_recent = df_recent.loc["1990-01-01":"2020-02-01"]
df_recent

Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-01-01,USC00087205,TMAX,256,,,0,1800.0
1990-01-01,USC00087205,PRCP,0,,,0,1800.0
1990-01-01,USC00087205,SNWD,0,,,0,
1990-01-01,USC00087205,TOBS,167,,,0,1800.0
1990-01-01,USC00087205,SNOW,0,P,,0,
...,...,...,...,...,...,...,...
2020-01-31,USC00087205,PRCP,0,,,7,1600.0
2020-02-01,USC00087205,TMIN,156,,,7,1600.0
2020-02-01,USC00087205,PRCP,61,,,7,1600.0
2020-02-01,USC00087205,TOBS,183,,,7,1600.0


In [9]:
data = df_recent[df_recent['ELEMENT'] == 'TMIN']
data

Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-01-01,USC00087205,TMIN,144,,,0,1800.0
1990-01-02,USC00087205,TMIN,61,,,0,1800.0
1990-01-03,USC00087205,TMIN,106,,,0,1800.0
1990-01-04,USC00087205,TMIN,128,,,0,1800.0
1990-01-05,USC00087205,TMIN,189,,,0,1800.0
...,...,...,...,...,...,...,...
2020-01-28,USC00087205,TMIN,117,,,7,1600.0
2020-01-29,USC00087205,TMIN,83,,,7,1600.0
2020-01-30,USC00087205,TMIN,106,,,7,1600.0
2020-01-31,USC00087205,TMIN,133,,,7,1600.0


In [10]:
data['DATA_VALUE'] = data['DATA_VALUE']/10
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['DATA_VALUE'] = data['DATA_VALUE']/10


Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-01-01,USC00087205,TMIN,14.4,,,0,1800.0
1990-01-02,USC00087205,TMIN,6.1,,,0,1800.0
1990-01-03,USC00087205,TMIN,10.6,,,0,1800.0
1990-01-04,USC00087205,TMIN,12.8,,,0,1800.0
1990-01-05,USC00087205,TMIN,18.9,,,0,1800.0
...,...,...,...,...,...,...,...
2020-01-28,USC00087205,TMIN,11.7,,,7,1600.0
2020-01-29,USC00087205,TMIN,8.3,,,7,1600.0
2020-01-30,USC00087205,TMIN,10.6,,,7,1600.0
2020-01-31,USC00087205,TMIN,13.3,,,7,1600.0


In [11]:
months = [1,10,11,12]

filtered_data = data[
    (data.index.month.isin(months))]

In [12]:
filtered_data

Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-01-01,USC00087205,TMIN,14.4,,,0,1800.0
1990-01-02,USC00087205,TMIN,6.1,,,0,1800.0
1990-01-03,USC00087205,TMIN,10.6,,,0,1800.0
1990-01-04,USC00087205,TMIN,12.8,,,0,1800.0
1990-01-05,USC00087205,TMIN,18.9,,,0,1800.0
...,...,...,...,...,...,...,...
2020-01-27,USC00087205,TMIN,12.2,,,7,1600.0
2020-01-28,USC00087205,TMIN,11.7,,,7,1600.0
2020-01-29,USC00087205,TMIN,8.3,,,7,1600.0
2020-01-30,USC00087205,TMIN,10.6,,,7,1600.0


In [13]:
# Convert to fahrenheit
filtered_data['TempF'] = filtered_data['DATA_VALUE']*1.8+32
filtered_data['TempF']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['TempF'] = filtered_data['DATA_VALUE']*1.8+32


DATE
1990-01-01    57.92
1990-01-02    42.98
1990-01-03    51.08
1990-01-04    55.04
1990-01-05    66.02
              ...  
2020-01-27    53.96
2020-01-28    53.06
2020-01-29    46.94
2020-01-30    51.08
2020-01-31    55.94
Name: TempF, Length: 3648, dtype: float64

In [14]:
# Attempting to interpret what "mean number of days per month" signifies

# Frost Risk
frost = filtered_data[filtered_data['TempF']<= 32]
frost

Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME,TempF
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1992-01-17,USC00087205,TMIN,-2.2,,,0,1800.0,28.04
1992-01-21,USC00087205,TMIN,0.0,,,0,1800.0,32.00
1992-01-25,USC00087205,TMIN,0.0,,,0,1800.0,32.00
1993-12-27,USC00087205,TMIN,-1.1,,,0,1600.0,30.02
1995-12-11,USC00087205,TMIN,0.0,,,0,1600.0,32.00
...,...,...,...,...,...,...,...,...
2018-01-04,USC00087205,TMIN,-2.8,,,7,1600.0,26.96
2018-01-05,USC00087205,TMIN,-1.7,,,7,1600.0,28.94
2018-01-18,USC00087205,TMIN,-5.6,,,7,1600.0,21.92
2018-01-19,USC00087205,TMIN,0.0,,,7,1600.0,32.00


In [15]:
print(len(frost))

75


In [16]:
print(len(filtered_data))

3648


In [17]:
# Freeze Risk
freeze = filtered_data[filtered_data['TempF']<= 28]
freeze

Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME,TempF
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1995-12-25,USC00087205,TMIN,-3.3,,,0,1600.0,26.06
1995-12-26,USC00087205,TMIN,-3.3,,,0,1600.0,26.06
1995-12-27,USC00087205,TMIN,-7.8,,,0,1600.0,17.96
1997-01-19,USC00087205,TMIN,-3.9,,,0,1600.0,24.98
1999-01-06,USC00087205,TMIN,-2.8,,,0,1600.0,26.96
1999-01-07,USC00087205,TMIN,-2.8,,,0,1600.0,26.96
1999-01-08,USC00087205,TMIN,-3.3,,,0,1600.0,26.06
2001-01-01,USC00087205,TMIN,-3.3,,,0,1600.0,26.06
2003-01-24,USC00087205,TMIN,-3.3,,,0,1600.0,26.06
2008-01-03,USC00087205,TMIN,-2.8,,,0,1600.0,26.96


In [18]:
print(len(freeze))

20


In [19]:
# Get a count of the days and divide by number of years to get average amount
frost['count'] = frost['TempF']/frost['TempF']
frost['count']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frost['count'] = frost['TempF']/frost['TempF']


DATE
1992-01-17    1.0
1992-01-21    1.0
1992-01-25    1.0
1993-12-27    1.0
1995-12-11    1.0
             ... 
2018-01-04    1.0
2018-01-05    1.0
2018-01-18    1.0
2018-01-19    1.0
2020-01-22    1.0
Name: count, Length: 75, dtype: float64

In [20]:
# and divide by number of years to get average amount
frost['count'] = frost['count']/30
frost['count']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frost['count'] = frost['count']/30


DATE
1992-01-17    0.033333
1992-01-21    0.033333
1992-01-25    0.033333
1993-12-27    0.033333
1995-12-11    0.033333
                ...   
2018-01-04    0.033333
2018-01-05    0.033333
2018-01-18    0.033333
2018-01-19    0.033333
2020-01-22    0.033333
Name: count, Length: 75, dtype: float64

In [21]:
frost_risk = frost.groupby([frost.index.month])["count"].sum()

In [22]:
frost_risk

DATE
1     1.866667
11    0.033333
12    0.600000
Name: count, dtype: float64

In [23]:
# Freeze Risk
freeze['count'] = freeze['TempF']/freeze['TempF']
freeze['count'] = freeze['count']/30
freeze['count']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  freeze['count'] = freeze['TempF']/freeze['TempF']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  freeze['count'] = freeze['count']/30


DATE
1995-12-25    0.033333
1995-12-26    0.033333
1995-12-27    0.033333
1997-01-19    0.033333
1999-01-06    0.033333
1999-01-07    0.033333
1999-01-08    0.033333
2001-01-01    0.033333
2003-01-24    0.033333
2008-01-03    0.033333
2009-01-22    0.033333
2010-01-06    0.033333
2010-01-07    0.033333
2010-01-10    0.033333
2010-01-11    0.033333
2010-12-28    0.033333
2010-12-29    0.033333
2014-01-25    0.033333
2018-01-04    0.033333
2018-01-18    0.033333
Name: count, dtype: float64

In [24]:
freeze_risk = freeze.groupby([freeze.index.month])["count"].sum()
freeze_risk

DATE
1     0.500000
12    0.166667
Name: count, dtype: float64

In [25]:
# Correlations with ENSO

# Import ENSO
enso = pd.read_csv('https://www.cpc.ncep.noaa.gov/data/indices/sstoi.indices', delim_whitespace=True)
enso

  enso = pd.read_csv('https://www.cpc.ncep.noaa.gov/data/indices/sstoi.indices', delim_whitespace=True)


Unnamed: 0,YR,MON,NINO1+2,ANOM,NINO3,ANOM.1,NINO4,ANOM.2,NINO3.4,ANOM.3
0,1982,1,24.28,-0.24,25.84,0.17,28.01,-0.21,26.65,0.08
1,1982,2,25.38,-0.72,26.26,-0.11,27.99,-0.11,26.54,-0.20
2,1982,3,25.22,-1.38,26.92,-0.25,28.18,-0.05,27.09,-0.14
3,1982,4,24.57,-1.16,27.52,-0.05,28.61,0.10,27.83,0.02
4,1982,5,24.00,-0.62,27.70,0.49,29.19,0.40,28.37,0.49
...,...,...,...,...,...,...,...,...,...,...
522,2025,7,22.29,0.46,25.92,0.04,28.84,0.05,27.24,-0.06
523,2025,8,21.09,0.23,24.97,-0.24,28.63,-0.06,26.58,-0.33
524,2025,9,20.40,-0.18,24.60,-0.41,28.41,-0.27,26.32,-0.44
525,2025,10,20.83,-0.04,24.74,-0.35,28.36,-0.33,26.29,-0.48


In [26]:
enso['MON']

0       1
1       2
2       3
3       4
4       5
       ..
522     7
523     8
524     9
525    10
526    11
Name: MON, Length: 527, dtype: int64

In [27]:
enso['month'] = enso['MON']
enso['year'] = enso['YR']
enso['day'] = 1
enso

Unnamed: 0,YR,MON,NINO1+2,ANOM,NINO3,ANOM.1,NINO4,ANOM.2,NINO3.4,ANOM.3,month,year,day
0,1982,1,24.28,-0.24,25.84,0.17,28.01,-0.21,26.65,0.08,1,1982,1
1,1982,2,25.38,-0.72,26.26,-0.11,27.99,-0.11,26.54,-0.20,2,1982,1
2,1982,3,25.22,-1.38,26.92,-0.25,28.18,-0.05,27.09,-0.14,3,1982,1
3,1982,4,24.57,-1.16,27.52,-0.05,28.61,0.10,27.83,0.02,4,1982,1
4,1982,5,24.00,-0.62,27.70,0.49,29.19,0.40,28.37,0.49,5,1982,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
522,2025,7,22.29,0.46,25.92,0.04,28.84,0.05,27.24,-0.06,7,2025,1
523,2025,8,21.09,0.23,24.97,-0.24,28.63,-0.06,26.58,-0.33,8,2025,1
524,2025,9,20.40,-0.18,24.60,-0.41,28.41,-0.27,26.32,-0.44,9,2025,1
525,2025,10,20.83,-0.04,24.74,-0.35,28.36,-0.33,26.29,-0.48,10,2025,1


In [29]:
enso['date'] = pd.to_datetime(enso[['year', 'month', 'day']])
enso

Unnamed: 0,YR,MON,NINO1+2,ANOM,NINO3,ANOM.1,NINO4,ANOM.2,NINO3.4,ANOM.3,month,year,day,date
0,1982,1,24.28,-0.24,25.84,0.17,28.01,-0.21,26.65,0.08,1,1982,1,1982-01-01
1,1982,2,25.38,-0.72,26.26,-0.11,27.99,-0.11,26.54,-0.20,2,1982,1,1982-02-01
2,1982,3,25.22,-1.38,26.92,-0.25,28.18,-0.05,27.09,-0.14,3,1982,1,1982-03-01
3,1982,4,24.57,-1.16,27.52,-0.05,28.61,0.10,27.83,0.02,4,1982,1,1982-04-01
4,1982,5,24.00,-0.62,27.70,0.49,29.19,0.40,28.37,0.49,5,1982,1,1982-05-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522,2025,7,22.29,0.46,25.92,0.04,28.84,0.05,27.24,-0.06,7,2025,1,2025-07-01
523,2025,8,21.09,0.23,24.97,-0.24,28.63,-0.06,26.58,-0.33,8,2025,1,2025-08-01
524,2025,9,20.40,-0.18,24.60,-0.41,28.41,-0.27,26.32,-0.44,9,2025,1,2025-09-01
525,2025,10,20.83,-0.04,24.74,-0.35,28.36,-0.33,26.29,-0.48,10,2025,1,2025-10-01


In [30]:
enso.set_index('date', inplace = True)
enso

Unnamed: 0_level_0,YR,MON,NINO1+2,ANOM,NINO3,ANOM.1,NINO4,ANOM.2,NINO3.4,ANOM.3,month,year,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1982-01-01,1982,1,24.28,-0.24,25.84,0.17,28.01,-0.21,26.65,0.08,1,1982,1
1982-02-01,1982,2,25.38,-0.72,26.26,-0.11,27.99,-0.11,26.54,-0.20,2,1982,1
1982-03-01,1982,3,25.22,-1.38,26.92,-0.25,28.18,-0.05,27.09,-0.14,3,1982,1
1982-04-01,1982,4,24.57,-1.16,27.52,-0.05,28.61,0.10,27.83,0.02,4,1982,1
1982-05-01,1982,5,24.00,-0.62,27.70,0.49,29.19,0.40,28.37,0.49,5,1982,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-07-01,2025,7,22.29,0.46,25.92,0.04,28.84,0.05,27.24,-0.06,7,2025,1
2025-08-01,2025,8,21.09,0.23,24.97,-0.24,28.63,-0.06,26.58,-0.33,8,2025,1
2025-09-01,2025,9,20.40,-0.18,24.60,-0.41,28.41,-0.27,26.32,-0.44,9,2025,1
2025-10-01,2025,10,20.83,-0.04,24.74,-0.35,28.36,-0.33,26.29,-0.48,10,2025,1


In [49]:
filtered_new = filtered_data.copy()
filtered_temp = filtered_new['TempF']
filtered_temp = filtered_temp.resample('1M').mean()
filtered_temp

  filtered_temp = filtered_temp.resample('1M').mean()


DATE
1990-01-31    54.801935
1990-02-28          NaN
1990-03-31          NaN
1990-04-30          NaN
1990-05-31          NaN
                ...    
2019-09-30          NaN
2019-10-31    71.989032
2019-11-30    58.851034
2019-12-31    59.338400
2020-01-31    54.430323
Freq: ME, Name: TempF, Length: 361, dtype: float64

In [50]:
filtered_temp = filtered_temp.dropna()
filtered_temp.index = filtered_temp.index.map(lambda d: d.replace(day=1))
filtered_temp

DATE
1990-01-01    54.801935
1990-10-01    66.606452
1990-11-01    57.362000
1990-12-01    53.460645
1991-01-01    57.316129
                ...    
2019-01-01    49.001290
2019-10-01    71.989032
2019-11-01    58.851034
2019-12-01    59.338400
2020-01-01    54.430323
Name: TempF, Length: 121, dtype: float64

In [51]:
merged = pd.merge(filtered_temp, enso, left_index=True, right_index=True)
merged

Unnamed: 0,TempF,YR,MON,NINO1+2,ANOM,NINO3,ANOM.1,NINO4,ANOM.2,NINO3.4,ANOM.3,month,year,day
1990-01-01,54.801935,1990,1,23.92,-0.60,25.27,-0.40,28.20,-0.02,26.45,-0.11,1,1990,1
1990-10-01,66.606452,1990,10,20.08,-0.79,24.91,-0.17,28.96,0.28,26.95,0.18,10,1990,1
1990-11-01,57.362000,1990,11,20.73,-0.90,24.84,-0.36,28.82,0.15,26.73,-0.09,11,1990,1
1990-12-01,53.460645,1990,12,22.12,-0.72,25.18,-0.08,28.84,0.39,26.86,0.22,12,1990,1
1991-01-01,57.316129,1991,1,23.73,-0.78,25.63,-0.05,28.62,0.40,26.89,0.33,1,1991,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-01-01,49.001290,2019,1,25.32,0.80,26.41,0.73,29.06,0.83,27.24,0.67,1,2019,1
2019-10-01,71.989032,2019,10,19.94,-0.93,25.25,0.16,29.53,0.84,27.28,0.52,10,2019,1
2019-11-01,58.851034,2019,11,21.49,-0.13,25.57,0.37,29.40,0.73,27.28,0.46,11,2019,1
2019-12-01,59.338400,2019,12,23.11,0.27,25.56,0.29,29.29,0.85,27.08,0.44,12,2019,1


In [52]:
numeric_df = merged[['TempF','ANOM','ANOM.1','ANOM.2','ANOM.3']]
numeric_df

Unnamed: 0,TempF,ANOM,ANOM.1,ANOM.2,ANOM.3
1990-01-01,54.801935,-0.60,-0.40,-0.02,-0.11
1990-10-01,66.606452,-0.79,-0.17,0.28,0.18
1990-11-01,57.362000,-0.90,-0.36,0.15,-0.09
1990-12-01,53.460645,-0.72,-0.08,0.39,0.22
1991-01-01,57.316129,-0.78,-0.05,0.40,0.33
...,...,...,...,...,...
2019-01-01,49.001290,0.80,0.73,0.83,0.67
2019-10-01,71.989032,-0.93,0.16,0.84,0.52
2019-11-01,58.851034,-0.13,0.37,0.73,0.46
2019-12-01,59.338400,0.27,0.29,0.85,0.44


In [53]:
# Correlations
corr_matrix = numeric_df.corr(method='pearson')
# Compute R² by squaring correlation coefficients
r_squared_matrix = corr_matrix ** 2

# Print results
print("Correlation matrix (Pearson):")
print(corr_matrix, "\n")

print("R² matrix (Coefficient of Determination):")
print(r_squared_matrix)

Correlation matrix (Pearson):
           TempF      ANOM    ANOM.1    ANOM.2    ANOM.3
TempF   1.000000  0.076600  0.108143  0.143696  0.122766
ANOM    0.076600  1.000000  0.865486  0.529156  0.744179
ANOM.1  0.108143  0.865486  1.000000  0.798120  0.962156
ANOM.2  0.143696  0.529156  0.798120  1.000000  0.906447
ANOM.3  0.122766  0.744179  0.962156  0.906447  1.000000 

R² matrix (Coefficient of Determination):
           TempF      ANOM    ANOM.1    ANOM.2    ANOM.3
TempF   1.000000  0.005868  0.011695  0.020649  0.015071
ANOM    0.005868  1.000000  0.749066  0.280006  0.553802
ANOM.1  0.011695  0.749066  1.000000  0.636995  0.925743
ANOM.2  0.020649  0.280006  0.636995  1.000000  0.821646
ANOM.3  0.015071  0.553802  0.925743  0.821646  1.000000


The highest correlation with the temperatures is: ANOM.2

In [54]:
merged2 = pd.merge(frost_new, enso, left_index=True, right_index=True)
merged2

Unnamed: 0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME,TempF,count,YR,...,ANOM,NINO3,ANOM.1,NINO4,ANOM.2,NINO3.4,ANOM.3,month,year,day
1992-01-01,USC00087205,TMIN,-2.2,,,0,1800.0,28.04,0.033333,1992,...,0.07,26.86,1.18,28.80,0.58,28.23,1.67,1,1992,1
1992-01-01,USC00087205,TMIN,0.0,,,0,1800.0,32.00,0.033333,1992,...,0.07,26.86,1.18,28.80,0.58,28.23,1.67,1,1992,1
1992-01-01,USC00087205,TMIN,0.0,,,0,1800.0,32.00,0.033333,1992,...,0.07,26.86,1.18,28.80,0.58,28.23,1.67,1,1992,1
1993-12-01,USC00087205,TMIN,-1.1,,,0,1600.0,30.02,0.033333,1993,...,-0.37,25.25,-0.02,28.74,0.29,26.74,0.10,12,1993,1
1995-12-01,USC00087205,TMIN,0.0,,,0,1600.0,32.00,0.033333,1995,...,-0.86,24.31,-0.96,27.96,-0.48,25.73,-0.91,12,1995,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-01-01,USC00087205,TMIN,-2.8,,,7,1600.0,26.96,0.033333,2018,...,-0.74,24.59,-1.09,28.02,-0.20,25.87,-0.70,1,2018,1
2018-01-01,USC00087205,TMIN,-1.7,,,7,1600.0,28.94,0.033333,2018,...,-0.74,24.59,-1.09,28.02,-0.20,25.87,-0.70,1,2018,1
2018-01-01,USC00087205,TMIN,-5.6,,,7,1600.0,21.92,0.033333,2018,...,-0.74,24.59,-1.09,28.02,-0.20,25.87,-0.70,1,2018,1
2018-01-01,USC00087205,TMIN,0.0,,,7,1600.0,32.00,0.033333,2018,...,-0.74,24.59,-1.09,28.02,-0.20,25.87,-0.70,1,2018,1


In [55]:
numeric_df = merged2[['TempF','ANOM','ANOM.1','ANOM.2','ANOM.3']]

# Correlations
corr_matrix = numeric_df.corr(method='pearson')
# Compute R² by squaring correlation coefficients
r_squared_matrix = corr_matrix ** 2

# Print results
print("Correlation matrix (Pearson):")
print(corr_matrix, "\n")

print("R² matrix (Coefficient of Determination):")
print(r_squared_matrix)

Correlation matrix (Pearson):
           TempF      ANOM    ANOM.1    ANOM.2    ANOM.3
TempF   1.000000  0.185249  0.115522  0.019083  0.072327
ANOM    0.185249  1.000000  0.823232  0.611319  0.700054
ANOM.1  0.115522  0.823232  1.000000  0.831465  0.957924
ANOM.2  0.019083  0.611319  0.831465  1.000000  0.931800
ANOM.3  0.072327  0.700054  0.957924  0.931800  1.000000 

R² matrix (Coefficient of Determination):
           TempF      ANOM    ANOM.1    ANOM.2    ANOM.3
TempF   1.000000  0.034317  0.013345  0.000364  0.005231
ANOM    0.034317  1.000000  0.677711  0.373710  0.490076
ANOM.1  0.013345  0.677711  1.000000  0.691334  0.917618
ANOM.2  0.000364  0.373710  0.691334  1.000000  0.868251
ANOM.3  0.005231  0.490076  0.917618  0.868251  1.000000


The highest correlation with the frost temperatures is: ANOM
I Know I did not do it right to finish, but I ran out of time.