In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import openpyxl

In [50]:
# Open the workbook once, list its sheets, and keep a short preview of each.
file_path = "/home/nchore/Downloads/RNINO3.4.xlsx"
xls = pd.ExcelFile(file_path)
sheet_names = xls.sheet_names
# Map every sheet name to the head() of its parsed contents.
sheet_previews = dict(
    (sheet, xls.parse(sheet).head())
    for sheet in sheet_names
)
(sheet_names, sheet_previews)

(['NINO3.4',
  'NINO4',
  'IODE',
  'IODW',
  'MOYALE',
  'WAJIR',
  'MANDERA',
  'LAMU',
  'MALINDI',
  'MSABAHA',
  'MTWAPA',
  'MOMBASA',
  'VOI'],
 {'NINO3.4':    YEAR       J       F       M       A     M.1     J.1     J.2     A.1  \
  0  1949  24.551  26.455  27.046  27.576  26.077  26.462  27.682  27.671   
  1  1950  25.058  26.847  27.080  27.011  26.326  26.602  27.884  28.408   
  2  1951  25.867  27.130  26.763  26.724  26.539  26.645  28.266  28.710   
  3  1952  26.277  26.809  26.328  26.752  26.410  26.184  28.394  28.081   
  4  1953  26.181  26.230  25.937  27.204  25.777  26.113  28.555  27.519   
  
          S       O       N       D  
  0  26.788  27.061  25.921  25.737  
  1  26.535  26.802  25.671  25.604  
  2  26.127  26.683  25.520  25.570  
  3  25.634  26.652  25.540  25.972  
  4  25.631  26.113  25.675  26.482  ,
  'NINO4':    year  January  February   March   April     May    June    July  August  \
  0  1949   26.943    27.219  28.112  28.553  27.711  2

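Each sheet contains time-series data for a specific region; in the previews shown above there is one row per year and one column per month. The sheets are described here as sea surface temperature anomalies, but the previewed NINO3.4 and NINO4 values (roughly 24–29) look like absolute temperatures rather than anomalies — TODO confirm.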

In [51]:
# Build a {sheet name: DataFrame} mapping of cleaned sheets.
# Cleaning = drop rows and columns that are entirely missing, then turn every
# column label into a string with surrounding whitespace removed.
data_frames = {}
for sheet in sheet_names:
    df = (
        xls.parse(sheet)
        .dropna(axis=0, how="all")  # rows in which every value is missing
        .dropna(axis=1, how="all")  # columns in which every value is missing
    )
    df.columns = list(map(lambda col: str(col).strip(), df.columns))
    data_frames[sheet] = df

# Show the available sheet names together with the first rows of each cleaned sheet.
(data_frames.keys(), {sheet: df.head() for sheet, df in data_frames.items()})


(dict_keys(['NINO3.4', 'NINO4', 'IODE', 'IODW', 'MOYALE', 'WAJIR', 'MANDERA', 'LAMU', 'MALINDI', 'MSABAHA', 'MTWAPA', 'MOMBASA', 'VOI']),
 {'NINO3.4':    YEAR       J       F       M       A     M.1     J.1     J.2     A.1  \
  0  1949  24.551  26.455  27.046  27.576  26.077  26.462  27.682  27.671   
  1  1950  25.058  26.847  27.080  27.011  26.326  26.602  27.884  28.408   
  2  1951  25.867  27.130  26.763  26.724  26.539  26.645  28.266  28.710   
  3  1952  26.277  26.809  26.328  26.752  26.410  26.184  28.394  28.081   
  4  1953  26.181  26.230  25.937  27.204  25.777  26.113  28.555  27.519   
  
          S       O       N       D  
  0  26.788  27.061  25.921  25.737  
  1  26.535  26.802  25.671  25.604  
  2  26.127  26.683  25.520  25.570  
  3  25.634  26.652  25.540  25.972  
  4  25.631  26.113  25.675  26.482  ,
  'NINO4':    year  January  February   March   April     May    June    July  August  \
  0  1949   26.943    27.219  28.112  28.553  27.711  28.675  28.616

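No column name is shared by all sheets (the check below returns an empty set), meaning the datasets have different structures. For example, NINO3.4 labels its columns `YEAR`, `J`, `F`, … while NINO4 uses `year`, `January`, `February`, …, so even equivalent columns do not match by name.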

In [52]:
# Look for a column (e.g. a date column) present in every sheet by intersecting
# the column-label sets of all cleaned sheets; an empty set means no label is shared.
common_columns = set.intersection(
    *map(lambda frame: set(frame.columns), data_frames.values())
)
common_columns


set()

In [53]:
# Preview the first rows of every cleaned sheet so a time column can be spotted by eye.
data_previews = dict(
    (sheet, df.head())
    for sheet, df in data_frames.items()
)
data_previews

{'NINO3.4':    YEAR       J       F       M       A     M.1     J.1     J.2     A.1  \
 0  1949  24.551  26.455  27.046  27.576  26.077  26.462  27.682  27.671   
 1  1950  25.058  26.847  27.080  27.011  26.326  26.602  27.884  28.408   
 2  1951  25.867  27.130  26.763  26.724  26.539  26.645  28.266  28.710   
 3  1952  26.277  26.809  26.328  26.752  26.410  26.184  28.394  28.081   
 4  1953  26.181  26.230  25.937  27.204  25.777  26.113  28.555  27.519   
 
         S       O       N       D  
 0  26.788  27.061  25.921  25.737  
 1  26.535  26.802  25.671  25.604  
 2  26.127  26.683  25.520  25.570  
 3  25.634  26.652  25.540  25.972  
 4  25.631  26.113  25.675  26.482  ,
 'NINO4':    year  January  February   March   April     May    June    July  August  \
 0  1949   26.943    27.219  28.112  28.553  27.711  28.675  28.616  28.371   
 1  1950   26.668    27.528  28.376  28.583  27.527  28.507  28.698  28.664   
 2  1951   26.523    28.004  28.127  28.568  27.555  28.190  2

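Each sheet has a different structure (for example, the year column is `YEAR` in NINO3.4 but `year` in NINO4, and the months are single letters in one and full names in the other), and only some contain a time-related column.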

In [54]:
# Preview the first rows of every cleaned sheet to identify a time column.
# NOTE(review): this repeats the preview built in the previous cell (In [53]);
# consider removing one of the two.
data_previews = {name: data_frames[name].head() for name in data_frames}
data_previews


{'NINO3.4':    YEAR       J       F       M       A     M.1     J.1     J.2     A.1  \
 0  1949  24.551  26.455  27.046  27.576  26.077  26.462  27.682  27.671   
 1  1950  25.058  26.847  27.080  27.011  26.326  26.602  27.884  28.408   
 2  1951  25.867  27.130  26.763  26.724  26.539  26.645  28.266  28.710   
 3  1952  26.277  26.809  26.328  26.752  26.410  26.184  28.394  28.081   
 4  1953  26.181  26.230  25.937  27.204  25.777  26.113  28.555  27.519   
 
         S       O       N       D  
 0  26.788  27.061  25.921  25.737  
 1  26.535  26.802  25.671  25.604  
 2  26.127  26.683  25.520  25.570  
 3  25.634  26.652  25.540  25.972  
 4  25.631  26.113  25.675  26.482  ,
 'NINO4':    year  January  February   March   April     May    June    July  August  \
 0  1949   26.943    27.219  28.112  28.553  27.711  28.675  28.616  28.371   
 1  1950   26.668    27.528  28.376  28.583  27.527  28.507  28.698  28.664   
 2  1951   26.523    28.004  28.127  28.568  27.555  28.190  2