In [None]:
import xarray as xr
import pandas as pd

# Open the NetCDF training set. Per the dataset repr, `weather` is indexed by
# (location, timestamp, feature) while `tracked` and `out` are indexed by
# (location, timestamp) only.
ds = xr.open_dataset("train.nc")
print(ds)

# Flatten to a long frame: one row per (location, timestamp, feature), with
# the per-(location, timestamp) variables repeated on every feature row.
df = ds.to_dataframe().reset_index()
print(df.head())

# Independent copy of the long frame, kept under its original name.
df1 = df.copy()

# Reshape to wide format: one row per (timestamp, location), one column per
# feature.
#
# Only the true row keys are used for the reshape. Grouping on the float
# columns `out` / `tracked` (as pivot_table would) silently discards every row
# where either is NaN, and pivot_table's default mean aggregation would hide
# duplicated keys. `pivot` raises on duplicates instead of averaging them.
row_keys = ["timestamp", "location"]
weather_wide = df1.pivot(index=row_keys, columns="feature", values="weather")

# `out` / `tracked` are constant within a (timestamp, location) group, so the
# first row of each group carries them. Sorting keeps the same
# (timestamp, location) row order that the pivot produces.
targets = (
    df1.drop_duplicates(subset=row_keys)
    .set_index(row_keys)[["out", "tracked"]]
    .sort_index()
)

# Column order: timestamp, location, out, tracked, then the feature columns.
# The column axis keeps the name "feature".
train_wide = (
    targets.join(weather_wide)
    .rename_axis(columns="feature")
    .reset_index()
)

# Sanity check: exactly one wide row per (location, timestamp) pair.
assert len(train_wide) == ds.sizes["location"] * ds.sizes["timestamp"], (
    "wide frame lost or gained rows during the reshape"
)

print(train_wide.head())
print(train_wide.tail())

train_wide.describe()

<xarray.Dataset> Size: 159MB
Dimensions:    (location: 83, timestamp: 2161, feature: 109)
Coordinates:
  * location   (location) <U5 2kB '26001' '26003' '26005' ... '26163' '26165'
  * feature    (feature) object 872B 'SBT113' 'SBT114' 'SBT123' ... 'wz' 'wz_1'
  * timestamp  (timestamp) datetime64[ns] 17kB 2023-04-01 ... 2023-06-30
    state      (location) <U2 664B ...
Data variables:
    tracked    (location, timestamp) float64 1MB ...
    out        (location, timestamp) float64 1MB ...
    weather    (location, timestamp, feature) float64 156MB ...
Attributes:
    time_start:  2022-01-01T00:00:00
    time_end:    2022-01-31T23:00:00
    time_now:    2025-07-08T14:59:10
  location  timestamp feature  tracked  out     weather state
0    26001 2023-04-01  SBT113  12588.0  0.0  221.470001    26
1    26001 2023-04-01  SBT114  12588.0  0.0  252.800003    26
2    26001 2023-04-01  SBT123  12588.0  0.0  229.880005    26
3    26001 2023-04-01  SBT124  12588.0  0.0  263.500000    26
4    260

feature,timestamp,out,tracked,SBT113,SBT114,SBT123,SBT124,aod,bgrun,blh,...,vddsf,veg,veril,vgtyp,vis,vstm,vucsh,vvcsh,wz,wz_1
count,179363,179363.0,179363.0,179363.0,179363.0,179363.0,179363.0,179363.0,179363.0,179363.0,...,179363.0,179363.0,179363.0,179363.0,179363.0,179363.0,179363.0,179363.0,179363.0,179363.0
mean,2023-05-16 00:00:00.000000256,45.248334,60026.752965,230.961661,277.051781,239.413582,278.654294,0.0,0.0,599.394865,...,71.264572,40.283333,0.017252,11.470231,31383.149256,-3.730822,1.631568,0.294952,-0.000461,-0.000515
min,2023-04-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-26.245855,-23.225407,-25.795647,-0.114232,-0.10309
25%,2023-04-23 12:00:00,0.0,14332.0,228.979996,272.0,236.929993,273.200012,0.0,0.0,134.369926,...,0.0,21.1,0.0,5.0,17400.0,-8.950899,-1.493742,-2.627814,-0.008439,-0.016458
50%,2023-05-16 00:00:00,0.0,23218.0,231.029999,280.399994,239.539993,281.579987,0.0,0.0,394.71933,...,28.5,31.4,0.0,14.0,26600.0,-4.890234,1.036793,-0.035397,-0.000319,-0.000426
75%,2023-06-07 12:00:00,1.0,43825.0,233.399994,287.100006,242.770004,288.700012,0.0,0.0,884.530884,...,127.300003,60.0,0.0,14.0,42600.0,0.968472,4.203418,2.748543,0.007881,0.014827
max,2023-06-30 00:00:00,23346.0,922425.0,250.179993,305.600006,259.820007,307.899994,0.0,0.0,3401.091309,...,763.0,100.0,14.3488,17.0,90000.0,24.902107,29.292791,33.150009,0.091607,0.090685
std,,452.269701,129549.071944,8.29356,17.734626,9.168975,17.791722,0.0,0.0,591.962678,...,94.986445,26.013205,0.117491,4.209413,18460.532102,7.154978,5.060016,5.181022,0.010552,0.018586


In [None]:
# Missing values per column. The frame has more columns than the default
# Series display shows (the middle is elided with ".."), so list only the
# columns that actually contain nulls; an empty result means none are missing.
train_wide.isnull().sum().loc[lambda counts: counts > 0]

feature
timestamp    0
location     0
out          0
tracked      0
SBT113       0
            ..
vstm         0
vucsh        0
vvcsh        0
wz           0
wz_1         0
Length: 113, dtype: int64

In [None]:
# Number of fully duplicated rows. The raw boolean mask has one entry per row
# and is truncated on display, so it cannot show whether any duplicate exists;
# the count answers that directly (0 means no duplicates).
train_wide.duplicated().sum()

0         False
1         False
2         False
3         False
4         False
          ...  
179358    False
179359    False
179360    False
179361    False
179362    False
Length: 179363, dtype: bool

In [None]:
# Keep only the numeric columns for the value-range checks in later cells.
numeric_df = train_wide.select_dtypes(include="number")

In [None]:
# Per-column count of strictly negative entries, across all numeric columns.
print(numeric_df.lt(0).sum())

feature
out             0
tracked         0
SBT113          0
SBT114          0
SBT123          0
            ...  
vstm       128691
vucsh       70285
vvcsh       90404
wz          92250
wz_1        91417
Length: 111, dtype: int64


In [None]:
# Names of the numeric columns that contain at least one negative value.
neg_cols = numeric_df.columns[numeric_df.lt(0).any(axis=0)]
print("Columns with negative values:", neg_cols.tolist())


Columns with negative values: ['cin', 'cpofp', 'gflux', 'gh_1', 'ishf', 'lftx', 'lftx4', 'refc', 'refd', 'slhtf', 'u', 'u10', 'unknown_3', 'ustm', 'v', 'v10', 'vstm', 'vucsh', 'vvcsh', 'wz', 'wz_1']


In [None]:
# Count of negative entries, restricted to the columns that have any.
neg_counts = numeric_df.loc[:, neg_cols].lt(0).sum()
print("\nCount of negatives in each column:\n", neg_counts)



Count of negatives in each column:
 feature
cin           19011
cpofp        179197
gflux         60506
gh_1              4
ishf          81229
lftx          10636
lftx4         13966
refc         154853
refd         166448
slhtf          5003
u             91707
u10           89802
unknown_3    170113
ustm          79495
v             92509
v10           95165
vstm         128691
vucsh         70285
vvcsh         90404
wz            92250
wz_1          91417
dtype: int64


In [None]:
# Names of the numeric columns that contain at least one exact zero.
zero_col = numeric_df.columns[numeric_df.eq(0).any(axis=0)]
print(zero_col)

Index(['out', 'tracked', 'SBT113', 'SBT114', 'SBT123', 'SBT124', 'aod',
       'bgrun', 'blh', 'cape',
       ...
       'vddsf', 'veg', 'veril', 'vgtyp', 'vis', 'vstm', 'vucsh', 'vvcsh', 'wz',
       'wz_1'],
      dtype='object', name='feature', length=111)


In [None]:
# Count of exact zeros in each column that contains at least one zero.
zero_counts = (numeric_df[zero_col] == 0).sum()
# Label fixed: the original message misspelled "zeros" as "zaro".
print("\nCount of zeros in each column:\n", zero_counts)


Count of zaro in each column:
 feature
out        126403
tracked      2463
SBT113        186
SBT114        186
SBT123        186
            ...  
vstm          166
vucsh         166
vvcsh         166
wz            166
wz_1          166
Length: 111, dtype: int64


In [None]:
# Frequency of each distinct `tracked` value (value_counts sorts by
# descending count by default).
unique_counts = train_wide.loc[:, "tracked"].value_counts()
print(unique_counts)

tracked
0.0         2463
922425.0    2161
413676.0    2161
84953.0     2161
43558.0     2064
            ... 
16293.0        1
11911.0        1
9360.0         1
130059.0       1
24050.0        1
Name: count, Length: 1755, dtype: int64


In [None]:
# Number of distinct non-null values in `tracked`.
train_wide.loc[:, "tracked"].nunique(dropna=True)

1755