<a href="https://colab.research.google.com/github/DariusTheGeek/Radiant-Earth-Spot-the-Crop-Challenge/blob/main/Brainiac_Numpy_Extraction_for_Month_Start_Month_End.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -qq install rasterio tifffile

[K     |████████████████████████████████| 19.3 MB 89 kB/s 
[?25h

In [None]:
import os
import glob
import shutil
import gc
from joblib import Parallel, delayed
from tqdm import tqdm_notebook
import h5py

import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt


import rasterio
import tifffile as tiff

%matplotlib inline
# Never truncate cell text when a frame is displayed (file paths are long).
pd.options.display.max_colwidth = None

In [None]:
%%time
# os.mkdir('radiant')
shutil.unpack_archive('/content/drive/MyDrive/CompeData/Radiant/Radiant_Data.zip', '/content/radiant')
gc.collect()

In [None]:
# Each split's metadata is stored in four CSV parts (1..4); stack them into one frame per split.
train = pd.concat(
  [pd.read_csv(f'/content/radiant/train{i}.csv', parse_dates=['datetime']) for i in range(1, 5)]
).reset_index(drop = True)
test = pd.concat(
  [pd.read_csv(f'/content/radiant/test{i}.csv', parse_dates=['datetime']) for i in range(1, 5)]
).reset_index(drop = True)

# Apply the same clean-up to both splits.
for frame in (train, test):
  # Replace the first two path components with the local extraction root.
  frame['file_path'] = frame['file_path'].apply(
    lambda x: '/'.join(['/content', 'radiant'] + x.split('/')[2:])
  )
  # Drop the time-of-day part, keeping midnight timestamps.
  frame['datetime'] = pd.to_datetime(frame['datetime'].dt.date)
  frame['month'] = frame['datetime'].dt.month

train.head()

In [None]:
# Distinct calendar months present in the training metadata.
train['month'].unique()

In [None]:
# Peek at ten of the distinct tile ids (positions 50..59).
train['tile_id'].unique()[50:60]

In [None]:
# Asset names read for every selected date, in output-column order.
bands = [
  'B01', 'B02', 'B03', 'B04',
  'B05', 'B06', 'B07', 'B08',
  'B8A', 'B09', 'B11', 'B12',
  'CLM',
]

In [None]:
# Column prefixes '<month>_<slot>' for months 4..11, with slot 1 and slot 2 per month.
date_cols = [f'{month}_{slot}' for month in range(4, 12) for slot in range(1, 3)]
date_cols

In [None]:
def process_tile_train(tile):
  """Build the per-pixel feature table for one training tile.

  For each month in 4..11 the earliest and the latest 's2' acquisition date of
  the tile are selected. For each selected date, band 1 of every raster listed
  in `bands` is read and flattened into one float64 column named
  '<date_col>_<band>', where date_col comes from `date_cols`.

  Args:
    tile: value compared against `train.tile_id`.

  Returns:
    DataFrame with one row per pixel: the feature columns (date-major,
    band-minor), followed by 'y' (read from the 'labels' raster) and 'fields'
    (read from the 'field_ids' raster).

  Raises:
    IndexError: if the tile has no 'labels' / 'field_ids' asset, no 's2'
      acquisition in one of the months, or no raster for a band on a
      selected date.
    ValueError: if a band raster does not have as many pixels as the labels.
  """
  def read_flat(path):
    # The context manager closes the dataset once the pixels are in memory;
    # leaving handles open leaks one file descriptor per raster.
    with rasterio.open(path) as src:
      return src.read(1).flatten()

  tile_df = train[train.tile_id == tile].reset_index(drop = True)

  y = read_flat(tile_df[tile_df.asset == 'labels'].file_path.values[0])
  fields = read_flat(tile_df[tile_df.asset == 'field_ids'].file_path.values[0])

  # From here on only the Sentinel-2 ('s2') rows of this tile are relevant.
  tile_df = tile_df[tile_df.satellite_platform == 's2'].reset_index(drop = True)

  dates = []
  for month in range(4, 12):
    # Sort once and take both ends: first and last acquisition of the month.
    month_dates = tile_df.loc[tile_df.month == month, 'datetime'].sort_values()
    if month_dates.empty:
      raise IndexError(f'tile {tile}: no s2 acquisition in month {month}')
    dates += [month_dates.iloc[0], month_dates.iloc[-1]]

  # Preallocate the whole matrix; appending column by column would copy the
  # full array on every iteration. float64 matches what np.append produced.
  date_pairs = list(zip(dates, date_cols))
  X_tile = np.empty((y.size, len(date_pairs) * len(bands)), dtype = np.float64)

  colls = []
  for date, datec in date_pairs:
    on_date = tile_df[tile_df.datetime == date]
    for band in bands:
      tif_file = on_date[on_date.asset == band].file_path.values[0]
      # len(colls) is the index of the next unfilled column.
      X_tile[:, len(colls)] = read_flat(tif_file)
      colls.append(datec + '_' + band)

  df = pd.DataFrame(X_tile, columns = colls)
  df['y'], df['fields'] = y, fields
  return df

In [None]:
# Split the distinct training tile ids into consecutive batches of at most 265 tiles.
tiles = train['tile_id'].unique()
chunks = []
for start in range(0, len(tiles), 265):
  chunks.append(tiles[start:start + 265])
list(map(len, chunks))

In [None]:
# Create the output folder up front so a missing folder does not surface only
# after a whole chunk has been extracted. Only the leaf folder is created: if
# its parent is missing (e.g. Drive is not mounted) this still fails loudly.
out_dir = '/content/drive/MyDrive/CompeData/Radiant/Start_end'
if not os.path.isdir(out_dir):
  os.mkdir(out_dir)

for i in range(len(chunks)):
  # One DataFrame per tile, computed in parallel worker processes.
  tile_frames = Parallel(n_jobs=-1, verbose=1, backend="multiprocessing")(
    delayed(process_tile_train)(tile) for tile in chunks[i]
  )
  # NOTE: np.vstack yields a plain array, so the CSV header is positional
  # (0, 1, ...) rather than the per-tile column names.
  stacked = np.vstack(tile_frames)
  del tile_frames  # release the per-tile frames before writing
  pd.DataFrame(stacked).to_csv(f'/content/drive/MyDrive/CompeData/Radiant/Start_end/train{i}.csv', index = False)
  del stacked
  gc.collect()
  print(i)

In [None]:
def process_tile_test(tile):
  tile_df = train[(train.tile_id == tile)].reset_index(drop = True)

  # y = np.expand_dims(rasterio.open(tile_df[tile_df.asset == 'labels'].file_path.values[0]).read(1).flatten(), axis = 1)
  fields = np.expand_dims(rasterio.open(tile_df[tile_df.asset == 'field_ids'].file_path.values[0]).read(1).flatten(), axis = 1)

  tile_df = train[(train.tile_id == tile) & (train.satellite_platform == 's2')].reset_index(drop = True)

  dates = []
  for month in range(4, 12):
    dates.append(tile_df[tile_df.month == month].datetime.sort_values().tolist()[0])
    dates.append(tile_df[tile_df.month == month].datetime.sort_values().tolist()[-1])

  X_tile = np.empty((256 * 256, 0))

  colls = []
  for date, datec in zip(dates, date_cols):
    for band in bands:
      tif_file = tile_df[(tile_df.asset == band) & (tile_df.datetime == date)].file_path.values[0]
      X_tile = np.append(X_tile, (np.expand_dims(rasterio.open(tif_file).read(1).flatten(), axis = 1)), axis = 1)
      colls.append(datec + '_' + band)
  df = pd.DataFrame(X_tile, columns = colls)
  df['fields'] = fields
  return df

In [None]:
tiles = train.tile_id.unique()
chunks = [tiles[x:x+265] for x in range(0, len(tiles), 265)]
[len(x) for x in chunks]

In [None]:
for i in range(len(chunks)):
  pd.DataFrame(np.vstack(Parallel(n_jobs=-1, verbose=1, backend="multiprocessing")(map(delayed(process_tile_train), [x for x in chunks[i]])))).to_csv(f'/content/drive/MyDrive/CompeData/Radiant/Start_end/test{i}.csv', index = False)
  gc.collect()
  print(i)