In [None]:
# Mount Google Drive at /content/drive/ so the paths used in the cells below resolve.
# This import only exists inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os
import torch
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from PIL import Image
from tqdm import tqdm


def count_images_by_year(root_dir, years=('2022', '2023')):
    """Count image files below ``root_dir``, grouped by year.

    A directory is attributed to a year when the year string occurs anywhere
    in its path as yielded by ``os.walk`` (this includes ``root_dir`` itself).
    A directory whose path contains several of the requested years is
    counted once for each of them.

    Args:
        root_dir: Root of the directory tree to scan.
        years: Iterable of year strings to look for in directory paths.

    Returns:
        A tuple ``(image_count, all_files)``. ``image_count`` maps each year
        to the number of ``.png``/``.jpg``/``.jpeg`` files (case-insensitive)
        found in matching directories. ``all_files`` holds the bare names of
        *every* file in those directories, not only the images.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    # Materialise once: the value is iterated for the dict and again per directory.
    years = tuple(years)
    image_count = {year: 0 for year in years}
    all_files = []

    for subdir, _dirs, files in os.walk(root_dir):
        # The per-directory image count does not depend on the year, so compute it once.
        n_images = sum(1 for name in files if name.lower().endswith(image_extensions))
        for year in years:
            if year in subdir:
                image_count[year] += n_images
                all_files.extend(files)
    return image_count, all_files

def load_classified_image_paths(csv_path):
    """Return the distinct values of the ``Filename`` column of ``csv_path``.

    Args:
        csv_path: Path to a CSV file that has a ``Filename`` column.

    Returns:
        A set of the ``Filename`` values, or an empty set when ``csv_path``
        does not exist.
    """
    if not os.path.exists(csv_path):
        # Nothing has been classified yet.
        return set()
    classified = pd.read_csv(csv_path)
    return set(classified['Filename'])

# Define a Dataset for inference
class InferenceDataset(Dataset):
    """Dataset over the images of one year that still need to be classified.

    The constructor walks ``root_dir`` and keeps every ``.png``/``.jpg``/
    ``.jpeg`` file (case-insensitive) whose directory path contains
    ``str(year)`` and whose full path is not in ``classified_image_paths``.
    Each item is ``(image, image_path)``.
    """

    def __init__(self, root_dir, year, classified_image_paths, transform=None):
        year_tag = str(year)
        image_suffixes = ('.png', '.jpg', '.jpeg')

        pending = []
        for dir_path, _subdirs, names in os.walk(root_dir):
            if year_tag not in dir_path:
                continue
            for name in names:
                if not name.lower().endswith(image_suffixes):
                    continue
                full_path = os.path.join(dir_path, name)
                if full_path in classified_image_paths:
                    # Already classified; skip it.
                    continue
                pending.append(full_path)

        self.image_paths = pending
        self.transform = transform

    def __len__(self):
        """Number of images left to classify."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB, apply the transform if one is set, and return it with its path."""
        path = self.image_paths[idx]
        rgb = Image.open(path).convert('RGB')
        return (self.transform(rgb) if self.transform else rgb), path


def extract_patches(image, patch_size, stride):
    """Slice ``image`` into overlapping patches on a regular grid.

    Args:
        image: Object whose ``size()`` returns three values, unpacked as
            (channels, height, width), and that supports
            ``image[:, y0:y1, x0:x1]`` slicing (e.g. a 3-D torch tensor).
        patch_size: ``(patch_width, patch_height)``.
        stride: Step between patch origins, used for both axes.

    Returns:
        A list of patches in row-major order (top to bottom, then left to
        right). The list is empty when the image is smaller than a patch.
    """
    _, height, width = image.size()
    patch_w, patch_h = patch_size[0], patch_size[1]
    return [
        image[:, top:top + patch_h, left:left + patch_w]
        for top in range(0, height - patch_h + 1, stride)
        for left in range(0, width - patch_w + 1, stride)
    ]

def _append_predictions_csv(rows, save_path):
    """Append ``(Filename, Class)`` rows to ``save_path``; the header is written only for a new file."""
    if not rows:
        return
    frame = pd.DataFrame(rows, columns=['Filename', 'Class'])
    is_new_file = not os.path.isfile(save_path)
    frame.to_csv(save_path, mode='a', index=False, header=is_new_file)


def classify_images_with_patches(data_loader, model, patch_size, stride, save_path, save_interval=10):
    """Classify every image by averaging the model's logits over its patches.

    Args:
        data_loader: Iterable yielding ``(images, paths)`` batches, where
            iterating ``images`` yields one image tensor per entry of ``paths``.
        model: Callable module whose output exposes a ``.logits`` attribute
            with one row per input patch.
        patch_size: ``(patch_width, patch_height)`` passed to ``extract_patches``.
        stride: Patch stride passed to ``extract_patches``.
        save_path: CSV file that receives ``Filename,Class`` checkpoint rows.
        save_interval: Number of batches between checkpoint writes.

    Returns:
        List of ``(path, predicted_class_index)`` tuples for all images.

    Notes:
        Each checkpoint appends only the rows produced since the previous
        one, and the remaining rows are flushed after the last batch, so
        ``save_path`` ends up holding every prediction exactly once.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The inputs are moved to `device` below, so the model must live there too.
    model.to(device)
    model.eval()

    predictions = []
    saved_upto = 0  # number of leading entries of `predictions` already on disk
    batch_count = 0
    with torch.no_grad():
        for images, paths in tqdm(data_loader, desc='Classifying'):
            # Tensor.to() is not in-place; the result has to be kept.
            images = images.to(device)
            for image, path in zip(images, paths):
                # Patches are views of `image`, so they are already on `device`.
                patches = torch.stack(extract_patches(image, patch_size, stride))
                logits = model(patches).logits
                # Aggregate by averaging the per-patch logits, then take the argmax.
                mean_logits = torch.mean(logits, dim=0)
                _, predicted = torch.max(mean_logits, 0)
                predictions.append((path, predicted.item()))

            # Checkpoint every `save_interval` batches, writing only the new rows.
            batch_count += 1
            if batch_count % save_interval == 0:
                _append_predictions_csv(predictions[saved_upto:], save_path)
                saved_upto = len(predictions)

    # Flush whatever was produced after the last full interval.
    _append_predictions_csv(predictions[saved_upto:], save_path)
    return predictions


# Define the custom transformation sequence
class TopCropTransform:
    """Remove the top 30% of an image and keep the remaining lower part.

    The input must expose ``size`` as ``(width, height)`` and a ``crop(box)``
    method taking ``(left, upper, right, lower)``, as PIL images do.
    """

    def __call__(self, img):
        img_w, img_h = img.size
        crop_box = (0, img_h * 0.3, img_w, img_h)
        return img.crop(crop_box)

# Preprocessing applied to every image, in order: drop the top 30%,
# convert to a tensor, then normalise each channel.
_preprocessing_steps = [
    TopCropTransform(),
    transforms.ToTensor(),
    # NOTE(review): these values match the commonly used ImageNet channel
    # statistics - confirm they are what the checkpoint was trained with.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transformations = transforms.Compose(_preprocessing_steps)


# Inventory the per-year image folders and report duplicated file names.
imagesRoot = '/content/drive/MyDrive/GSV-CropType-Thailand/images/'
root_dir_path = imagesRoot + 'ThailandFieldsByYear'

image_counts, files = count_images_by_year(root_dir_path)

for year, count in image_counts.items():
    print(f"Number of images in {year}: {count}")

df = pd.DataFrame(files, columns=['Filename'])
print('Pre-Duplicates dropped', len(df))

unique = df.drop_duplicates()
print('Post-Duplicates dropped', len(unique))
first_column = df.columns[0]

print(df)

# Every row whose file name occurs more than once (keep=False marks all copies).
duplicates = df[df.duplicated(subset=[first_column], keep=False)]

print("Duplicate Rows based on the first column:")
print(duplicates)

print("DataFrame with Duplicates:")
print('DUPLICATES', duplicates)

# Was `df.drop_duplictes()`, which raised AttributeError.
df = df.drop_duplicates()

In [None]:
from transformers import ViTForImageClassification
# Pick the device first so the checkpoint can be mapped onto it while loading.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k', num_labels=5)

# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
model_state_dict = torch.load(
    imagesRoot + 'ViT-Thailand-4cropsOther-lr2e-4-ep10_epoch_19.pth',
    map_location=device,
)
model.load_state_dict(model_state_dict)
model.to(device)
model.eval()

year_to_classify = 2022

# Checkpoint CSV: images already listed here are skipped, which makes the run resumable.
# load_classified_image_paths returns an empty set on the first run, when the file
# does not exist yet.
results_csv_path = imagesRoot + 'ViTClassifiedImages3.csv'
classified_image_paths = load_classified_image_paths(results_csv_path)

inference_dataset = InferenceDataset(root_dir_path, year_to_classify, classified_image_paths, transform=transformations)
print(len(inference_dataset.image_paths))

# Create a DataLoader for batch processing
data_loader = DataLoader(inference_dataset, batch_size=32, num_workers=4)

# Define patch size and stride (adjust these values as needed)
patch_size = (224, 224)
stride = 30

# Perform inference with patches
predictions = classify_images_with_patches(data_loader, model, patch_size, stride, results_csv_path)

# Save this run's predictions to a separate CSV file.
results_csv_path = imagesRoot + 'ViTClassifiedImages3Post.csv'
df = pd.DataFrame(predictions, columns=['Filename', 'Class'])

# Write the header only when the file is created; otherwise a later pd.read_csv
# would consume the first prediction row as the header.
df.to_csv(results_csv_path, mode='a', index=False, header=not os.path.isfile(results_csv_path))


In [None]:
import os
import shutil

def move_images(source_folder, target_root_folder, images_per_folder=500, folder_prefix="2022_10"):
    """Move the images of ``source_folder`` into numbered sub-folders of ``target_root_folder``.

    Images (``.png``/``.jpg``/``.jpeg``, case-insensitive) are processed in
    sorted name order and placed, ``images_per_folder`` at a time, into
    ``<target_root_folder>/<folder_prefix>_1``, ``<folder_prefix>_2``, ...
    Sub-folders are created as needed.

    Args:
        source_folder: Existing folder that holds the images to move.
        target_root_folder: Existing folder that receives the sub-folders.
        images_per_folder: Maximum number of images per sub-folder (>= 1).
        folder_prefix: Name prefix of the numbered sub-folders.

    Returns:
        List of the moved file names. The list is empty when either folder
        is missing (a message is printed in that case).

    Raises:
        ValueError: If ``images_per_folder`` is smaller than 1.
    """
    if images_per_folder < 1:
        raise ValueError("images_per_folder must be at least 1")

    if not os.path.exists(source_folder):
        print(f"Source folder '{source_folder}' does not exist.")
        return []

    if not os.path.exists(target_root_folder):
        print(f"Target root folder '{target_root_folder}' does not exist.")
        return []

    # Sorted so the file-to-folder assignment does not depend on os.listdir order.
    image_files = sorted(
        f for f in os.listdir(source_folder)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    folder_count = 0
    for start in range(0, len(image_files), images_per_folder):
        folder_count += 1
        target_folder = os.path.join(target_root_folder, f"{folder_prefix}_{folder_count}")
        os.makedirs(target_folder, exist_ok=True)
        for image_file in image_files[start:start + images_per_folder]:
            shutil.move(os.path.join(source_folder, image_file),
                        os.path.join(target_folder, image_file))

    print(f"Moved {len(image_files)} images into {folder_count} folders.")
    return image_files
# Move the images under 'ThailandFieldOrNot/field' into numbered folders below
# 'ThailandFieldsByYear'. `imagesRoot` must already be defined by an earlier cell.
# This moves files on disk, so re-running the cell finds the source folder emptied.
source_folder = imagesRoot + 'ThailandFieldOrNot/field'
target_root_folder = imagesRoot + 'ThailandFieldsByYear'
filenames = move_images(source_folder, target_root_folder)


In [None]:
import csv
import re

def extractParamFromFilenameAndFull(filename):
    """Parse image metadata out of a file name or a full path.

    Two file-name layouts are recognised:

    * Tagged: starts with ``&date`` and carries ``&panoid``, ``&GSVLat``,
      ``&GSVLon``, ``&head`` and ``&area`` tags.
    * Fixed-width: 7 characters of date, 22 characters of pano id, then the
      heading, followed by ``&lat``/``&lon`` tags. When the name also has
      ``GSVLat``/``GSVLon`` fields, the heading is replaced by the bearing
      from the GSV position to the ``&lat``/``&lon`` position, obtained from
      ``computeBearing``.

    Args:
        filename: Bare file name or ``/``-separated path; only the last path
            component is parsed.

    Returns:
        ``[filename, panoId, date, head, GSVLat, GSVLon]`` where ``filename``
        is the bare file name. For a tagged name whose ``&panoid`` or
        ``&head`` tag cannot be matched, a list of six ``None`` values.
    """
    unparsed = [None, None, None, None, None, None]

    # Accept bare names as well as full paths. Previously only paths starting
    # with '/content' were parsed and anything else returned None.
    filename = filename.rsplit('/', 1)[-1]

    if filename[0:5] == '&date':
        pano_match = re.search(r"&panoid(.*?)&GSV", filename)
        head_match = re.search(r"&head(.*?)&area", filename)
        # Both tags are required. A missing '&head' used to fall through and
        # return None instead of the six-None list.
        if not pano_match or not head_match:
            return unparsed

        date = filename[5:12]
        GSVLatIdx = filename.find('&GSVLat')
        GSVLonIdx = filename.find('&GSVLon')
        headIdx = filename.find('&head')
        # 7 == len('&GSVLat') == len('&GSVLon')
        GSVLat = filename[GSVLatIdx+7:GSVLonIdx]
        GSVLon = filename[GSVLonIdx+7:headIdx]
        return [filename, pano_match.group(1), date, head_match.group(1), GSVLat, GSVLon]

    # Fixed-width layout.
    date = filename[0:7]
    GSVLatIdx = filename.find('GSVLat')
    GSVLonIdx = filename.find('GSVLon')

    latIdx = filename.find('&lat')
    lonIdx = filename.find('&lon')
    endIdx = filename.find('.jpg')
    lat = filename[latIdx+4:lonIdx]
    lon = filename[lonIdx+4:endIdx]
    GSVLat = filename[GSVLatIdx+6:GSVLonIdx]
    # NOTE(review): the '-5' offset depends on the exact text preceding '&lat'
    # in this layout - confirm against real file names.
    GSVLon = filename[GSVLonIdx+6:latIdx-5]
    head = filename[29:latIdx]
    if GSVLatIdx > 0:
        # A GSV position is present: derive the heading from the two coordinates.
        head = computeBearing((float(GSVLat), float(GSVLon)), (float(lat), float(lon)))

    panoId = filename[7:29]
    return [filename, panoId, date, head, GSVLat, GSVLon]


def save_to_csv(lons, lats, y_pred, folds, filenames, outfilename):
    """Append ``(latitude, longitude, cropland_type, fold)`` rows to a CSV file.

    Args:
        lons: Sequence of longitudes; entries with length 0 are skipped.
        lats: Sequence of latitudes, index-aligned with ``lons``.
        y_pred: Sequence of class labels, index-aligned with ``lons``.
        folds: Sequence of fold ids, index-aligned with ``lons``.
        filenames: Accepted for interface compatibility; not used.
        outfilename: Destination CSV. The header row is written only when the
            file did not exist before the call.
    """
    header = ['latitude','longitude','cropland_type', 'fold']
    # Must be checked before open(): opening in append mode creates the file.
    already_present = os.path.isfile(outfilename)

    rows = [
        (lats[idx], lons[idx], y_pred[idx], folds[idx])
        for idx, lon in enumerate(lons)
        if len(lon) >= 1
    ]

    with open(outfilename, 'a', newline='') as out_file:
        out_writer = csv.writer(out_file)
        if not already_present:
            out_writer.writerow(header)
        out_writer.writerows(rows)

def generateGEEPoints(df, start_year=2022, end_year=2022, start_month=5, end_month=10,
                      offset_m=30, out_path=None):
    """Turn classified image rows into labelled points and append them to a CSV.

    For each row the metadata is parsed from the file name. Rows inside the
    date window are projected ``offset_m`` away from the GSV position along
    the heading via ``computePointOnField`` and collected with their label.

    Args:
        df: DataFrame whose first column holds file names or paths and whose
            second column holds the predicted class.
        start_year, end_year, start_month, end_month: Date window passed to
            ``withinDate``.
        offset_m: Distance passed to ``computePointOnField``.
        out_path: Destination CSV; defaults to
            ``imagesRoot + 'ViTClassifiedImagesProcClean2.csv'``.

    Side effects:
        Prints counts and the duplicated (lon, lat) rows, and appends the
        points to ``out_path`` through ``save_to_csv``.
    """
    if out_path is None:
        out_path = imagesRoot + 'ViTClassifiedImagesProcClean2.csv'

    labels = df.iloc[:, 1].tolist()
    filenames = df.iloc[:, 0].tolist()
    meta = [extractParamFromFilenameAndFull(filename) for filename in filenames]

    lons = []
    lats = []
    y_pred = []
    folds = []
    heads = []
    filenames_filtered = []
    skipped = 0
    for i, point in enumerate(meta):
        # The parser signals failure with None or an all-None list; indexing or
        # slicing those would raise TypeError, so skip such rows.
        if not point or point[2] is None:
            skipped += 1
            continue
        if withinDate(start_month, start_year, end_month, end_year, point[2]):
            # point = [filename, panoId, date, head, GSVLat, GSVLon]
            pt_lat, pt_lon = computePointOnField((float(point[-2]), float(point[-1])), float(point[-3]), offset_m)
            lons.append(str(pt_lon))
            lats.append(str(pt_lat))
            y_pred.append(labels[i])
            heads.append(str(point[-3]))
            folds.append(0)
            filenames_filtered.append(point[0])

    print('Before', len(meta))
    print('After', len(y_pred))
    if skipped:
        print('Skipped (unparseable filename)', skipped)

    coords = pd.DataFrame({'lon': lons, 'lat': lats, 'head': heads})
    duplicates = coords.duplicated(subset=['lon', 'lat'], keep='first')

    # Print duplicate rows
    print("Duplicate rows based on (lon, lat):")
    print(coords[duplicates])
    save_to_csv(lons, lats, y_pred, folds, filenames_filtered, out_path)


# Load a predictions CSV and convert it to labelled points.
# NOTE(review): generateGEEPoints calls withinDate and computePointOnField (and,
# through the filename parser, computeBearing), which are defined in a later cell
# of this notebook. On a fresh kernel that cell has to be run before this one.
imagesRoot = '/content/drive/MyDrive/GSV-CropType-Thailand/images/'

# NOTE(review): this reads 'ViTClassifiedImages2Post.csv', while the inference cell
# writes 'ViTClassifiedImages3Post.csv' - confirm which file is intended.
results_csv_path = imagesRoot + 'ViTClassifiedImages2Post.csv'

df = pd.read_csv(results_csv_path)
generateGEEPoints(df)


In [None]:
# Debug check: show the first 7 characters of the first entry in column 0,
# then the metadata the filename parser extracts from that entry.
print(df.iloc[0,0][0:7])
print(extractParamFromFilenameAndFull(df.iloc[0,0]))

In [None]:
def computePointOnField(fro, theta, d):
    """Destination point reached from ``fro`` along bearing ``theta`` after distance ``d``.

    Args:
        fro: ``(latitude, longitude)`` of the start point, in degrees.
        theta: Bearing in degrees.
        d: Distance, in the same unit as the sphere radius used below.

    Returns:
        ``(latitude, longitude)`` of the destination point, in degrees.
    """
    import math

    # Sphere radius; presumably Earth's mean radius in metres.
    earth_radius = 6371e3
    # Angular distance between the start and destination points.
    angular_dist = d/earth_radius

    bearing = math.radians(theta)
    lat1 = math.radians(fro[0])
    lon1 = math.radians(fro[1])

    sin_lat1, cos_lat1 = math.sin(lat1), math.cos(lat1)
    sin_ad, cos_ad = math.sin(angular_dist), math.cos(angular_dist)

    # Spherical direct (destination-point) formula.
    lat2 = math.asin(sin_lat1 * cos_ad + cos_lat1 * sin_ad * math.cos(bearing))
    lon2 = lon1 + math.atan2(math.sin(bearing) * sin_ad * cos_lat1, cos_ad - sin_lat1 * math.sin(lat2))
    return (math.degrees(lat2), math.degrees(lon2))
# Sanity check: project the same origin 30 units along a bearing and along its opposite.
_origin = (14.82151854, 101.8204055)
print(computePointOnField(_origin, 97.8, 30))
print(computePointOnField(_origin, 97.8+180, 30))


def withinDate(startMonth, startYear, endMonth, endYear, date):
    """Return True when ``date`` lies in the inclusive month range given.

    Args:
        startMonth, startYear: First month of the window (inclusive).
        endMonth, endYear: Last month of the window (inclusive).
        date: String with the year in characters 0-3 and the month in
            characters 5-6, e.g. ``'2022-07'``.

    Returns:
        bool: whether ``(year, month)`` of ``date`` falls within the window.
    """
    year = int(date[0:4])
    month = int(date[5:7])

    # Compare (year, month) pairs lexicographically. The previous if/elif chain
    # tested the start and end bounds independently, so for a window inside a
    # single year every month of that year was accepted.
    return (startYear, startMonth) <= (year, month) <= (endYear, endMonth)
def computeBearing(fro, to):
    """Initial bearing from ``fro`` to ``to``, in degrees within [0, 360).

    Args:
        fro: ``(latitude, longitude)`` of the start point, in degrees.
        to: ``(latitude, longitude)`` of the target point, in degrees.

    Returns:
        Bearing in degrees; 0 for a target due north of the start, 90 for
        a target due east of it.
    """
    import math

    lat1, lon1 = math.radians(fro[0]), math.radians(fro[1])
    lat2, lon2 = math.radians(to[0]), math.radians(to[1])
    d_lon = lon2 - lon1

    # Components of the direction vector; atan2 turns them into an angle.
    y = math.sin(d_lon) * math.cos(lat2)
    x = math.cos(lat1)*math.sin(lat2) - math.sin(lat1)*math.cos(lat2)*math.cos(d_lon)
    theta = math.atan2(y, x)

    # Convert to degrees and wrap negative angles into [0, 360).
    return (theta*180/math.pi + 360) % 360


(14.821481924251044, 101.82068200011909)
(14.821555155418972, 101.82012899978739)


In [None]:
#Split csv file to avoid over memory errors in GEE
import pandas as pd
import os
imagesRoot = '/content/drive/MyDrive/GSV-CropType-Thailand/images/'

csv_file = imagesRoot + 'ViTClassifiedImagesProc.csv'
data = pd.read_csv(csv_file)

# Number of parts to write and the folder (below imagesRoot) that receives them.
N_SPLITS = 8
output_folder = 'preds8splits'
output_dir = imagesRoot + output_folder

# Create the folder if needed; to_csv does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

# Rows per split; the last split additionally takes the remainder.
split_size = len(data) // N_SPLITS

# Split and save
for i in range(N_SPLITS):
    start = i * split_size
    end = None if i == N_SPLITS - 1 else (i + 1) * split_size  # Ensure the last split contains all remaining data
    split_data = data.iloc[start:end]
    split_data.to_csv(os.path.join(output_dir, f'split_{i+1}.csv'), index=False)

print(f"CSV file split into {N_SPLITS} parts and saved in '{output_folder}' folder.")

CSV file split into 8 parts and saved in 'splits' folder.
