# Test of Google Drive Fetcher

In [17]:
from google_drive_fetcher import GoogleDriveFetcher 

gdf = GoogleDriveFetcher()
folder_url = "https://drive.google.com/drive/folders/1PKdRw01LZYVkb0a2ok--oBuudSFepyQU?hl=da"

In [18]:
gdf.list_directory(folder_url=folder_url)

[{'id': '1YPvWsE2kAw1PZ-_S_nIl1dj6PMyzozDe',
  'name': 'aisdk-2023-01-01.parquet Shared folder',
  'type': 'file',
  'url': 'https://drive.google.com/file/d/1YPvWsE2kAw1PZ-_S_nIl1dj6PMyzozDe/view'},
 {'id': '1NlnKO8D7KLrD8v6UeQm8wIWm4GJ8nPmZ',
  'name': 'aisdk-2023-01-02.parquet Shared folder',
  'type': 'file',
  'url': 'https://drive.google.com/file/d/1NlnKO8D7KLrD8v6UeQm8wIWm4GJ8nPmZ/view'},
 {'id': '1R8PD_f-tJXY4MrdcURvIeWMGdMkJCUOc',
  'name': 'aisdk-2023-01-03.parquet Shared folder',
  'type': 'file',
  'url': 'https://drive.google.com/file/d/1R8PD_f-tJXY4MrdcURvIeWMGdMkJCUOc/view'},
 {'id': '1k9q1MENTiXgvADJfvLhAF42qrcJw-vVI',
  'name': 'aisdk-2023-01-04.parquet Shared folder',
  'type': 'file',
  'url': 'https://drive.google.com/file/d/1k9q1MENTiXgvADJfvLhAF42qrcJw-vVI/view'},
 {'id': '1QkDzyqItIiPm6F7uat2blOqRXcvKQyUz',
  'name': 'aisdk-2023-01-05.parquet Shared folder',
  'type': 'file',
  'url': 'https://drive.google.com/file/d/1QkDzyqItIiPm6F7uat2blOqRXcvKQyUz/view'},
 {'id

# Example: Download Parquet File and Convert to DataFrame

The `GoogleDriveFetcher` class provides two ways to convert parquet files to pandas DataFrames:

1. **Direct conversion from URL**: `fetch_parquet_to_dataframe(url)` - Downloads and converts in one step
2. **Download from folder**: `download_parquet_from_folder(folder_url, filename, return_dataframe=True)` - Finds and converts a specific file in a folder

In [19]:
# First, let's see what files are actually in the folder
folder_url = "https://drive.google.com/drive/folders/1v4zupQQfvN_7ueft265XDrCrZj6fTRZR?hl=da"

print("Listing files in the folder...")
items = gdf.list_directory(folder_url)
print(f"\nFound {len(items)} items:\n")

# List all parquet FILES (not folders)
parquet_files = [item for item in items if item['name'].endswith('.parquet') and item['type'] == 'file']
print(f"Parquet files ({len(parquet_files)}):")
for item in parquet_files:
    print(f"  - {item['name']}")

# List folders (including those ending with .parquet)
folders = [item for item in items if item['type'] == 'folder']
print(f"\nFolders ({len(folders)}):")
for item in folders:
    print(f"  - {item['name']}")

Listing files in the folder...

Found 1 items:

Parquet files (0):

Folders (0):

Found 1 items:

Parquet files (0):

Folders (0):


In [20]:
# Navigate into the .parquet folders to find the actual files
print("\n" + "="*60)
print("Looking inside .parquet folders...")
print("="*60)

all_parquet_files = []

for folder in folders:
    if '.parquet' in folder['name']:
        print(f"\nüìÅ Exploring folder: {folder['name']}")
        try:
            # Navigate into the folder
            folder_contents = gdf.list_directory(folder['url'])
            
            # Find parquet files in this folder
            files_in_folder = [item for item in folder_contents if item['type'] == 'file' and item['name'].endswith('.parquet')]
            
            print(f"   Found {len(files_in_folder)} parquet file(s):")
            for file in files_in_folder:
                print(f"      - {file['name']}")
                all_parquet_files.append({
                    'name': file['name'],
                    'url': file['url'],
                    'folder': folder['name']
                })
        except Exception as e:
            print(f"   Error accessing folder: {e}")

print(f"\n{'='*60}")
print(f"Total parquet files found across all folders: {len(all_parquet_files)}")
print(f"{'='*60}")


Looking inside .parquet folders...

Total parquet files found across all folders: 0


In [21]:
# Now download a parquet file from inside the folders
if all_parquet_files:
    # Get the first file
    file_info = all_parquet_files[0]
    print(f"\nDownloading '{file_info['name']}' from folder '{file_info['folder']}'...")
    
    try:
        # Use the direct URL method since we have it
        df = gdf.fetch_parquet_to_dataframe(file_info['url'])
        
        print(f"\n‚úì Successfully loaded DataFrame!")
        print(f"Shape: {df.shape} (rows: {df.shape[0]}, columns: {df.shape[1]})")
        print(f"\nColumns: {list(df.columns)}")
        print(f"\nFirst few rows:")
        print(df.head())
        print(f"\nData types:")
        print(df.dtypes)
        
    except Exception as e:
        print(f"Error: {e}")
else:
    print("\nNo parquet files found in any folders!")


No parquet files found in any folders!


In [16]:
# Now let's download the first parquet file we find
if parquet_files:
    filename = parquet_files[0]['name']  # Use the actual filename from the folder
    print(f"\n\nDownloading '{filename}' and converting to DataFrame...")
    
    try:
        df = gdf.download_parquet_from_folder(
            folder_url=folder_url,
            file_name=filename,
            return_dataframe=True
        )
        
        if df is not None:
            print(f"\n‚úì Successfully loaded DataFrame!")
            print(f"Shape: {df.shape} (rows: {df.shape[0]}, columns: {df.shape[1]})")
            print(f"\nColumns: {list(df.columns)}")
            print(f"\nFirst few rows:")
            print(df.head())
            print(f"\nData types:")
            print(df.dtypes)
        else:
            print(f"File '{filename}' not found in folder")
            
    except Exception as e:
        print(f"Error: {e}")
else:
    print("\nNo parquet files found in the folder!")


No parquet files found in the folder!


In [22]:
# Alternative Method: If you know the exact filename, use it directly
# (Make sure to run the listing cell above first to get the actual filename)

# Example: Replace 'your-file.parquet' with an actual filename from the listing above
specific_filename = "17dd8e3b4a914443bb1cb5d0ea850a61-0.parquet"  # Update this with actual filename

try:
    print(f"Attempting to download '{specific_filename}'...")
    df_specific = gdf.download_parquet_from_folder(
        folder_url=folder_url,
        file_name=specific_filename,
        return_dataframe=True
    )
    
    if df_specific is not None:
        print(f"‚úì Successfully loaded!")
        print(f"Shape: {df_specific.shape}")
    else:
        print(f"‚ö† File '{specific_filename}' not found. Check the listing above for available files.")
        
except Exception as e:
    print(f"Error: {e}")

Attempting to download '17dd8e3b4a914443bb1cb5d0ea850a61-0.parquet'...
‚ö† File '17dd8e3b4a914443bb1cb5d0ea850a61-0.parquet' not found. Check the listing above for available files.
‚ö† File '17dd8e3b4a914443bb1cb5d0ea850a61-0.parquet' not found. Check the listing above for available files.


In [10]:
# Method 2: Direct conversion from file URL
# First, get the file URL from the directory listing (uses 'items' from above)
if parquet_files:
    file_url = parquet_files[0]['url']
    filename_direct = parquet_files[0]['name']
    
    try:
        print(f"\nUsing direct URL conversion for '{filename_direct}'...")
        df2 = gdf.fetch_parquet_to_dataframe(file_url)
        print(f"‚úì Successfully loaded DataFrame directly!")
        print(f"Shape: {df2.shape}")
        print(f"\nFirst 3 rows:")
        print(df2.head(3))
    except Exception as e:
        print(f"Error: {e}")
else:
    print("No parquet files found in the folder!")

No parquet files found in the folder!


## Alternative: Download to Disk Instead

If you want to save the parquet file to disk instead of loading it directly to memory, set `return_dataframe=False`:

In [4]:
# Download to disk, then load with pandas
import pandas as pd

if parquet_files:
    filename_disk = parquet_files[0]['name']
    
    saved_path = gdf.download_parquet_from_folder(
        folder_url=folder_url,
        file_name=filename_disk,
        return_dataframe=False  # Save to ./downloads/ directory
    )
    
    if saved_path:
        print(f"File saved to: {saved_path}")
        # Load it manually with pandas
        df3 = pd.read_parquet(saved_path)
        print(f"Loaded DataFrame from disk with shape: {df3.shape}")
    else:
        print("File not found")
else:
    print("No parquet files available to download")

File not found


# Simple Test: Access Parquet File from Nested Directories

This test navigates through subdirectories to find and load parquet files.

In [23]:
# Simple function to find and load the first parquet file
def find_and_load_first_parquet(base_url, max_depth=3, current_depth=0):
    """
    Recursively search through directories to find the first parquet file.
    
    Args:
        base_url: Google Drive folder URL
        max_depth: Maximum depth to search (default 3)
        current_depth: Current recursion depth
    
    Returns:
        pandas DataFrame or None
    """
    if current_depth >= max_depth:
        return None
    
    print("  " * current_depth + f"üîç Searching at depth {current_depth}...")
    
    try:
        items = gdf.list_directory(base_url)
        
        # First, check for parquet files at this level
        for item in items:
            if item['type'] == 'file' and item['name'].endswith('.parquet'):
                print("  " * current_depth + f"‚úì Found: {item['name']}")
                return gdf.fetch_parquet_to_dataframe(item['url'])
        
        # If no files found, search subdirectories
        for item in items:
            if item['type'] == 'folder':
                print("  " * current_depth + f"üìÅ Entering: {item['name']}")
                result = find_and_load_first_parquet(item['url'], max_depth, current_depth + 1)
                if result is not None:
                    return result
        
        return None
    except Exception as e:
        print("  " * current_depth + f"‚ùå Error: {e}")
        return None


# Test with your parent directory
parent_url = "https://drive.google.com/drive/folders/1ZPUA8ZUcVKuJWJGo69OaD6W1al8IbDyi?hl=da"

print("Starting search for parquet files...\n")
df_test = find_and_load_first_parquet(parent_url)

if df_test is not None:
    print("\n" + "="*60)
    print("‚úì SUCCESS! Loaded DataFrame")
    print("="*60)
    print(f"Shape: {df_test.shape}")
    print(f"Columns: {list(df_test.columns)}")
    print(f"\nFirst 5 rows:")
    print(df_test.head())
else:
    print("\n‚ùå No parquet files found")

Starting search for parquet files...

üîç Searching at depth 0...

‚ùå No parquet files found

‚ùå No parquet files found
