In [1]:
import pandas as pd


In [13]:
import pandas as pd
import json
import os
import random # For shuffling and splitting data
from IPython.display import display # For better DataFrame display in notebooks

# Define paths
DATA_DIR = "data" # Ensure this folder exists in your project root
# --- !!! IMPORTANT: Place your disasterIND.csv file inside the 'data' folder !!! ---
CSV_FILE_PATH = os.path.join(DATA_DIR, "disasterIND.csv") 
OUTPUT_TRAIN_FILE = os.path.join(DATA_DIR, "train.jsonl")
OUTPUT_VALID_FILE = os.path.join(DATA_DIR, "validation.jsonl")

# Create data directory if it doesn't exist
os.makedirs(DATA_DIR, exist_ok=True)

print(f"--- Setup ---")
print(f"Looking for CSV at: {CSV_FILE_PATH}")
print(f"Output training data will be saved to: {OUTPUT_TRAIN_FILE}")
print(f"Output validation data will be saved to: {OUTPUT_VALID_FILE}")

--- Setup ---
Looking for CSV at: data\disasterIND.csv
Output training data will be saved to: data\train.jsonl
Output validation data will be saved to: data\validation.jsonl


In [14]:
print(f"\n--- Loading CSV Dataset ---")
try:
    # Specify encoding, check common ones if utf-8 fails (e.g., 'latin1', 'iso-8859-1')
    df_disasters = pd.read_csv(CSV_FILE_PATH, encoding='utf-8') 
    print(f"Successfully loaded {CSV_FILE_PATH}. Shape: {df_disasters.shape}")
    print("\nColumns:", df_disasters.columns.tolist())
    print("\nSample Data (first 5 rows):")
    display(df_disasters.head()) 
except FileNotFoundError:
    print(f"ERROR: CSV file not found at {CSV_FILE_PATH}. Please download it from Kaggle and place it in the '{DATA_DIR}' folder.")
    raise SystemExit("Stopping: CSV file required.")
except Exception as e:
    print(f"Error loading CSV: {e}")
    raise SystemExit(f"Stopping: Error loading CSV - {e}")


--- Loading CSV Dataset ---
Successfully loaded data\disasterIND.csv. Shape: (783, 46)

Columns: ['DisNo.', 'Historic', 'Classification Key', 'Disaster Group', 'Disaster Subgroup', 'Disaster Type', 'Disaster Subtype', 'External IDs', 'Event Name', 'ISO', 'Country', 'Subregion', 'Region', 'Location', 'Origin', 'Associated Types', 'OFDA/BHA Response', 'Appeal', 'Declaration', "AID Contribution ('000 US$)", 'Magnitude', 'Magnitude Scale', 'Latitude', 'Longitude', 'River Basin', 'Start Year', 'Start Month', 'Start Day', 'End Year', 'End Month', 'End Day', 'Total Deaths', 'No. Injured', 'No. Affected', 'No. Homeless', 'Total Affected', "Reconstruction Costs ('000 US$)", "Reconstruction Costs, Adjusted ('000 US$)", "Insured Damage ('000 US$)", "Insured Damage, Adjusted ('000 US$)", "Total Damage ('000 US$)", "Total Damage, Adjusted ('000 US$)", 'CPI', 'Admin Units', 'Entry Date', 'Last Update']

Sample Data (first 5 rows):


Unnamed: 0,DisNo.,Historic,Classification Key,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,External IDs,Event Name,ISO,...,Reconstruction Costs ('000 US$),"Reconstruction Costs, Adjusted ('000 US$)",Insured Damage ('000 US$),"Insured Damage, Adjusted ('000 US$)",Total Damage ('000 US$),"Total Damage, Adjusted ('000 US$)",CPI,Admin Units,Entry Date,Last Update
0,1900-9001-IND,Yes,nat-cli-dro-dro,Natural,Climatological,Drought,Drought,,,IND,...,,,,,,,2.730451,,2006-12-01,2023-09-25
1,1905-0003-IND,Yes,nat-geo-ear-gro,Natural,Geophysical,Earthquake,Ground movement,,,IND,...,,,,,25000.0,847777.0,2.948887,,2003-07-01,2023-09-25
2,1907-0001-IND,Yes,nat-bio-epi-bac,Natural,Biological,Epidemic,Bacterial disease,,Bubonic,IND,...,,,,,,,3.058105,,2003-07-01,2023-09-25
3,1916-0004-IND,Yes,nat-met-sto-tro,Natural,Meteorological,Storm,Tropical cyclone,,,IND,...,,,,,,,3.576717,,2003-07-01,2023-09-25
4,1920-0001-IND,Yes,nat-bio-epi-bac,Natural,Biological,Epidemic,Bacterial disease,,Bubonic,IND,...,,,,,,,6.562784,,2003-07-01,2023-09-25


In [20]:
print("\n--- Exploring and Filtering Data ---")
print("\nExploring Disaster Types:")
# Display more rows if needed: pd.set_option('display.max_rows', None)
print(df_disasters['Disaster Type'].value_counts()) 
# pd.reset_option('display.max_rows')

# Example: Filter for specific disaster types you want to focus on
relevant_types = ['Flood', 'Storm', 'Drought', 'Wildfire', 'Earthquake','Extereme Temprature'] # Adjust this list
df_filtered = df_disasters[df_disasters['Disaster Type'].isin(relevant_types)].copy() # Use .copy() to avoid warnings

# Handle missing values in columns you'll use for prompts (fill with 'Unknown')
# Adjust this list based on columns you actually use in create_training_example_from_csv_row
cols_to_fill = ['Event Name', 'Disaster Group', 'Disaster Type', 'Disaster Subtype', 'Total Deaths', 'Total Damages']
print("\nHandling missing values (filling with 'Unknown')...")
for col in cols_to_fill:
     if col in df_filtered.columns:
          # Convert to string before filling NA to avoid type issues later
          df_filtered[col] = df_filtered[col].astype(str).fillna('Unknown') 
     else:
          print(f"Warning: Column '{col}' not found in CSV during fillna.")


print(f"\nFiltered down to {len(df_filtered)} relevant events.")
if not df_filtered.empty:
     print("\nSample Filtered Data:")
     display(df_filtered.head())
else:
     print("Warning: Filtered DataFrame is empty. No examples will be generated from CSV.")


--- Exploring and Filtering Data ---

Exploring Disaster Types:
Disaster Type
Flood                          325
Storm                          214
Epidemic                        69
Extreme temperature             64
Mass movement (wet)             58
Earthquake                      27
Drought                         16
Wildfire                         4
Glacial lake outburst flood      3
Mass movement (dry)              2
Infestation                      1
Name: count, dtype: int64

Handling missing values (filling with 'Unknown')...

Filtered down to 586 relevant events.

Sample Filtered Data:


Unnamed: 0,DisNo.,Historic,Classification Key,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,External IDs,Event Name,ISO,...,Reconstruction Costs ('000 US$),"Reconstruction Costs, Adjusted ('000 US$)",Insured Damage ('000 US$),"Insured Damage, Adjusted ('000 US$)",Total Damage ('000 US$),"Total Damage, Adjusted ('000 US$)",CPI,Admin Units,Entry Date,Last Update
0,1900-9001-IND,Yes,nat-cli-dro-dro,Natural,Climatological,Drought,Drought,,,IND,...,,,,,,,2.730451,,2006-12-01,2023-09-25
1,1905-0003-IND,Yes,nat-geo-ear-gro,Natural,Geophysical,Earthquake,Ground movement,,,IND,...,,,,,25000.0,847777.0,2.948887,,2003-07-01,2023-09-25
3,1916-0004-IND,Yes,nat-met-sto-tro,Natural,Meteorological,Storm,Tropical cyclone,,,IND,...,,,,,,,3.576717,,2003-07-01,2023-09-25
7,1924-0003-IND,Yes,nat-met-sto-tro,Natural,Meteorological,Storm,Tropical cyclone,,,IND,...,,,,,,,5.61118,,2003-07-01,2023-09-25
8,1925-0004-IND,Yes,nat-met-sto-tro,Natural,Meteorological,Storm,Tropical cyclone,,,IND,...,,,,,,,5.742436,,2003-07-01,2023-09-25


In [29]:
print("\n--- Defining Example Creation Logic ---")

def create_training_example_from_csv_row(row):
    """
    Generates example training pairs (prompt/completion) from a CSV row.
    Returns a LIST of examples, as one row might inspire multiple tasks.
    Requires MANUAL WRITING of the 'output_text' completions.
    """
    examples = []
    
    # --- Extract relevant info from the row ---
    # Use .get() with default to handle potential missing columns gracefully
    disaster_type = row.get('Disaster Type', 'Unknown Type')
    location = row.get('Locationsort', 'Unknown Location')
    start_date = row.get('Start Date', 'Unknown Date') 
    event_name = row.get('Event Names', 'Unnamed Event')
    deaths = row.get('Total Deaths', 'N/A')
    damage_usd = row.get('Total Damages', 'N/A') 

    # --- Task 1: Generate a Concise Event Summary ---
    prompt_summary = f"Based on historical records, provide a brief factual summary of this event:\nEvent Type: {disaster_type}\nLocation: {location}\nStart Date: {start_date}\nDetails: {event_name}\nImpact: Deaths - {deaths}, Damage (USD Est.) - {damage_usd}\n\nSummary:"
    
    # !!! Manual Completion Writing !!! 
    completion_summary = None # Default to None
    # --- Start of your manual logic ---
    if disaster_type == 'Flood' and 'Assam' in location and deaths != 'N/A':
         completion_summary = f"Historical record: Flooding impacted {location}, Assam, starting around {start_date}. The event resulted in {deaths} reported deaths and an estimated damage of {damage_usd} INR. So Kindly Take the necessary steps to protect yourself"
    elif disaster_type == 'Storm' and 'Odisha' in location and event_name != 'Unnamed Event':
         completion_summary = f"Historical record: The storm '{event_name}' affected {location}, Odisha, beginning {start_date}, causing {deaths} deaths and damage estimated at {damage_usd} INR."
    elif disaster_type == 'Earthquake' and deaths != 'N/A':
         completion_summary = f"Historical record: An earthquake occurred in {location} starting {start_date}, described as '{event_name}'. It caused {deaths} deaths and estimated damages of {damage_usd} INR."
    elif disaster_type == 'Landslide' and 'Himalayas' in location and event_name != 'Unnamed Event':
         completion_summary = f"Historical record: A landslide occurred in {location}, Himalayas, during {start_date}, described as '{event_name}'. It resulted in {deaths} deaths and damage estimated at {damage_usd} INR."
    elif disaster_type == 'cylone' and 'Chennai' in location and event_name != 'Unnamed Event':
         completion_summary = f"Historical record: A Cyclone occurred in {location}, Chennai, around {start_date}, described as '{event_name}'. It resulted in {deaths} deaths and damage estimated at {damage_usd} INR. Check for other source for update of Cyclone and Take the necessary steps to protect yourself"
    elif disaster_type == 'Tsunami' and 'Kerala' in location and event_name != 'Unnamed Event':
         completion_summary = f"Historical record: A Tsunami occurred in {location}, Kerala, around {start_date}, described as '{event_name}'. It resulted in {deaths} deaths and damage estimated at {damage_usd} INR. Check for other source for update of Tsunami and Take the"
    else:
         completion_summary = f"Historical record: An event of type '{disaster_type}' occurred in {location}, around {start_date}, described as '{event_name}'. It resulted in {deaths} deaths and damage estimated at {damage_usd} INR. Check for other source for update of {disaster_type} and Take the necessary steps to protect yourself"
    # --- End of your manual logic ---
    
    if completion_summary:
        examples.append({"input_text": prompt_summary, "output_text": completion_summary})
    
    # --- Task 2: Identify Likely Primary Impacts ---
    prompt_impacts = f"For a historical {disaster_type} event in {location}, list the likely primary impacts based on the disaster type."
    
    # !!! Manual Completion Writing !!!
    completion_impacts = None
    # --- Start of your manual logic ---
    if disaster_type == 'Flood' and 'Assam' in location and deaths != 'N/A':
         completion_impacts = f"Historical record: Floods in {location}, Assam, impacted the following primary impacts: Flooded areas, Loss of life, Damage to infrastructure, Property damage, and displacement of people."
    elif disaster_type == 'Storm' and 'Odisha' in location and event_name != 'Unnamed Event':
         completion_impacts = f"Historical record: The storm '{event_name}' impacted the following primary impacts: Storm damage, Loss of life, Property damage, Displacement of people, and Infrastructure damage."
    elif disaster_type == 'Earthquake' and deaths != 'N/A':
         completion_impacts = f"Historical record: An earthquake occurred in {location} that impacted the following primary impacts: Earthquake damage, Loss of life, Property damage, and Infrastructure damage."
    elif disaster_type == 'Landslide' and 'Himalayas' in location and event_name != 'Unnamed Event':
         completion_impacts = f"Historical record: A landslide occurred in {location}, Himalayas, that impacted the following primary impacts: Landslide damage, Loss of life, Property damage, and Infrastructure damage."
    elif disaster_type == 'cylone' and 'Chennai' in location and event_name != 'Unnamed Event':
         completion_impacts = f"Historical record: A cyclone occurred in {location}, Chennai, that impacted the following primary impacts: Cyclone damage, Loss of life, Property damage, and Infrastructure damage."
    elif disaster_type == 'Tsunami' and 'Kerala' in location and event_name != 'Unnamed Event':
         completion_impacts = f"Historical record: A tsunami occurred in {location}, Kerala, that impacted the following primary impacts: Tsunami damage, Loss of life, Property damage, and Infrastructure damage."
    else:
         completion_impacts = f"Historical record: An event of type '{disaster_type}' occurred in {location}, that impacted the following primary impacts: {event_name} damage, Loss of life, Property damage, and Infrastructure damage."
    # --- End of your manual logic ---
    
    if completion_impacts:
        examples.append
    # --- End of your manual logic ---
    
    if completion_summary:
        examples.append({"input_text": prompt_summary, "output_text": completion_summary})
    
   # --- Task 2: Identify Likely Primary Impacts ---
    prompt_impacts = f"For a historical {disaster_type} event in {location}, list the likely primary impacts based on the disaster type."
    
    # !!! Manual Completion Writing !!!
    completion_impacts = None
    # --- Start of your manual logic ---
    if disaster_type == 'Flood':
        completion_impacts = "Primary impacts likely included: displacement of people, damage to homes and infrastructure, loss of crops/livestock, potential water contamination, and disruption of transportation."
    elif disaster_type == 'Storm':
         completion_impacts = "Primary impacts likely included: damage from high winds (trees, buildings, power lines), heavy rainfall causing localized flooding, possible storm surge if coastal, and disruption of essential services."
    elif disaster_type == 'Earthquake':
         completion_impacts = "Primary impacts likely included: ground shaking causing structural damage or collapse, potential injuries/fatalities, landslides in hilly areas, disruption of utilities (water, power, gas), and possible aftershocks."
    elif disaster_type == 'Landslide':
         completion_impacts = "Primary impacts likely included: burial of homes/infrastructure, blocking of roads/rivers, potential injuries/fatalities, and damage to the surrounding environment."
    elif disaster_type == 'Drought':
         completion_impacts = "Primary impacts likely included: water scarcity (drinking, agriculture, industry), crop failure, livestock losses, increased wildfire risk, and potential economic hardship."
    elif disaster_type == 'Extreme temperature': # Assume heatwave
         completion_impacts = "Primary impacts likely included: heat stress illnesses (heat exhaustion, heatstroke), increased mortality especially among vulnerable groups, strain on healthcare and power grids, and potential impact on agriculture/livestock."
    # --- End of your manual logic ---
    
    if completion_impacts:
        examples.append({"input_text": prompt_impacts, "output_text": completion_impacts})
        
        if disaster_type == 'Flood':
             completion_impacts = "Primary impacts likely included: displacement of people, damage to homes and infrastructure, loss of crops/livestock, potential water contamination, and disruption of transportation."
        elif disaster_type == 'Storm':
             completion_impacts = "Primary impacts likely included: damage from high winds (trees, buildings, power lines), heavy rainfall causing localized flooding, possible storm surge if coastal, and disruption of essential services."
        elif disaster_type == 'Earthquake':
             completion_impacts = "Primary impacts likely included: ground shaking causing structural damage or collapse, potential injuries/fatalities, landslides in hilly areas, disruption of utilities (water, power, gas), and possible aftershocks."
        elif disaster_type == 'Landslide':
             completion_impacts = "Primary impacts likely included: burial of homes/infrastructure, blocking of roads/rivers, potential injuries/fatalities, and damage to the surrounding environment."
        elif disaster_type == 'Drought':
             completion_impacts = "Primary impacts likely included: water scarcity (drinking, agriculture, industry), crop failure, livestock losses, increased wildfire risk, and potential economic hardship."
        elif disaster_type == 'Extreme temperature': # Assume heatwave
             completion_impacts = "Primary impacts likely included: heat stress illnesses (heat exhaustion, heatstroke), increased mortality especially among vulnerable groups, strain on healthcare and power grids, and potential impact on agriculture/livestock."

     
    # --- End of your manual logic ---
        
    # --- Task 3: Suggest Preparedness Actions (Generic based on type) ---
    # (Add prompt_prep and completion_prep logic here as in previous example if desired)
    {
         "prompt": "What disaster occurred in Odisha in October 1999?",
         "completion": "A massive cyclone hit Odisha in October 1999, causing over $4.5 billion in damages." 
      }
    return examples



--- Defining Example Creation Logic ---


In [30]:
print("\n--- Generating Examples from CSV Data ---")
all_examples = []
if not df_filtered.empty:
    # Limit processing for testing? Uncomment below:
    # sample_df = df_filtered.sample(n=min(200, len(df_filtered)), random_state=42) 
    sample_df = df_filtered # Process all filtered rows
    
    processed_rows = 0
    for index, row in sample_df.iterrows():
        generated = create_training_example_from_csv_row(row)
        all_examples.extend(generated) 
        processed_rows += 1
        if processed_rows % 100 == 0: # Print progress
             print(f"Processed {processed_rows}/{len(sample_df)} rows...")
             
    print(f"Finished processing CSV. Generated {len(all_examples)} examples from {len(sample_df)} CSV rows.")
else:
    print("Filtered DataFrame is empty, cannot generate examples from CSV.")


--- Generating Examples from CSV Data ---
Processed 100/586 rows...
Processed 200/586 rows...
Processed 300/586 rows...
Processed 400/586 rows...
Processed 500/586 rows...
Finished processing CSV. Generated 1754 examples from 586 CSV rows.


In [31]:
print("\n--- Adding Manually Created Examples ---")
# Add examples that are NOT derived from the CSV, like summarizing actual weather reports
# or generating alerts based on hypothetical scenarios.
manual_examples = [
    {"input_text": "Summarize this forecast: Mumbai - Max Temp 34C, Min Temp 28C. Humidity 85%. Heavy rainfall warning issued for the next 24 hours due to active monsoon conditions. Possibility of waterlogging in low-lying areas.", 
     "output_text": "Heavy Rain Alert for Mumbai: Expect heavy rainfall over the next 24 hours with high humidity (85%) and temperatures between 28-34C. Risk of waterlogging in low-lying areas. Take necessary precautions."},
    {"input_text": "Generate a public safety message for the following heatwave warning: Delhi - Temperatures expected to reach 45-47C for the next 3 days. IMD advises avoiding direct sun exposure between 11 AM and 4 PM.", 
     "output_text": "Public Safety Advisory - Heatwave in Delhi: Extreme heat predicted for the next 3 days (45-47C). Avoid sun exposure between 11 AM - 4 PM. Stay hydrated, wear light clothing, and check on vulnerable individuals."},
    {"input_text": "Context:\n- Current Weather: Heavy rain warning in Chennai, possibility of urban flooding.\n- News: Waterlogging reported in T. Nagar and Velachery.\nUser Query: Is it safe to travel across the city?",
     "output_text": "Travel Advisory for Chennai: Due to heavy rain warnings and reported waterlogging in areas like T. Nagar and Velachery, non-essential travel across the city is discouraged. Check official traffic updates and avoid flooded streets."},
    {"input_text": "What are the safety steps during an earthquake?",
     "output_text": "During an earthquake: DROP to the ground, take COVER under a sturdy table or desk, and HOLD ON until the shaking stops. Stay away from windows, outer walls, and anything that could fall. If outdoors, move to an open area away from buildings and power lines."},
    # --- !!! ADD MANY MORE (Aim for 100+ initially) HIGH-QUALITY MANUAL EXAMPLES !!! ---
    # Cover different disaster types, locations, and tasks (summarize, risks, safety steps, Q&A)
]
all_examples.extend(manual_examples)
print(f"Added {len(manual_examples)} manual examples.")
print(f"Total examples for fine-tuning: {len(all_examples)}")


--- Adding Manually Created Examples ---
Added 4 manual examples.
Total examples for fine-tuning: 1758


In [34]:
print("\n--- Splitting and Saving Data ---")
if all_examples:
    # Shuffle the data thoroughly before splitting
    random.seed(42) # for reproducible splits
    random.shuffle(all_examples)
    
    # Split data (e.g., 90% train, 10% validation)
    # Ensure there's at least 1 validation example if the dataset is tiny
    num_validation = max(1, int(len(all_examples) * 0.1)) 
    if len(all_examples) - num_validation <= 0: # Handle very small datasets
         print("Warning: Dataset too small for validation split. Using all for training.")
         train_data = all_examples
         validation_data = []
    else:
         validation_data = all_examples[:num_validation]
         train_data = all_examples[num_validation:]

    print(f"Final Training set size: {len(train_data)}")
    print(f"Final Validation set size: {len(validation_data)}")

    # Function to save data to JSONL
    def save_to_jsonl(data, filename):
        count = 0
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                for item in data:
                    # Ensure item is a dictionary before dumping
                    if isinstance(item, dict) and "input_text" in item and "output_text" in item:
                         f.write(json.dumps(item, ensure_ascii=False) + '\n')
                         count += 1
                    else:
                         print(f"Warning: Skipping invalid item during save (must be dict with keys): {item}")
            print(f"{count} examples successfully saved to {filename}")
        except Exception as e:
            print(f"ERROR saving file {filename}: {e}")

    # Save the files
    if train_data:
        save_to_jsonl(train_data, OUTPUT_TRAIN_FILE)
    if validation_data:
        save_to_jsonl(validation_data, OUTPUT_VALID_FILE)
    
    # Display some final examples (optional)
    print("\nSample Training Data Entries (First 3):")
    for i in range(min(3, len(train_data))):
        if isinstance(train_data[i], dict):
             print(json.dumps(train_data[i], indent=2, ensure_ascii=False))
        
else:
    print("\nNo training examples were generated or added. Cannot save JSONL files.")



--- Splitting and Saving Data ---
Final Training set size: 1583
Final Validation set size: 175
1583 examples successfully saved to data\train.jsonl
175 examples successfully saved to data\validation.jsonl

Sample Training Data Entries (First 3):
{
  "input_text": "Based on historical records, provide a brief factual summary of this event:\nEvent Type: Flood\nLocation: Unknown Location\nStart Date: Unknown Date\nDetails: Unnamed Event\nImpact: Deaths - 133.0, Damage (USD Est.) - N/A\n\nSummary:",
  "output_text": "Historical record: An event of type 'Flood' occurred in Unknown Location, around Unknown Date, described as 'Unnamed Event'. It resulted in 133.0 deaths and damage estimated at N/A INR. Check for other source for update of Flood and Take the necessary steps to protect yourself"
}
{
  "input_text": "For a historical Storm event in Unknown Location, list the likely primary impacts based on the disaster type.",
  "output_text": "Primary impacts likely included: damage from high 

In [35]:
print("\n--- Notebook Complete ---")
print(f"Review the generated files in the '{DATA_DIR}' folder:")
print(f"- {os.path.basename(OUTPUT_TRAIN_FILE)}")
print(f"- {os.path.basename(OUTPUT_VALID_FILE)}")
print("\nEnsure the format (one JSON per line with 'input_text' and 'output_text') is correct for Vertex AI tuning.")
print("Next steps for fine-tuning Gemini:")
print("1. Create a Google Cloud Project and enable Vertex AI API.")
print(f"2. Create a Google Cloud Storage (GCS) bucket.")
print(f"3. Upload '{os.path.basename(OUTPUT_TRAIN_FILE)}' (and optionally validation file) to the GCS bucket.")
print("4. Go to Vertex AI in the Google Cloud Console -> Model Garden or Generative AI Studio.")
print("5. Find a tunable base model (e.g., 'gemini-1.0-pro-002' - check availability).")
print("6. Start a tuning job, pointing it to your data on GCS.")
print("7. After tuning, deploy the tuned model to a Vertex AI Endpoint.")
print("8. Update your backend API code to call the Vertex AI Endpoint instead of the base Gemini model.")



--- Notebook Complete ---
Review the generated files in the 'data' folder:
- train.jsonl
- validation.jsonl

Ensure the format (one JSON per line with 'input_text' and 'output_text') is correct for Vertex AI tuning.
Next steps for fine-tuning Gemini:
1. Create a Google Cloud Project and enable Vertex AI API.
2. Create a Google Cloud Storage (GCS) bucket.
3. Upload 'train.jsonl' (and optionally validation file) to the GCS bucket.
4. Go to Vertex AI in the Google Cloud Console -> Model Garden or Generative AI Studio.
5. Find a tunable base model (e.g., 'gemini-1.0-pro-002' - check availability).
6. Start a tuning job, pointing it to your data on GCS.
7. After tuning, deploy the tuned model to a Vertex AI Endpoint.
8. Update your backend API code to call the Vertex AI Endpoint instead of the base Gemini model.


In [36]:
%pip install pandas openpyxl

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import json
import os
# Define paths
DATA_DIR = "data"
OUTPUT_TRAIN_FILE = os.path.join(DATA_DIR, "train.jsonl")
OUTPUT_VALID_FILE = os.path.join(DATA_DIR, "validation.jsonl")
# Path to any raw data you might load (e.g., downloaded CSVs, PDF folder)
RAW_DATA_PATH = os.path.join(DATA_DIR, "raw") 

# Create data directory if it doesn't exist
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(RAW_DATA_PATH, exist_ok=True)

print(f"Output training data will be saved to: {OUTPUT_TRAIN_FILE}")
print(f"Output validation data will be saved to: {OUTPUT_VALID_FILE}")

Output training data will be saved to: data\train.jsonl
Output validation data will be saved to: data\validation.jsonl


In [3]:
raw_texts = [
    "IMD Bulletin: Heavy rainfall warning for coastal Kerala for next 48 hours. Fishermen advised not to venture into sea. Strong winds likely.",
    "USGS Report: Magnitude 4.8 earthquake recorded 50km NE of Shimla, Himachal Pradesh. Depth 10km. No immediate damage reports.",
    "News Headline: Flash floods trigger landslides in Uttarakhand, several roads blocked. Rescue operations underway.",
    "Forecast Discussion: Monsoon trough active over central India. Expect widespread moderate rainfall with isolated heavy spells over MP and Maharashtra.",
    "Heatwave conditions persist over Rajasthan and Gujarat, with temperatures exceeding 45C in many places. Avoid sun exposure."
]
print(f"Loaded {len(raw_texts)} raw text snippets (example).")


Loaded 5 raw text snippets (example).


In [5]:
def create_gemini_tuning_example(raw_text, task_type="summarize"):
    """Creates a prompt/completion pair for Gemini fine-tuning."""
    
    if task_type == "summarize":
        prompt = f"Summarize the key information from this report concisely for a public alert:\n\nReport:\n{raw_text}\n\nSummary:"
        # --- !!! MANUALLY WRITE THE IDEAL SUMMARY HERE based on raw_text !!! ---
        if "Kerala" in raw_text:
             completion = "Heavy Rain & Wind Warning: Coastal Kerala expects heavy rain & strong winds for 48 hrs. Fishermen advised stay ashore."
        elif "Shimla" in raw_text:
             completion = "Earthquake Update: Magnitude 4.8 quake hit near Shimla, HP (Depth 10km). No immediate damage reported. Monitor updates."
        elif "Uttarakhand" in raw_text:
             completion = "Flood & Landslide Alert: Flash floods & landslides block roads in Uttarakhand. Rescue ops ongoing. Avoid affected areas."
        elif "Monsoon trough" in raw_text:
             completion = "Monsoon Rain Forecast: Widespread moderate rain, with isolated heavy spells, expected over Madhya Pradesh & Maharashtra due to active monsoon trough."
        elif "Heatwave" in raw_text:
             completion = "Heatwave Warning: Persistent heatwave in Rajasthan & Gujarat (>45C). Avoid sun exposure, stay hydrated."
        else:
             completion = "Summary not available for this example." # Placeholder
             
    elif task_type == "identify_risks":
        prompt = f"Identify the main risks mentioned or implied in this report:\n\nReport:\n{raw_text}\n\nRisks:"
        # --- !!! MANUALLY WRITE THE IDEAL RISK LIST HERE based on raw_text !!! ---
        if "Kerala" in raw_text:
             completion = "- Flooding due to heavy rain\n- Damage from strong winds\n- Danger to fishermen at sea"
        elif "Shimla" in raw_text:
             completion = "- Potential aftershocks\n- Possible minor structural damage (though none reported yet)"
        elif "Uttarakhand" in raw_text:
             completion = "- Danger from flash floods\n- Risk of being trapped due to landslides/road blockages"
        # ... add more completions for other examples ...
        else:
             completion = "Risk identification not available for this example."

    # Add more task types as needed (e.g., 'generate_alert_tweet', 'answer_faq')
    
    else: # Default or unknown task
        return None

    # Return in the format needed for Vertex AI tuning (check docs)
    return {"input_text": prompt, "output_text": completion}

In [10]:
training_examples = []
for text in raw_texts:
    # Create different types of examples from the same raw text
    summary_example = create_gemini_tuning_example(text, task_type="summarize")
    risk_example = create_gemini_tuning_example(text, task_type="identify_risks")
    
    if summary_example and summary_example["output_text"] != "Summary not available for this example.":
        training_examples.append(summary_example)
    if risk_example and risk_example["output_text"] != "Risk identification not available for this example.":
         training_examples.append(risk_example)

    print(f"Generated {len(training_examples)} training examples.")
if training_examples:
    print("\nSample Example:")
    print(json.dumps(training_examples[0], indent=2))

Generated 2 training examples.
Generated 4 training examples.
Generated 6 training examples.
Generated 7 training examples.
Generated 8 training examples.

Sample Example:
{
}


In [11]:
if training_examples:
    df = pd.DataFrame(training_examples)
    
    # Simple split for demo: 80% train, 20% validation
    validation_df = df.sample(frac=0.2, random_state=42) # Use frac=0.2 for 20% validation
    train_df = df.drop(validation_df.index)

    print(f"\nTraining set size: {len(train_df)}")
    print(f"Validation set size: {len(validation_df)}")

    # Save to JSONL format (one JSON object per line)
    train_df.to_json(OUTPUT_TRAIN_FILE, orient='records', lines=True)
    validation_df.to_json(OUTPUT_VALID_FILE, orient='records', lines=True)

    print(f"\nTraining data saved to {OUTPUT_TRAIN_FILE}")
    print(f"Validation data saved to {OUTPUT_VALID_FILE}")
else:
    print("\nNo training examples generated, skipping save.")



Training set size: 6
Validation set size: 2

Training data saved to data\train.jsonl
Validation data saved to data\validation.jsonl
