In [75]:
!pip uninstall -qqy jupyterlab  # Remove unused conflicting packages
!pip install -U -q "google-genai==1.7.0"

[0m

In [76]:
from google import genai
from google.genai import types

genai.__version__

'1.7.0'

In [77]:
import os, json, uuid, time
import pandas as pd
from tqdm import tqdm

In [78]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
gcp_key = user_secrets.get_secret("gcp_credentials")
GOOGLE_API_KEY = user_secrets.get_secret("GOOGLE_API_KEY")
client = genai.Client(api_key=GOOGLE_API_KEY)

In [79]:
for model in client.models.list():
    if "createTunedModel" in model.supported_actions:
        print(model.name)

models/gemini-1.5-flash-001-tuning


In [80]:
with open("/tmp/gcp_key.json", "w") as f:
    f.write(gcp_key)

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/tmp/gcp_key.json"

# Set your GCP project and region
PROJECT_ID = "ai-detection-457406"
REGION = "us-central1"
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID

from google.cloud import aiplatform
from google.cloud import storage

In [81]:
import pandas as pd

# Load your dataset
df = pd.read_csv("/kaggle/input/ai-vs-human-text/AI_Human.csv")
df.head()
# Normalize column names
df.columns = [col.strip().lower() for col in df.columns]

# Convert label to readable format
df["generated"] = df["generated"].apply(lambda x: "ai" if x == 1.0 else "human")
df["text"] = df["text"].astype(str).str.strip()
df = df[["text", "generated"]]


# Estimate max samples per class (≈ 35%)
ai_count = df[df["generated"] == "ai"].shape[0]
human_count = df[df["generated"] == "human"].shape[0]
target_total = int(0.35 * (ai_count + human_count))
target_per_class = target_total // 2

print(f"Target per class: {target_per_class} samples")

# Sample balanced set
df_sampled = (
    df.groupby("generated")
    .apply(lambda x: x.sample(min(len(x), target_per_class), random_state=42))
    .reset_index(drop=True)
)

print(f"✅ Final sample size: {len(df_sampled)} rows")
df["generated"].value_counts()
df_sampled["generated"].value_counts()


Target per class: 85266 samples


  .apply(lambda x: x.sample(min(len(x), target_per_class), random_state=42))


✅ Final sample size: 170532 rows


generated
ai       85266
human    85266
Name: count, dtype: int64

In [82]:
import pandas as pd

# 1. See which labels appear
print(df['generated'].value_counts())
# e.g. 
# human    1200
# ai         45
# Name: generated, dtype: int64

# 2. Check whether any “ai” rows exist
has_ai = df['generated'].eq('ai').any()
print("Any AI‑generated text?", has_ai)

# 3. How many?
ai_count = df['generated'].eq('ai').sum()
print(f"Found {ai_count} AI‑generated examples out of {len(df)} total.")


generated
human    305797
ai       181438
Name: count, dtype: int64
Any AI‑generated text? True
Found 181438 AI‑generated examples out of 487235 total.


In [83]:
df_sampled["generated"].unique()

array(['ai', 'human'], dtype=object)

In [84]:
ai_rows = df_sampled[df_sampled.get("generated") == "ai"]
if ai_rows.empty:
    print("No AI-generated rows found in your dataset.")
else:
    # 2b.1: Sample a random AI-generated row
    random_ai_row = ai_rows.sample(n=1).iloc[0]
    print("Random AI-generated row:\n", random_ai_row)

    # 2b.2: Retrieve a specific AI-generated row by position
    row_index = 0  # change index as needed (0-based)
    specific_ai_row = ai_rows.iloc[row_index]
    print(f"\nAI-generated row at position {row_index}:\n", specific_ai_row)


Random AI-generated row:
 text         Title: Car-Free Cities: The Future or a Fantas...
generated                                                   ai
Name: 23142, dtype: object

AI-generated row at position 0:
 text         I believe that online classes and video confer...
generated                                                   ai
Name: 0, dtype: object


In [91]:
from collections.abc import Iterable
import random

import random

max_epochs = 1
max_steps = 250_000
max_examples = max_steps // max_epochs

tuning_examples = [
    types.TuningExample(
        text_input=row["text"],
        output=(
            "This text was generated by AI."
            if row["generated"] == "ai"
            else "This text was written by a human."
        )
    )
    for _, row in df_sampled.iterrows()
]

# 2. Wrap in a TuningDataset
training_dataset = types.TuningDataset(examples=tuning_examples)




# If you are re-running this lab, add your model_id here.
model_id = None

# Or try and find a recent tuning job.
if not model_id:
  queued_model = None
  # Newest models first.
  for m in reversed(client.tunings.list()):
    # Only look at newsgroup classification models.
    if m.name.startswith('tunedModels/newsgroup-classification-model'):
      # If there is a completed model, use the first (newest) one.
      if m.state.name == 'JOB_STATE_SUCCEEDED':
        model_id = m.name
        print('Found existing tuned model to reuse.')
        break

      elif m.state.name == 'JOB_STATE_RUNNING' and not queued_model:
        # If there's a model still queued, remember the most recent one.
        queued_model = m.name
  else:
    if queued_model:
      model_id = queued_model
      print('Found queued model, still waiting.')


# Upload the training data and queue the tuning job.
if not model_id:
    try:
        print("🚀 Launching tuning job (non-blocking)...")
        tuning_op = client.tunings.tune(
            base_model="models/gemini-1.5-flash-001",
            training_dataset=training_dataset,
            config=types.CreateTuningJobConfig(
                tuned_model_display_name=f"ai-detector-{int(time.time())}",
                batch_size=4,
                epoch_count=1,
            ),
        )
        model_id = tuning_op.name
        print(f"✅ Job launched! Model ID: {model_id}")
    except Exception as e:
        print("❌ Failed to launch tuning job:")
        print(e)

print(model_id)

🚀 Launching tuning job (non-blocking)...
❌ Failed to launch tuning job:
400 FAILED_PRECONDITION. {'error': {'code': 400, 'message': 'models/gemini-1.5-flash-001 is not found for CREATE TUNED MODEL at API version v1beta.', 'status': 'FAILED_PRECONDITION'}}
None


In [89]:
for job in client.tunings.list(config={'page_size': 50}):
    print(job.name, job.state.name, job.tuned_model.model if job.tuned_model else None)


tunedModels/aidetectormodel-qv7hrundwrke JOB_STATE_SUCCEEDED tunedModels/aidetectormodel-qv7hrundwrke


In [None]:
from google.genai import types

job_name = "tunedModels/aidetectormodel-qv7hrundwrke"  # from your client.tunings.tune(...) call

# Fetch its latest status
status = client.tunings.get(name=job_name)
print(status.state.name)  # e.g. “JOB_STATE_RUNNING”, then eventually “JOB_STATE_SUCCEEDED”


In [None]:
tuned_model_name = status.tuned_model.model
print("Ready model:", tuned_model_name)
# e.g. "tunedModels/az2mb0bpw6i"

In [None]:
response = client.models.generate_content(
    model=tuned_model_name,
    contents='''genertaed by ai: Cars are one of the major contributors to environmental pollution, releasing harmful emissions that significantly impact air quality and contribute to climate change. The combustion of fossil fuels in car engines produces pollutants such as carbon dioxide (CO2), nitrogen oxides (NOx), particulate matter, and volatile organic compounds (VOCs). CO2, a greenhouse gas, is the primary driver of global warming, while NOx and particulate matter can cause respiratory issues and other health problems. Additionally, the production and disposal of vehicles generate waste and consume vast amounts of resources. The growing adoption of electric vehicles and advancements in clean energy technologies are essential steps toward reducing the environmental footprint of transportation and protecting our planet for future generations.''',
)
print(response.text)