In [None]:
!pip install google-cloud-aiplatform
!pip install --user datasets
!pip install --user google-cloud-pipeline-components

In [None]:
import IPython

# Ask the running kernel to shut down with restart=True.
# Presumably done so the packages installed by the pip cell become importable.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

In [None]:
import IPython
from google.cloud import aiplatform
from google.colab import auth as google_auth
# Run the Colab authentication flow for the current user; this import only
# exists inside Google Colab, so the cell is Colab-specific.
google_auth.authenticate_user()

In [None]:
import vertexai
# Google Cloud project used by the Vertex AI SDK; replace the placeholder.
PROJECT_ID = "YOUR_PROJECT_ID"
# Initialise the SDK with the project only. A later cell calls
# vertexai.init again with both project and location.
vertexai.init(project=PROJECT_ID)

In [None]:
# Region for Vertex AI resources. The literal is written once and the
# lowercase spelling is kept as an alias of the same value.
REGION = "us-central1"
region = REGION
# Project ID interpolated into the gcloud command below; replace the
# placeholder and keep it identical to PROJECT_ID.
project_id = "YOUR_PROJECT_ID"

In [None]:
# Set the gcloud CLI's active project to the notebook's `project_id` value
# (IPython interpolates {project_id} into the shell command).
! gcloud config set project {project_id}

In [None]:

# Lower TensorFlow's native log verbosity and silence Python warnings *before*
# the third-party imports below, so that import-time output is suppressed too.
import os
import warnings

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings('ignore')

# Standard library
import json
import sys
import uuid

# Third-party (each module imported exactly once)
import kfp
import pandas as pd
import vertexai
from datasets import load_dataset
from google.auth import default
from google.cloud import aiplatform
from vertexai.preview.language_models import TextGenerationModel, EvaluationTextSummarizationSpec

# Initialise the Vertex AI SDK with both project and region; PROJECT_ID and
# REGION come from the configuration cells above.
vertexai.init(project=PROJECT_ID, location=REGION)

In [None]:
# Name of the Cloud Storage bucket holding the training data; replace the
# placeholder with the real bucket name.
BUCKET_NAME = 'YOUR_BUCKET'
# gs:// URI of the training file. Built from BUCKET_NAME so the bucket only
# has to be changed in one place (the value is a file URI despite the name).
BUCKET_URI = f"gs://{BUCKET_NAME}/TRAIN.jsonl"
REGION = "us-central1"

Option 1: if the training data is stored in Google Drive mounted in Colab, load it with the following cell.

In [None]:
# Read the JSON Lines training file (one JSON record per line) from the
# Drive path under /content/drive into a DataFrame.
df = pd.read_json(
    path_or_buf="/content/drive/MyDrive/Training_Data_BTP/train.jsonl",
    lines=True,
)
# Show the first rows as the cell output.
df.head()

Option 2: if the training data is stored in the Cloud Storage bucket, load it with the following cell instead.


In [None]:
# HTTPS URL of the training file in the Cloud Storage bucket; replace the
# YOUR_BUCKET placeholder.
# NOTE(review): reading this URL directly presumably requires the object to
# be readable without credentials — confirm.
json_url = 'https://storage.googleapis.com/YOUR_BUCKET/TRAIN.jsonl'
df = pd.read_json(json_url, lines=True)
# Preview the first rows with the notebook's rich table display, matching the
# Drive-loading cell, instead of printing the frame as plain text.
df.head()

In [None]:
# Sanity check: print the (rows, columns) shape of the loaded training frame.
print(df.shape)

In [None]:
# Display name to give the tuned model.
model_display_name = 'cpg_to_ud_finetunned_model'

# NOTE(review): "gemini-2.0-flash-lite-001" is a Gemini model ID, while
# TextGenerationModel is the SDK's text-generation (PaLM-style) model class —
# confirm that this base model can be loaded and tuned through this API.
tuned_model = TextGenerationModel.from_pretrained("gemini-2.0-flash-lite-001")

# Launch supervised tuning on the DataFrame loaded above.
tuned_model.tune_model(
    training_data=df,
    train_steps=100,
    tuning_job_location="us-central1",
    tuned_model_location="us-central1",
    # Without this argument the display name defined above is never used.
    model_display_name=model_display_name,
)

In [None]:
# Read the resource name of the tuned model from the endpoint attached to
# `tuned_model`, then load it as a separate handle used for prediction.
# NOTE(review): `_endpoint` is a private SDK attribute, and this assumes the
# tuned model is the first deployed model on that endpoint — confirm.
tuned_model_name = tuned_model._endpoint.gca_resource.deployed_models[0].model
tuned_model_1 = TextGenerationModel.get_tuned_model(tuned_model_name)
# Smoke test: send one sample <Sentence> block as the prompt and print the
# text of the model's reply.
response = tuned_model_1.predict("""<Sentence id='1'>
1	((	NP	<fs name='NP' drel='r6:NP2'>
1.1	करीब	RP_RPD	<fs af='करीब,avy,,,,,,' name='करीब' posn='10'>
1.2	40,000	QT_QTC	<fs af='40000,num,any,any,,any,,' name='40,000' posn='20'>
1.3	अंडों	N_NN	<fs af='अंड,n,m,pl,3,o,0,0' name='अंडों' posn='30'>
1.4	का	PSP	<fs af='का,psp,m,sg,,d,,' name='का' posn='40'>
	))
2	((	NP	<fs name='NP2' drel='k1:VGF'>
2.1	वजन	N_NN	<fs af='वजन,n,m,sg,3,d,0,0' name='वजन' posn='50'>
	))
3	((	NP	<fs name='NP3' drel='k1u:VGF'>
3.1	केवल	RP_RPD	<fs af='केवल,avy,,,,,,' name='केवल' posn='60'>
3.2	एक	QT_QTC	<fs af='एक,num,any,any,,any,,' name='एक' posn='70'>
3.3	आउंस	N_NN	<fs af='आउंस,n,m,sg,3,o,0,0' name='आउंस' posn='80'>
3.4	के	PSP	<fs af='के,psp,,,,,,' name='के' posn='90'>
3.5	लगभग	RP_RPD	<fs af='लगभग,avy,,,,,,' name='लगभग' posn='100'>
	))
4	((	VGF	<fs name='VGF' voicetype='active' stype='declarative'>
4.1	होता	V_VM	<fs af='हो,v,m,sg,any,,ता,wA' name='होता' posn='110'>
4.2	है	V_VAUX	<fs af='है,v,any,sg,3,,0,0' name='है' posn='120'>
	))
5	((	BLK	<fs name='BLK' drel='rsym:VGF'>
5.1	।	RD_PUNC	<fs af='।,punc,,,,,,' name='।' posn='130'>
	))
</Sentence>""")
print(response.text)

Run the tuned model on every sentence in the CPG files and write the predicted UD output to a CoNLL-U file.

In [None]:

import os

def get_cpg_files_from_directory(root_dir, extensions=None):
    """Recursively collect the paths of all files below ``root_dir``.

    Args:
        root_dir: Directory to walk. A path that does not exist yields an
            empty list (``os.walk`` produces nothing for it).
        extensions: Optional file-name suffix, or iterable of suffixes
            (e.g. ``(".txt", ".dat")``), compared case-insensitively. When
            given, only matching files are returned; the default ``None``
            keeps every file.

    Returns:
        A list of file paths in sorted order. ``os.walk`` visits entries in
        arbitrary order, so sorting keeps the result (and everything derived
        from it) reproducible across runs and machines.
    """
    if extensions is None:
        suffixes = None
    elif isinstance(extensions, str):
        suffixes = (extensions.lower(),)
    else:
        suffixes = tuple(ext.lower() for ext in extensions)

    cpg_files = []
    for root, _, files in os.walk(root_dir):
        for file in files:
            if suffixes is None or file.lower().endswith(suffixes):
                cpg_files.append(os.path.join(root, file))
    cpg_files.sort()
    return cpg_files
def parse_cpg_blocks(cpg_files):
    """Extract every ``<Sentence id=...> ... </Sentence>`` block from the files.

    Each file is read as UTF-8, line by line. Trailing whitespace is removed
    from every line, and the lines of one sentence (opening and closing tag
    lines included) are joined with newlines into a single string.

    Args:
        cpg_files: Iterable of file paths to read.

    Returns:
        A list with one string per complete sentence, in file order and then
        in order of appearance. A sentence that is never closed is dropped; a
        closing tag with no open sentence is ignored.
    """
    all_blocks = []
    for cpg_path in cpg_files:
        with open(cpg_path, 'r', encoding='utf-8') as f:
            block = []
            for line in f:
                stripped = line.rstrip()
                if "<Sentence id=" in stripped:
                    # A new opening tag always starts a fresh block.
                    block = [stripped]
                elif block:
                    block.append(stripped)
                # Emit only when a sentence is actually open, so a stray
                # closing tag no longer produces a bogus one-line block. This
                # check is separate from the branches above so a sentence
                # opened and closed on the same line is emitted as well.
                if "</Sentence>" in stripped and block:
                    all_blocks.append("\n".join(block))
                    block = []
    return all_blocks

# --- Configuration -----------------------------------------------------------
# Directory that is searched recursively for CPG input files.
cpg_root_directory = "/content/drive/MyDrive/Training_Data_BTP/HINDI-DEPENDENCY-ALL-DOMAINS-LATEST/Data/"
# File that receives the model output, one block per sentence.
OUTPUT_CONLLU     = "/content/drive/MyDrive/Training_Data_BTP/predictions.conllu"

# --- Collect and parse the input ---------------------------------------------
cpg_files = get_cpg_files_from_directory(cpg_root_directory)
cpg_blocks = parse_cpg_blocks(cpg_files)
print(f"Parsed {len(cpg_blocks)} sentences from CPG files.")

# --- Predict sentence by sentence --------------------------------------------
# 1-based indices of sentences for which no usable prediction was obtained.
failed_sentences = []

with open(OUTPUT_CONLLU, "w", encoding="utf-8") as fout:
    for idx, cpg in enumerate(cpg_blocks, 1):
        prompt = cpg.strip() + "\n\n"

        # One failing request must not abort a run over the whole corpus:
        # report it, remember the index, and carry on with the next sentence.
        try:
            response = tuned_model_1.predict(prompt)
            ud_conllu = response.text.strip()
        except Exception as exc:
            failed_sentences.append(idx)
            print(f"  ! Sentence {idx} failed: {exc!r}")
            continue

        # An empty reply would otherwise be written as a bare blank block and
        # silently hide the missing sentence.
        if not ud_conllu:
            failed_sentences.append(idx)
            print(f"  ! Sentence {idx} returned an empty prediction")
            continue

        fout.write(ud_conllu + "\n\n")
        # Flush after every sentence so finished work is on disk even if the
        # session is interrupted.
        fout.flush()

        if idx % 50 == 0:
            print(f"  • Processed {idx}/{len(cpg_blocks)} sentences")

if failed_sentences:
    print(f"{len(failed_sentences)} sentence(s) had no prediction written: {failed_sentences}")

print(f"\n All done — UD output in: {OUTPUT_CONLLU}")
