In [10]:
import os
from together import Together

# I have set my api key on linux with this following command in terminal.
# export TOGETHER_API_KEY=xxxxxxxxxxxxxx
# then started jupyter notebook through the same terminal
API_key = os.environ.get("TOGETHER_API_KEY")

In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK data if not already available
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Path to the file
file_path = r'extracted_passage.txt'

# 1. Read the file and store the content as a single string
with open(file_path, 'r') as file:
    file_content = file.read()

# 2. Convert the text to lowercase
file_content = file_content.lower()

# 3. Remove all punctuation and special characters using regex (keeping words and spaces only)
file_content = re.sub(r'[^\w\s]', '', file_content)

# 4. Remove numbers (e.g., years, dates)
file_content = re.sub(r'\b\d+\b', '', file_content)

# 5. Tokenize the text into words
words = word_tokenize(file_content)

# 6. Filter out stopwords (common words)
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]

# 7. Use POS (Part of Speech) tagging to filter out proper nouns (NNP), numbers, etc.
tagged_words = nltk.pos_tag(filtered_words)

# 8. Keep only words that are not proper nouns, numbers, or other irrelevant tags
vocabulary_words = [word for word, pos in tagged_words if pos not in ['NNP', 'CD']]

# 9. Remove duplicates by converting the list to a set and back to a list
vocabulary_words = list(set(vocabulary_words))

# 10. Print the final vocabulary words
print(vocabulary_words)


['national', 'look', 'region', 'connections', 'canoes', 'metres', 'lived', 'rather', 'part', 'indonesian', 'types', 'compared', 'ended', 'historic', 'excavations', 'land', 'undergrowth', 'dispersed', 'discoveries', 'continent', 'found', 'called', 'past', 'twice', 'australian', 'follows', 'support', 'perhaps', 'longer', 'mainland', 'less', 'reading', 'tropical', 'silver', 'hunting', 'significantly', 'material', 'ancient', 'occupationnnjust', 'first', 'grown', 'fragments', 'particular', 'voyages', 'indonesias', 'none', 'using', 'use', 'bisa', 'nearby', 'highly', 'early', 'ago', 'possumlike', 'site', 'seashore', 'final', 'title', 'phase', 'team', 'migration', 'modification', 'forest', 'made', 'southern', 'nearbynnroughly', 'clearing', 'researchers', 'axe', 'glass', 'pottery', 'substance', 'lives', 'project', 'island', 'april', 'construction', 'evidence', 'location', 'ice', 'obi', 'would', 'area', 'suitable', 'finds', 'instructions', 'rothschilds', 'shipton', 'back', 'leave', 'rest', 'many

[nltk_data] Downloading package punkt to
[nltk_data]     /home/danishbaloch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/danishbaloch/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/danishbaloch/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [12]:
vocabulary_words_string = "['" + "', '".join(vocabulary_words) + "']"
print(vocabulary_words_string)



['national', 'look', 'region', 'connections', 'canoes', 'metres', 'lived', 'rather', 'part', 'indonesian', 'types', 'compared', 'ended', 'historic', 'excavations', 'land', 'undergrowth', 'dispersed', 'discoveries', 'continent', 'found', 'called', 'past', 'twice', 'australian', 'follows', 'support', 'perhaps', 'longer', 'mainland', 'less', 'reading', 'tropical', 'silver', 'hunting', 'significantly', 'material', 'ancient', 'occupationnnjust', 'first', 'grown', 'fragments', 'particular', 'voyages', 'indonesias', 'none', 'using', 'use', 'bisa', 'nearby', 'highly', 'early', 'ago', 'possumlike', 'site', 'seashore', 'final', 'title', 'phase', 'team', 'migration', 'modification', 'forest', 'made', 'southern', 'nearbynnroughly', 'clearing', 'researchers', 'axe', 'glass', 'pottery', 'substance', 'lives', 'project', 'island', 'april', 'construction', 'evidence', 'location', 'ice', 'obi', 'would', 'area', 'suitable', 'finds', 'instructions', 'rothschilds', 'shipton', 'back', 'leave', 'rest', 'many

In [13]:
file_path = 'vocab_sys_prompt.txt'
with open(file_path, 'r') as file:
    sys_prompt = file.read()  
    
print(sys_prompt)
print('-' * 50)

file_path = 'vocab_user_prompt.txt'
with open(file_path, 'r') as file:
    user_prompt = file.read()  
    
user_prompt = user_prompt + ' ' + vocabulary_words_string
print(user_prompt)


you are an expert english teacher. you know about the grammar rules and have extensive depth in english literature. you have to analyze the given words of english and teach them to students in a way that the meaning remain in their head. you are a great, professional, interactive and expert english teacher.

--------------------------------------------------
At the end of this prompt, there is an array of English words. You have to analyze each word in depth and find relations. I need you to find words that I can use in my IELTS English speaking and writing tasks to improve my vocabulary. Ignore the words in the array that cannot be used in conversations.

NOTE: YOU HAVE TO FOLLOW THE STRICT GUIDELINES GIVEN TO YOU AND RETURN THE REQUIRED ANSWER IN THE SPECIFIED FORMAT AT THE END OF THIS PROMPT. PROVIDE ME THE DATA IN A .CSV FORMAT SO THAT I CAN SAVE IT ON MY DISK. I ONLY NEED .CSV FROM YOU AND NO OTHER TEXT. I NEED MY PARSER TO DIRECTLY READ YOUR OUTPUT AND CONVERT IT INTO A .CSV

Spe

In [14]:
import os
import base64
from together import Together
from PIL import Image
import io

# Initialize API client
client = Together(base_url="https://api.aimlapi.com/v1", api_key=API_key)

# Load image from your local drive
# THIS CODE IS NOT BEING USED BELOW
image_path = 'reading_test_3_passage_1/stitched_passage.png'  # Replace with your image path
with open(image_path, "rb") as image_file:
    # Convert the image to base64 string
    encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

# print(encoded_image)
# print(type(encoded_image))
# Create the API request
response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
    messages=[
            {
                "role": "system",
                "content": sys_prompt,
            },
        
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text":  user_prompt,
                    },
                    
#                     {
#                         "type": "image_url",
#                         "image_url": {
#                             # stitched passage.
#                             "url" :"https://drive.google.com/uc?id=1iQQM14fOhw6MOPxexPR03_u9GG7BDXTr"
# #                             "url": "https://drive.google.com/uc?id=1XM6uNKcw-kvX0i05D2N-eoHmIonXuTPx",
# #                             "url": "https://drive.google.com/uc?id=1GXOZ7spaWSkcvqMwXU6DGwr2IvBiNbjv",
#                         },
#                     },
                    
                    # IT IS NOT WORKING LIKE THIS, NOT TAKING MULTIPLE IMAGES AT ONCE. (MAYBE I AM MISSING SOMETHING)
#                     {
#                         "type": "image_url",
#                         "image_url": {
#                             # second page of passage.
#                             "url": "https://drive.google.com/uc?id=1GXOZ7spaWSkcvqMwXU6DGwr2IvBiNbjv",
#                         },
#                     },
                                
                ],
            }
        ],
    # IELTS PASSAGES ARE MOSTLY AROUND 2000-3000 WORDS
    max_tokens=3500,
)

In [15]:
# Print the model's response
print("Assistant: ", response.choices[0].message.content)

response_text = response.choices[0].message.content
# Write the response to a .txt file
file_path = 'extracted_vocab.txt'  # Save it as a .txt file
with open(file_path, 'w') as file:
    file.write(response_text)  # Write the JSON response as plain text

print(f"Assistant response saved to {file_path}")

Assistant:  Here is the CSV data for the given array of English words, following the specified format and guidelines:


"group_number","word","meaning","formal_writing_sentence","semi_formal_speaking_sentence"

"1","region","A geographical area, especially one that is defined by natural or administrative boundaries.","The research focuses on the cultural significance of the region in the context of human migration.","I've always wanted to visit that region of the country, it's so beautiful."

"1","continent","One of the seven large landmasses on Earth, including Africa, Antarctica, Asia, Australia, Europe, North America, and South America.","The continent has a diverse range of ecosystems, including tropical rainforests and arid deserts.","I've always wanted to travel to another continent and experience a different culture."

"1","island","A piece of land that is surrounded by water, typically smaller than a continent.","The island was inhabited by indigenous peoples for thousands of y

In [16]:
print(response_text)

Here is the CSV data for the given array of English words, following the specified format and guidelines:


"group_number","word","meaning","formal_writing_sentence","semi_formal_speaking_sentence"

"1","region","A geographical area, especially one that is defined by natural or administrative boundaries.","The research focuses on the cultural significance of the region in the context of human migration.","I've always wanted to visit that region of the country, it's so beautiful."

"1","continent","One of the seven large landmasses on Earth, including Africa, Antarctica, Asia, Australia, Europe, North America, and South America.","The continent has a diverse range of ecosystems, including tropical rainforests and arid deserts.","I've always wanted to travel to another continent and experience a different culture."

"1","island","A piece of land that is surrounded by water, typically smaller than a continent.","The island was inhabited by indigenous peoples for thousands of years before 

In [17]:
# Break the string into lines
lines = response_text.splitlines()

# Filter lines to keep only the CSV content
csv_lines = [line for line in lines if line.startswith('"')]

# Join the filtered lines back into a single string for CSV
cleaned_csv = '\n'.join(csv_lines)

# Print the cleaned CSV content
print(cleaned_csv)



"group_number","word","meaning","formal_writing_sentence","semi_formal_speaking_sentence"
"1","region","A geographical area, especially one that is defined by natural or administrative boundaries.","The research focuses on the cultural significance of the region in the context of human migration.","I've always wanted to visit that region of the country, it's so beautiful."
"1","continent","One of the seven large landmasses on Earth, including Africa, Antarctica, Asia, Australia, Europe, North America, and South America.","The continent has a diverse range of ecosystems, including tropical rainforests and arid deserts.","I've always wanted to travel to another continent and experience a different culture."
"1","island","A piece of land that is surrounded by water, typically smaller than a continent.","The island was inhabited by indigenous peoples for thousands of years before European colonization.","I'm planning a vacation to a tropical island, it's going to be amazing."
"2","historic

In [18]:

# Define the file path where you want to save the CSV
file_path = 'generated_vocab.csv'

# Save the cleaned CSV to a file
with open(file_path, 'w', newline='') as csv_file:
    csv_file.write(cleaned_csv)

print(f"CSV file saved as {file_path}.")

CSV file saved as generated_vocab.csv.
