In [1]:
import ollama
# Smoke test: send one free-form request to the llama3.1 model and print the
# text of its reply, to see what a numbered question list looks like.
response = ollama.chat(
    model='llama3.1',
    messages=[
        dict(
            role='user',
            content='Generate 50 questions. Do not write anything else than the questions',
        ),
    ],
)
print(response['message']['content'])

1. What is your favorite hobby?
2. Have you traveled to another country?
3. Do you prefer hot or cold weather?
4. Can you speak more than one language?
5. Have you ever learned a new skill?
6. Do you have any pets?
7. Are you currently in a relationship?
8. How many siblings do you have?
9. What is your favorite type of music?
10. Have you ever won an award?
11. Can you cook or bake well?
12. Do you prefer reading fiction or nonfiction books?
13. Are you a morning person or night owl?
14. Have you ever gone skydiving or tried another adventure sport?
15. Do you have any favorite sports teams?
16. Can you play a musical instrument?
17. Have you ever been to a music festival?
18. How many hours of sleep do you get per night?
19. Are you allergic to anything?
20. Have you ever volunteered for a cause?


In [31]:
def extract_questions(response):
    """Pull the numbered questions out of a chat response.

    The reply text is read from ``response['message']['content']`` and
    scanned line by line. A line counts as a question when it starts
    (optionally after leading whitespace) with a number followed by ``.`` or
    ``)`` and some text, e.g. ``"12. Where is gate B?"``. The numbering prefix
    is dropped and the rest of the line is kept in full, including any
    periods inside the question itself.

    Blank lines are skipped silently. Every other line (headings, intro and
    closing sentences, a bare ``"3."`` with no text) is reported on stdout as
    ``Question not valid: ...`` and left out of the result.

    Args:
        response: Mapping with the reply text under
            ``response['message']['content']``.

    Returns:
        list[str]: The question texts, in the order they appear.
    """
    import re

    # number, optional spaces, "." or ")", optional spaces, then the question.
    # Only the numbering prefix is consumed, so "U.S." or "e.g." stay intact.
    numbered_line = re.compile(r'^\s*\d+\s*[.)]\s*(\S.*)$')

    questions = []
    for raw_line in response['message']['content'].splitlines():
        # Empty or whitespace-only lines carry no information.
        if not raw_line.strip():
            continue
        match = numbered_line.match(raw_line)
        if match:
            questions.append(match.group(1).strip())
        else:
            print(f"Question not valid: {raw_line}")
    return questions


In [17]:
# Shared preamble placed at the start of every generation prompt: it describes
# the classification project, lists the eight categories with their
# sub-categories, and states that TF-IDF features are expected.
# The trailing backslashes are inside the string literal, so they join the
# physical lines: the resulting text contains no newline characters, only the
# indentation spaces of the continued lines between the bullets.
base_prompt = "I'm preparing a small nlp classification project for my students. The project will be to classify spectators questions about the Paris Olympic games in 2024. \
    Here is a list of categories (the classes to predict) and sub categories:\
        * Logistics & Transport (Site Access, Public Transportation, Parking, Accommodation, Accessibility for Disabled Persons) \
        * Ticketing & Access (Ticket Prices, Purchase Procedure, Pass Types, Access Conditions, Security Control) \
        * Sport Events (Schedule, Venue, Athletes, Results) \
        * Program & Planning (Event Schedule, Specific Timetables, Site Planning, Opening/Closing Ceremonies, Side Events) \
        * Services & Amenities (Wi-Fi/Connectivity, Lockers/Storage, Medical Services, Information Points, Lost & Found) \
        * Security & Emergencies (Security Measures, Emergency Procedures, First Aid Points, Allowed/Prohibited Items, Evacuation) \
        * Rules & Sport (Discipline Rules, Competition Format, Qualification System, Records/Statistics, Teams/Athletes) \
        * Food & Drink (Food & Beverage Points, Bars, Restaurants and Snack Bars) \
        I expect them to use TF-IDF to generate the features, so I need the questions of a same  category to have similar words stemms or concepts easy to discover by a tf-IDF" 

In [33]:
# Vocabulary hints (common words, phrases, indicators) for the
# "Logistics & Transport" category. Triple-quoted so every bullet sits on its
# own line in the text sent to the model; with in-string backslash
# continuations the line breaks were dropped and the bullets ran together.
Logistics_Transport_prompt = """Here is a list of things that should be in the category Logistics & Transport:
* Common words:
+ Transport
+ Parking
+ Access
+ Route
+ Entrance
* Phrases:
+ 'How do I...'
+ 'Is there a designated area for...?'
+ 'Can I bring my mobility aid...'
* Indicators:
+ Dates and times
+ Accommodations (hotels, hostels)
+ Transportation modes (buses, trains, taxis)"""
 
 
# Vocabulary hints (common words, phrases, indicators) for the
# "Ticketing & Access" category. Triple-quoted so every bullet sits on its
# own line in the text sent to the model; with in-string backslash
# continuations the bullets were glued together ("+ Ticket+ Pass+ Price...").
Ticketing_Access_prompt = """Here is a list of things that should be in the category Ticketing & Access:
* Common words:
+ Ticket
+ Pass
+ Price
+ Book
+ Purchase
+ Entry
+ Gate
+ Valid
+ Seat
+ ID
* Phrases:
+ 'How much does...'
+ 'Where can I buy...?'
+ 'Do I need to show...?'
+ 'Is my ticket valid for...?'
* Indicators:
+ Currency symbols (€, $)
+ Ticket types (single, multi-day, season)
+ Documentation references (passport, ID card)
+ Age categories (adult, child, senior)"""

# Vocabulary hints (common words, phrases, indicators) for the
# "Sport Events" category. Triple-quoted so every bullet sits on its own line
# in the text sent to the model; with in-string backslash continuations the
# bullets were glued together ("+ Match+ Competition+ Race...").
Sport_Events_prompt = """Here is a list of things that should be in the category Sport Events:
* Common words:
+ Match
+ Competition
+ Race
+ Final
+ Medal
+ Athlete
+ Score
+ Team
+ Event
+ Round
* Phrases:
+ 'When does the...start?'
+ 'Which athletes are competing...?'
+ 'Where can I see the results...?'
+ 'What time is the final...?'
* Indicators:
+ Sport disciplines (swimming, athletics, judo)
+ Competition phases (heats, semi-finals, finals)
+ Time specifications (morning session, afternoon session)
+ Performance metrics (points, times, distances)"""

# Vocabulary hints (common words, phrases, indicators) for the
# "Program & Planning" category. Triple-quoted so every bullet sits on its
# own line in the text sent to the model; with in-string backslash
# continuations the bullets were glued together ("+ Schedule+ Program...").
Program_Planning_prompt = """Here is a list of things that should be in the category Program & Planning:
* Common words:
+ Schedule
+ Program
+ Ceremony
+ Event
+ Timetable
+ Opening
+ Closing
+ Session
+ Duration
+ Festival
* Phrases:
+ 'What time does...begin?'
+ 'When is the opening ceremony...?'
+ 'How long does...last?'
+ 'Is there anything else happening...?'
* Indicators:
+ Time formats (9:00, 14:30)
+ Date references (July 26th, August 11th)
+ Event types (ceremony, cultural event, exhibition)
+ Schedule periods (morning, afternoon, evening)
+ Duration units (hours, minutes)"""

# Vocabulary hints (common words, phrases, indicators) for the
# "Services & Amenities" category. Triple-quoted so every bullet sits on its
# own line in the text sent to the model; with in-string backslash
# continuations the bullets were glued together ("+ Wifi+ Internet...").
Services_Amenities_prompt = """Here is a list of things that should be in the category Services & Amenities:
* Common words:
+ Wifi
+ Internet
+ Locker
+ Storage
+ Medical
+ Information
+ Lost
+ Found
+ Service
+ Help
* Phrases:
+ 'Where can I find...?'
+ 'Is there wifi available...?'
+ 'What should I do if I lose...?'
+ 'How do I access...?'
* Indicators:
+ Service locations (information desk, medical center)
+ Amenity types (wifi network, storage facility)
+ Equipment references (locker key, first aid)
+ Lost items (belongings, documents, devices)
+ Service hours (24/7, opening hours)"""

# Vocabulary hints (common words, phrases, indicators) for the
# "Security & Emergencies" category. Triple-quoted so every bullet sits on
# its own line in the text sent to the model; with in-string backslash
# continuations the bullets were glued together ("+ Security+ Emergency...").
Security_Emergencies_prompt = """Here is a list of things that should be in the category Security & Emergencies:
* Common words:
+ Security
+ Emergency
+ Safety
+ Police
+ Guard
+ Exit
+ Evacuation
+ Prohibited
+ Allowed
+ Check
* Phrases:
+ 'What items are allowed...?'
+ 'Where is the emergency exit...?'
+ 'Who do I contact in case of...?'
+ 'Is it permitted to bring...?'
* Indicators:
+ Emergency contacts (112, emergency numbers)
+ Security equipment (metal detectors, cameras)
+ Prohibited items (weapons, glass bottles)
+ Safety locations (first aid points, security checkpoints)
+ Emergency procedures (evacuation routes, assembly points)"""

# Vocabulary hints (common words, phrases, indicators) for the
# "Rules & Sport" category. Triple-quoted so every bullet sits on its own
# line in the text sent to the model; with in-string backslash continuations
# the bullets were glued together ("+ Rules+ Record+ Score...").
Rules_Sport_prompt = """Here is a list of things that should be in the category Rules & Sport:
* Common words:
+ Rules
+ Record
+ Score
+ Points
+ Qualify
+ Championship
+ Ranking
+ Tournament
+ Medal
+ Format
* Phrases:
+ 'How does scoring work...?'
+ 'What are the rules for...?'
+ 'Who holds the record for...?'
+ 'How do athletes qualify...?'
* Indicators:
+ Competition formats (knockout, round-robin)
+ Scoring systems (points, time, distance)
+ Performance statistics (world records, personal bests)
+ Qualification criteria (minimum standards, rankings)
+ Technical terms (heats, rounds, playoffs)"""

# Vocabulary hints (common words, phrases, indicators) for the
# "Food & Drink" category. Triple-quoted so every bullet sits on its own line
# in the text sent to the model; with in-string backslash continuations the
# bullets were glued together ("+ Food+ Drink+ Restaurant...").
Food_Drink_prompt = """Here is a list of things that should be in the category Food & Drink:
* Common words:
+ Food
+ Drink
+ Restaurant
+ Bar
+ Snack
+ Menu
+ Water
+ Beverage
+ Meal
+ Cafe
* Phrases:
+ 'Where can I find something to eat...?'
+ 'Are there food options for...?'
+ 'Can I bring my own...?'
+ 'What types of food are...?'
* Indicators:
+ Food service locations (food court, concession stands)
+ Dietary specifications (vegetarian, halal, gluten-free)
+ Meal times (lunch, dinner)
+ Payment methods (cash, card)
+ Food types (hot meals, sandwiches, snacks)"""

In [43]:
# Closing instruction appended to every per-category prompt: asks for 50
# questions per sub-category and nothing but the questions.
# NOTE(review): the variable name is misspelled ("promt"); it is kept as is
# because the generation loop refers to it by this name.
questions_promt = "generate 50 questions for each of the sub categories. Just provide the questions, nothing else."

In [44]:
from tqdm import tqdm 

# Class names and their vocabulary-hint prompts, listed in the same order so
# that zip() pairs each category with its own prompt.
categories = ['Logistics & Transport', 'Ticketing & Access', 'Sport Events', 'Program & Planning', 'Services & Amenities', 'Security & Emergencies', 'Rules & Sport', 'Food & Drink']
categories_prompts = [Logistics_Transport_prompt, Ticketing_Access_prompt, Sport_Events_prompt, Program_Planning_prompt, Services_Amenities_prompt, Security_Emergencies_prompt, Rules_Sport_prompt, Food_Drink_prompt]

# category name -> list of question strings extracted from the model's reply
questions = {}
# zip() has no length, so tqdm needs an explicit total to draw a real
# progress bar with an ETA instead of a bare iteration counter.
for cat, prompt in tqdm(zip(categories, categories_prompts), total=len(categories)):
    # Separate the three parts with blank lines; plain concatenation fused the
    # last word of one part with the first word of the next
    # ("...tf-IDFHere is a list...", "...taxis)generate 50 questions...").
    final_prompt = "\n\n".join([base_prompt, prompt, questions_promt])
    response = ollama.chat(model='llama3.1', messages=[
      {
        'role': 'user',
        'content': final_prompt,
      },
    ])
    questions[cat] = extract_questions(response)



1it [02:43, 163.12s/it]

Question not valid: Here are 50 questions for each sub category under Logistics & Transport:
Question not valid: **Site Access**
Question not valid: **Public Transportation**
Question not valid: **Parking**
Question not valid: Note: The above list includes 150 questions related to parking, accessibility, and accommodations for spectators with disabilities at the Olympic site. It does not include any specific information or answers to these questions.


2it [03:54, 109.41s/it]

Question not valid: Here are 50 questions for each sub category:
Question not valid: **Ticketing & Access**
Question not valid: **Site Access**
Question not valid: **Public Transportation**
Question not valid: **Parking**
Question not valid: **Accommodation**
Question not valid: **Accessibility for Disabled Persons**


3it [04:38, 79.20s/it] 

Question not valid: Here are the questions for the Sport Events category:
Question not valid: **Common words:**
Question not valid: **Phrases:**
Question not valid: **Indicators:**
Question not valid: And here are 50 additional questions:
Question not valid: Let me know if this helps!


4it [07:09, 107.75s/it]

Question not valid: Here are 50 questions for each of the specified subcategories under "Program & Planning":
Question not valid: **Schedule**
Question not valid: **Program**
Question not valid: **Ceremony**
Question not valid: **Side Events**


5it [08:55, 106.95s/it]

Question not valid: Here are 50 questions for each subcategory under Services & Amenities:
Question not valid: **Common words:**
Question not valid: **Phrases:**
Question not valid: **Indicators:**
Question not valid: Please note that these phrases are examples only and may not accurately reflect real-world services or situations.


6it [25:50, 415.83s/it]

Question not valid: Here are 50 questions for each of the subcategories under Security & Emergencies:
Question not valid: **Common words**
Question not valid: **Phrases**
Question not valid: **Indicators**


7it [1:01:08, 972.39s/it]

Question not valid: Here are 50 questions for each subcategory under Rules & Sport:
Question not valid: **Rules**
Question not valid: **Record**
Question not valid: **Score**


8it [1:02:36, 469.61s/it]

Question not valid: Here are 50 questions for each of the subcategories under "Food & Drink":
Question not valid: **Common words:**
Question not valid: **Phrases:**
Question not valid: **Indicators:**





In [45]:
# questions is a dict with categories as keys and lists of questions as values
# save the questions in a csv file
# id, category, question
# no pandas
import csv
# Stable integer id per category: its position in `categories`.
cat_ids = {cat: cat_id for cat_id, cat in enumerate(categories)}
# Running row id, unique across all categories.
idx = 0
# newline='' lets the csv module control line endings itself (otherwise
# platforms that translate '\n' end up with blank rows between records), and
# an explicit utf-8 encoding keeps non-ASCII characters in the generated
# questions from depending on the platform's default encoding.
with open('questions.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['id', 'category_id', 'category', 'question'])
    for cat, qs in questions.items():
        for q in qs:
            writer.writerow([idx, cat_ids[cat], cat, q])
            idx += 1

In [46]:
# Display what ollama.list() reports; the captured output below shows a single
# entry, 'llama3.1:latest', which is the model used by the chat calls above.
ollama.list()


{'models': [{'name': 'llama3.1:latest',
   'model': 'llama3.1:latest',
   'modified_at': '2024-08-25T17:18:17.947045701+02:00',
   'size': 4661230977,
   'digest': '91ab477bec9d27086a119e33c471ae7afbd786cc4fbd8f38d8af0a0b949d53aa',
   'details': {'parent_model': '',
    'format': 'gguf',
    'family': 'llama',
    'families': ['llama'],
    'parameter_size': '8.0B',
    'quantization_level': 'Q4_0'}}]}