# **Gemini Trial**

In [None]:
#Imports
import pathlib
import re
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Wrap ``text`` in an IPython ``Markdown`` object rendered as a block quote.

  Bullet characters ('•') are rewritten to Markdown list markers ('  *') and
  every line, including blank ones, is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  # The always-true predicate makes textwrap.indent prefix empty lines as well.
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

#Import to store API on Colab
from google.colab import userdata

### Setting up API Key

<a class="button button-primary" href="https://makersuite.google.com/app/apikey" target="_blank" rel="noopener noreferrer">Link</a> to generate API Key.



In [None]:
#Put API Key in environment variable under 'secrets'
# NOTE(review): the secret is looked up as 'GOOGL_API_KEY' (no "E" after "GOOGL"), which differs
# from the variable name below — confirm it matches the name stored in the Colab secrets panel.
GOOGLE_API_KEY = userdata.get('GOOGL_API_KEY')
# Hand the key to the google.generativeai client used by the rest of the notebook.
genai.configure(api_key = GOOGLE_API_KEY)

# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.

### List Available Models

In [None]:
# Print the name of every model that lists 'generateContent' among its supported methods.
generation_capable = (
    candidate
    for candidate in genai.list_models()
    if 'generateContent' in candidate.supported_generation_methods
)
for candidate in generation_capable:
    print(candidate.name)

#gemini-pro -> Optimized for text-only prompts

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


### Sample Trials

In [None]:
# Gemini client bound to the 'gemini-pro' model; the cells below call `model.generate_content(...)` on it.
model = genai.GenerativeModel('gemini-pro')

In [None]:
#Documentation: https://ai.google.dev/api/python/google/generativeai/GenerativeModel?authuser=1#generate_content

%%time
response = model.generate_content("Hi, how are you")
to_markdown(response.text) #Prints output.

CPU times: user 20.9 ms, sys: 1.98 ms, total: 22.8 ms
Wall time: 1.29 s


> I'm doing well, thank you. How are you doing today?

In [None]:
#In case of API fail
# Left as the cell's final expression so the notebook displays it.
# NOTE(review): presumably reports whether/why the prompt was blocked — confirm against the API docs.
response.prompt_feedback



In [None]:
#View multiple candidate responses.
# Each candidate carries its content parts, finish reason, index and per-category safety ratings
# (see the captured output below); displayed because it is the cell's final expression.
response.candidates

[content {
  parts {
    text: "I am well, thank you for asking. How are you today?"
  }
  role: "model"
}
finish_reason: STOP
index: 0
safety_ratings {
  category: HARM_CATEGORY_SEXUALLY_EXPLICIT
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_HATE_SPEECH
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_HARASSMENT
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_DANGEROUS_CONTENT
  probability: NEGLIGIBLE
}
]

In [None]:
# One request covering five engineering keywords at once; the reply is rendered as a block quote.
response = model.generate_content("Consider the following five words related to engineering: lasers, nuclear, optimization, civil, algorithms. Now, for each of the five words, come up with 10 questions. These questions should be based on the word, and related to engineering. They must be different from one another, and must be like the ones someone will ask an expert on Reddit as a doubt. ")
to_markdown(response.text)
# Took ~14 secs when originally run.
# NOTE(review): the original note said "25 questions", but the prompt asks for 10 questions
# for each of 5 words (50 in total) — confirm which figure was meant.

> **Lasers**
> 
> 1. How are lasers used in optical communication systems?
> 2. What are the different types of lasers and their applications in engineering?
> 3. How are lasers used in materials processing, such as cutting and welding?
> 4. What are the safety considerations when working with high-power lasers?
> 5. How can lasers be used for remote sensing and imaging in engineering?
> 6. What are the emerging applications of lasers in biotechnology and medicine?
> 7. How do lasers play a role in optical storage and information processing?
> 8. What are the limitations of using lasers in engineering applications?
> 9. How are lasers integrated into autonomous systems and robotics?
> 10. What are the current research directions in laser technology?
> 
> **Nuclear**
> 
> 1. How is nuclear energy used to generate electricity and what are its advantages and disadvantages?
> 2. What are the different types of nuclear reactors and how do they differ in design and operation?
> 3. How is nuclear waste managed and what are the challenges associated with its disposal?
> 4. What are the safety considerations and regulations for operating nuclear power plants?
> 5. How can nuclear technology be used for medical applications, such as cancer treatment?
> 6. What are the ethical and environmental concerns related to nuclear energy and waste?
> 7. What is the role of nuclear physics in studying fundamental particles and their interactions?
> 8. How are nuclear techniques used in materials analysis and characterization?
> 9. What are the potential applications of fusion technology in future energy production?
> 10. What are the advancements and challenges in nuclear disarmament and non-proliferation?
> 
> **Optimization**
> 
> 1. What are the different methods for solving optimization problems and which one is appropriate for a given engineering problem?
> 2. How can optimization be used to improve the performance and efficiency of engineering systems?
> 3. What are the challenges and limitations associated with optimization in complex engineering applications?
> 4. How is optimization used in design optimization, such as aircraft design or structural analysis?
> 5. What are the applications of optimization in data analysis and machine learning?
> 6. How can optimization be used to make engineering systems more sustainable and environmentally friendly?
> 7. What are the emerging trends and research areas in optimization theory and algorithms?
> 8. How is optimization used in real-time decision-making and control systems?
> 9. What are the software tools and resources available for optimization in engineering practice?
> 10. How can optimization be used to address complex societal problems, such as transportation planning or energy distribution?
> 
> **Civil**
> 
> 1. What are the different types of civil engineering structures and their design considerations?
> 2. How are civil engineering principles applied to the planning and construction of cities and infrastructure?
> 3. What are the latest advancements in materials and technologies used in civil engineering?
> 4. How does civil engineering contribute to the sustainability and resilience of urban environments?
> 5. What are the challenges and opportunities in civil engineering related to climate change and natural disasters?
> 6. How can civil engineering be used to improve the safety and accessibility of transportation systems?
> 7. What is the role of civil engineering in developing smart and connected cities?
> 8. How can civil engineering principles be applied to address water and waste management issues?
> 9. What are the ethical and social responsibilities of civil engineers in shaping the built environment?
> 10. What are the emerging trends and research areas in civil engineering practice?
> 
> **Algorithms**
> 
> 1. What are the different types of algorithms and how are they classified based on their efficiency and complexity?
> 2. How are algorithms used to solve engineering problems, such as simulation or optimization?
> 3. What are the challenges in designing efficient and scalable algorithms for large-scale engineering systems?
> 4. How can algorithms be optimized for performance on specific hardware platforms, such as GPUs or cloud computing?
> 5. What are the applications of algorithms in data analysis, machine learning, and artificial intelligence?
> 6. How can algorithms be used to improve the security and privacy of engineering systems?
> 7. What are the ethical considerations in using algorithms for decision-making in engineering applications?
> 8. What are the latest research advancements in algorithm design and analysis?
> 9. How can algorithms be used to design and control autonomous systems?
> 10. What are the future directions and challenges in the field of algorithm engineering?

### Computing similarity for 10*10 sentences.

In [None]:
#Checking similarity scores for 10 questions.
# Single-keyword request ('lasers'); the answers are split into sentences and compared
# pairwise in the following cells. The prompt typo "realted" has been corrected to "related".
response = model.generate_content("Come up with 10 engineering-based questions related to the word 'lasers'. These questions must be different from one another, and must be like the ones someone will ask an expert on Reddit as a doubt. ")
to_markdown(response.text)

> 1. **Optimal Laser Wavelength for Tissue Ablation:** What is the ideal wavelength range of lasers for precise and efficient tissue ablation in surgical applications?
> 2. **Laser Cavity Design for High Power Output:** How can laser cavity designs be optimized to maximize the output power and stability of high-power lasers?
> 3. **Beam Shaping for Laser Welding:** What techniques can be employed to shape the laser beam for precise and high-quality laser welding in intricate geometries?
> 4. **Pulse Shaping for Laser Materials Processing:** How does pulse shaping affect the material interaction and processing outcomes in laser-based materials processing applications?
> 5. **Laser-Induced Damage Threshold on Optical Surfaces:** What are the factors that influence the laser-induced damage threshold on optical surfaces, and how can it be increased?
> 6. **Laser-Material Interaction for Non-Destructive Testing:** Explain the physics behind the interaction of lasers with materials for non-destructive testing applications, such as ultrasonic imaging and stress analysis.
> 7. **Laser Propagation in Nonlinear Media:** How does the propagation of laser beams differ in nonlinear media compared to linear media? What are the implications for laser applications?
> 8. **Frequency Doubling in Laser Systems:** Describe the principles and techniques for frequency doubling in laser systems. What are the advantages and challenges of implementing them?
> 9. **Quantum Cascade Lasers for Mid-Infrared Applications:** How do quantum cascade lasers differ from conventional lasers, and what are their unique applications in the mid-infrared range?
> 10. **Holographic Laser Metrology:** Explain the principles of holographic laser metrology and discuss its capabilities for high-precision measurements of surface deformation and topography.

In [None]:
# Split the numbered reply into individual questions.
# The numbering is only treated as a separator at the start of a line, so a digit-dot inside a
# sentence (e.g. "3.5 mm" or "in 2020. ") no longer cuts a question in half.
numbered_item_start = re.compile(r'^\s*\d+\.\s+', flags=re.MULTILINE)
sentences = [part.strip() for part in numbered_item_start.split(response.text)]
sentences = [sentence for sentence in sentences if sentence]  # Remove empty strings

print(sentences)

['**Optimal Laser Wavelength for Tissue Ablation:** What is the ideal wavelength range of lasers for precise and efficient tissue ablation in surgical applications?\n', '**Laser Cavity Design for High Power Output:** How can laser cavity designs be optimized to maximize the output power and stability of high-power lasers?\n', '**Beam Shaping for Laser Welding:** What techniques can be employed to shape the laser beam for precise and high-quality laser welding in intricate geometries?\n', '**Pulse Shaping for Laser Materials Processing:** How does pulse shaping affect the material interaction and processing outcomes in laser-based materials processing applications?\n', '**Laser-Induced Damage Threshold on Optical Surfaces:** What are the factors that influence the laser-induced damage threshold on optical surfaces, and how can it be increased?\n', '**Laser-Material Interaction for Non-Destructive Testing:** Explain the physics behind the interaction of lasers with materials for non-dest

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity

In [None]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
def get_embedding(sentence):
    # Encode and compute embeddings
    encoded_input = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state.mean(dim=1)  # Mean pooling

In [None]:
"""
def compute_cosine_similarities(sentences):
    # Get embeddings for all sentences
    embeddings = [get_embedding(sentence) for sentence in sentences]

    # Initialize an empty matrix to store cosine similarities
    n = len(sentences)
    similarity_matrix = torch.zeros((n, n))

    # Compute cosine similarity for each pair of sentences
    for i in range(n):
        for j in range(n):
            similarity_matrix[i][j] = cosine_similarity(embeddings[i], embeddings[j]).item()

    return similarity_matrix
"""

In [None]:
# Embed every sentence once up front.
embeddings = [get_embedding(sentence) for sentence in sentences]

# n x n grid of zeros; only the cells above the diagonal are filled in below.
n = len(sentences)
similarity_matrix = [[0]*n for _ in range(n)]
similarity_scores = []

# Score each unordered pair (row index < column index) exactly once.
for row_idx, row_embedding in enumerate(embeddings):
    for col_idx in range(row_idx + 1, n):
        score = cosine_similarity(row_embedding, embeddings[col_idx]).item()
        similarity_matrix[row_idx][col_idx] = score
        similarity_scores.append(((row_idx, col_idx), score))

# Show the upper-triangular matrix, one row per line.
for matrix_row in similarity_matrix:
    print(matrix_row)

[0, 0.5077662467956543, 0.6323431730270386, 0.5586272478103638, 0.5190630555152893, 0.6324816346168518, 0.6976943016052246, 0.6163954734802246, 0.5718023180961609, 0.5317742228507996]
[0, 0, 0.7983375191688538, 0.5786458849906921, 0.6622776985168457, 0.5674260258674622, 0.5134333968162537, 0.5471594929695129, 0.5924349427223206, 0.5351109504699707]
[0, 0, 0, 0.6038283109664917, 0.5600835084915161, 0.6150647401809692, 0.6030434370040894, 0.5629034042358398, 0.6258838772773743, 0.6601328253746033]
[0, 0, 0, 0, 0.5239795446395874, 0.516996443271637, 0.5730702877044678, 0.5602624416351318, 0.6718893051147461, 0.6149359941482544]
[0, 0, 0, 0, 0, 0.597025454044342, 0.5511294007301331, 0.5849542617797852, 0.5233476161956787, 0.5653805732727051]
[0, 0, 0, 0, 0, 0, 0.5776981115341187, 0.5677936673164368, 0.5725528597831726, 0.5153347253799438]
[0, 0, 0, 0, 0, 0, 0, 0.5627390742301941, 0.5932350754737854, 0.534429669380188]
[0, 0, 0, 0, 0, 0, 0, 0, 0.5845053195953369, 0.48325273394584656]
[0, 0,

In [None]:
# Find the top two pairs with the highest similarity scores
from heapq import nlargest
top_two = nlargest(2, similarity_scores, key=lambda x: x[1])

# Print the top two pairs along with their similarity scores.
# This cell only reports; choosing which sentence of each pair to drop happens in the next cell.
# The earlier copy of that logic here used `random` before it was imported and called `.add()`
# on a list, so it raised before anything was printed.
for (idx1, idx2), score in top_two:
    print(f"Sentences: '{sentences[idx1]}' and '{sentences[idx2]}'")
    print(f"Similarity Score: {score}")

Sentences: '**Can lasers be used for precision material processing like cutting and welding, and what techniques are employed to enhance accuracy?**
' and '**How do lasers interact with different materials, and can their properties be modified to optimize cutting, engraving, or welding efficiency?**
'
Similarity Score: 0.7983375191688538
Sentences: '**How do different laser types differ in wavelength and power output, and what are their respective applications?**
' and '**How do laser diodes differ from traditional lasers in terms of design, efficiency, and applications, and what are their advantages and disadvantages?**
'
Similarity Score: 0.6976943016052246


In [None]:
# Remove one sentence arbitrarily from each of the top two pairs
import random
removed_sentences = set()
for pair in top_two:
    idx1, idx2 = pair[0]
    selected_idx = random.choice([idx1, idx2])  # Randomly select one index to remove
    removed_sentences.add(selected_idx)

# Update the sentences list by removing the selected sentences
sentences = [sentence for idx, sentence in enumerate(sentences) if idx not in removed_sentences]

In [None]:
# Print the updated list of sentences
print("Updated list of sentences:")
for sentence in sentences:
    print(sentence)

# Print the sentences removed
print("\nSentences removed:")
for idx in removed_sentences:
    print(sentences[idx])

Updated list of sentences:
**Can lasers be used for precision material processing like cutting and welding, and what techniques are employed to enhance accuracy?**

**What are the latest advancements in laser technology and their potential impact on fields like medicine, manufacturing, and communications?**

**How can lasers be used to measure distance and velocity accurately, and what limitations or uncertainties are associated with these applications?**

**What are the fundamental principles behind laser beam shaping, and how do different techniques influence the shape and intensity distribution of the laser output?**

**How do laser diodes differ from traditional lasers in terms of design, efficiency, and applications, and what are their advantages and disadvantages?**

**Can lasers be used to generate coherent light for optical fiber communication, and what factors affect the performance and reliability of such systems?**

**What are the challenges and complexities in designing and

# Pipeline

In [None]:
# Mount Google Drive at /content/drive; the workbook loaded in the next cell lives under that path.
from google.colab import drive
from google.colab import files
drive.mount('/content/drive')
import pandas as pd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Input workbook for this run; the commented-out line is the alternative Batch1 file.
#path = "/content/drive/MyDrive/Colab DataSets/NLP Project/Batch1.xlsx"
path = "/content/drive/MyDrive/Colab DataSets/NLP Project/Batch5.xlsx"
# Later cells read the Topic, Field, Description, Keywords and "Examples for Few-Shot" columns.
df = pd.read_excel(path)
df.head()

Unnamed: 0,Topic,Field,Description,Constraints,Keywords,Examples for Few-Shot
0,askscience_train,Science,This is a subreddit where people ask about dif...,\n\n1. You're on Reddit. Ask about the implica...,1. Experiment\n2. Quantum\n3. Biology\n4. Chem...,1. We are human genetics and genomics research...


In [None]:
# Select the column by name rather than `[-2]`: integer keys passed to Series.__getitem__ on a
# label-indexed row are deprecated as positional lookups in recent pandas, and the name also
# keeps working if columns are added or reordered.
df.iloc[0]["Keywords"] #Keywords

'1. Experiment\n2. Quantum\n3. Biology\n4. Chemistry\n5. Physics\n6. Astronomy\n7. Genetics\n8. Neurology\n9. Meteorology\n10. Paleontology\n11. Ecology\n12. Botany\n13. Zoology\n14. Thermodynamics\n15. Robotics\n16. Nanotechnology\n17. Geology\n18. Hydrology\n19. Sociology\n20. Anthropology\n21. Archaeology\n22. Psychology\n23. Psychiatry\n24. Pharmacology\n25. Pathology\n26. Immunology\n27. Virology\n28. Oncology\n29. Endocrinology\n30. Cardiology\n31. Dermatology\n32. Gastroenterology\n33. Nephrology\n34. Ophthalmology\n35. Orthopedics\n36. Pediatrics\n37. Radiology\n38. Surgery\n39. Anesthesiology\n40. Epidemiology\n41. Biotechnology\n42. Astrophysics\n43. Cosmology\n44. Particle\n45. Fusion\n46. Algorithm\n47. Data\n48. Simulation\n49. Hypothesis\n50. Theory\n51. Microscope\n52. Telescope\n53. Satellite\n54. Laboratory\n55. Research\n56. Innovation\n57. Discovery\n58. Patent\n59. Publication\n60. Journal\n61. Peer-review\n62. Experimentation\n63. Observation\n64. Analysis\n65. Mea

In [None]:
# Empty accumulator with one row per generated question; filled by the generation loop further down.
dataset = pd.DataFrame(columns=["Subreddit", "Keyword", "Question"]) #Dataframe to store the generated datasets

In [None]:
def generatePrompt(rowNum, field, keyWord):
    """Build the question-generation prompt for one keyword of one subreddit row.

    Args:
        rowNum: Positional (``iloc``) index of the row in the module-level ``df``.
        field: Field name inserted into the prompt.
        keyWord: Keyword the generated questions should revolve around.

    Returns:
        The full prompt string, combining the row's "Topic", "Description" and
        "Examples for Few-Shot" values with ``field`` and ``keyWord``.

    Raises:
        TypeError: If any of the inserted values is not a string (e.g. a missing cell).

    NOTE(review): the "Constraints" column of ``df`` is not used here — confirm that is intended.
    """
    row = df.iloc[rowNum]  # look the row up once instead of once per column
    parts = [
        "You are a creative thinking and writing expert. Assume you're a user on Reddit and want to ask a question on the ",
        row["Topic"],
        " subreddit. ",
        row["Description"],
        " Here are some examples of questions that people ask on this subreddit: ",
        row["Examples for Few-Shot"],
        " Come up with a set of 15 diverse questions related to ",
        keyWord,
        " in the field of ",
        field,
        " you would ask on such a subreddit. Make sure that all the questions are diverse and different from one another."
        " Remember that you're a Reddit user. So, while the questions are technical, try to maintain an informal tone."
        # "Quesions" in the original prompt text was a typo.
        " Questions can be 1-3 sentences long. Give your final output as a list of numbered questions.",
    ]
    # str.join (like the original `+` chain) refuses non-string parts instead of silently
    # embedding something like "nan" in the prompt.
    prompt = "".join(parts)
    return prompt

In [None]:
def getKeyWords(keyWords):
  """Extract the keywords from a numbered list such as "1. Experiment\\n2. Quantum".

  Each keyword is captured in full up to the next numbered item, line break or end of
  the string, so hyphenated or multi-word entries ("Peer-review", "Machine learning")
  are preserved instead of being cut at the first non-word character.

  Args:
    keyWords: String holding the numbered keyword list.

  Returns:
    List of keyword strings in order of appearance, without numbering or surrounding
    whitespace. Empty list when no numbered item is found.
  """
  words = re.findall(r'\b\d+\.\s*(\S.*?)\s*(?=\s\d+\.\s|\n|$)', keyWords)
  return words

In [None]:
def parse_numbered_list(input_string):
    """Split a numbered list ("1. ... 2. ...") into its items.

    Text that precedes the first numbered item (for example a heading the model adds
    before the list) is dropped, so it is not mistaken for a question. If the input
    contains no numbered item at all, the stripped text is returned as a single item.

    Args:
        input_string: Raw text containing the numbered list.

    Returns:
        List of item strings with numbering and surrounding whitespace removed.
    """
    numbering = re.compile(r'^\d+\.\s+')
    # Split the string at positions where a number followed by a dot and space occurs
    chunks = [chunk.strip() for chunk in re.split(r'\s(?=\d+\.\s)', input_string)]
    chunks = [chunk for chunk in chunks if chunk]
    # Keep only the numbered chunks when there are any; otherwise fall back to everything.
    if any(numbering.match(chunk) for chunk in chunks):
        chunks = [chunk for chunk in chunks if numbering.match(chunk)]
    # Remove the numbering from each item and strip whitespace
    items = [numbering.sub('', chunk).strip() for chunk in chunks]
    return items

In [None]:
# Smoke test: build a prompt from row 3 of df with an ad-hoc field/keyword and render the reply.
prompt = generatePrompt(3, "Baking", "Yeast")
response = model.generate_content(prompt)
to_markdown(response.text) #Prints output.

> **Yeast in the Field of Baking on AskScienceFiction_Train:**
> 
> 1.  What cosmic force empowers yeast to make bread dough rise majestically? Is it the ethereal essence of fermentation or an intergalactic enchantment?
> 
> 2. If a baker were to cast a spell that made their yeast infinitely potent, what cataclysmic pastries would emerge from their oven, defying the laws of physics?
> 
> 3. Legends whisper that a rare strain of yeast exists on a distant planet, capable of transforming dough into a sentient sourdough imbued with the wisdom of ancient bakers. Is this a mere myth or a tantalizing reality?
> 
> 4. Can yeast communicate with its fellow microbes in the dough, orchestrating a symphony of flavors and textures that enchant the palate?
> 
> 5. If yeast were granted the gift of time travel, would it alter the history of baking, bringing forth pastries from bygone eras to delight modern palates?
> 
> 6. Is there a yeast strain that dwells in the deepest crevices of the galaxy, where gravitational forces intensify its fermentation prowess, yielding bread of unimaginable lightness and porosity?
> 
> 7. Could a warlock baker create a yeast that can brew magical potions capable of granting bread the power to heal wounds or bestow good fortune on those who consume it?
> 
> 8. If a portal were opened to a parallel universe where yeast evolved into sentient beings, what wisdom and insights might they share with our earthly bakers?
> 
> 9. What cosmic energies interact with yeast during fermentation, giving rise to the intoxicating aromas and flavors that captivate our senses?
> 
> 10. Is there a yeast strain lurking in the depths of space that, when combined with lunar soil, creates bread that grants interstellar powers to those who taste it?
> 
> 11. If yeast were endowed with the ability to manipulate the laws of thermodynamics, what innovative baking techniques and marvels would it unleash upon our world?
> 
> 12. Could a baker harness the power crystals from a dying supernova to create a yeast strain that bakes bread that emits a soothing glow, illuminating kitchens and breadbaskets alike?
> 
> 13. What ethereal melodies would yeast compose if it possessed the ability to sing? And would these cosmic harmonies enhance the taste of the bread it ferments?
> 
> 14. If a baker were to summon a yeast spirit from a mystical realm, what secrets of dough alchemy would it reveal, unlocking the true potential of our earthly loaves?
> 
> 15. Are there yeast strains that reside in the mycelial networks of ancient forests, imbued with the wisdom of centuries-old trees, which impart a profound and earthy flavor to bread?

In [None]:
# Loop through each row in df
for index, row in df.iterrows():
    # Get keywords from the "Keywords" column of df
    keywords_list = getKeyWords(row["Keywords"])

    # Loop through each keyword in the list
    for word in keywords_list:


        # Generate a prompt using the index, "Field", and keyword
        prompt = generatePrompt(index, row["Field"], word)
        # print(word, " started")
        # Generate content based on the prompt
        try:
          response = model.generate_content(prompt)

          # Parse the response text to get questions
          questions = parse_numbered_list(response.text)

          # Add each question along with the corresponding topic and keyword to the 'dataset' dataframe
          for element in questions:
              new_row = pd.DataFrame({"Subreddit": [row["Topic"]], "Keyword": [word], "Question": [element]})
              dataset = pd.concat([dataset, new_row], ignore_index=True)


        except Exception as e:
            # Model might choose not to give a response.
            print(f"Failed to generate or parse content for keyword '{word}': {e}")

    print(f"________________________Completed processing for row {index + 1}_________________")

Experiment  started
Quantum  started
Biology  started
Chemistry  started
Physics  started
Astronomy  started
Genetics  started
Neurology  started
Meteorology  started
Paleontology  started
Ecology  started
Botany  started
Zoology  started
Thermodynamics  started
Robotics  started
Nanotechnology  started
Geology  started
Hydrology  started
Sociology  started
Anthropology  started
Archaeology  started
Psychology  started
Psychiatry  started
Pharmacology  started
Pathology  started
Immunology  started
Virology  started
Oncology  started
Endocrinology  started
Cardiology  started
Dermatology  started
Gastroenterology  started
Nephrology  started
Ophthalmology  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 635.40ms


Failed to generate or parse content for keyword 'Ophthalmology': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Orthopedics  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 890.22ms


Failed to generate or parse content for keyword 'Orthopedics': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Pediatrics  started
Radiology  started
Surgery  started
Anesthesiology  started
Epidemiology  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 663.37ms


Failed to generate or parse content for keyword 'Epidemiology': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Biotechnology  started
Astrophysics  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 1092.06ms


Failed to generate or parse content for keyword 'Astrophysics': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Cosmology  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 535.85ms


Failed to generate or parse content for keyword 'Cosmology': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Particle  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 896.87ms


Failed to generate or parse content for keyword 'Particle': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Fusion  started
Algorithm  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 819.64ms


Failed to generate or parse content for keyword 'Algorithm': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Data  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 1017.06ms


Failed to generate or parse content for keyword 'Data': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Simulation  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 716.55ms


Failed to generate or parse content for keyword 'Simulation': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Hypothesis  started
Theory  started
Microscope  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 609.41ms


Failed to generate or parse content for keyword 'Microscope': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Telescope  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 1049.50ms


Failed to generate or parse content for keyword 'Telescope': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Satellite  started
Laboratory  started
Research  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 613.64ms


Failed to generate or parse content for keyword 'Research': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Innovation  started
Discovery  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 1016.25ms


Failed to generate or parse content for keyword 'Discovery': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Patent  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 1168.67ms


Failed to generate or parse content for keyword 'Patent': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Publication  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 1472.84ms


Failed to generate or parse content for keyword 'Publication': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Journal  started
Peer  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 865.89ms


Failed to generate or parse content for keyword 'Peer': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Experimentation  started
Observation  started
Analysis  started
Measurement  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 711.74ms


Failed to generate or parse content for keyword 'Measurement': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Quantum  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 764.37ms


Failed to generate or parse content for keyword 'Quantum': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Relativity  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 586.12ms


Failed to generate or parse content for keyword 'Relativity': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Evolution  started
Climate  started
Energy  started
Matter  started
Species  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 636.01ms


Failed to generate or parse content for keyword 'Species': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Ecosystem  started
Habitat  started
Biodiversity  started
Conservation  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 685.18ms


Failed to generate or parse content for keyword 'Conservation': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Pollution  started
Sustainability  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 1127.65ms


Failed to generate or parse content for keyword 'Sustainability': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Renewable  started
Fossil  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 660.90ms


Failed to generate or parse content for keyword 'Fossil': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Mineral  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 762.87ms


Failed to generate or parse content for keyword 'Mineral': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Seismology  started
Volcano  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 458.48ms


Failed to generate or parse content for keyword 'Volcano': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Glacier  started
Oceanography  started
Tide  started
Current  started
Atmosphere  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 510.04ms


Failed to generate or parse content for keyword 'Atmosphere': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Meteor  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 992.30ms


Failed to generate or parse content for keyword 'Meteor': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Galaxy  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 991.53ms


Failed to generate or parse content for keyword 'Galaxy': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Nebula  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 666.75ms


Failed to generate or parse content for keyword 'Nebula': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Black  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 1042.88ms


Failed to generate or parse content for keyword 'Black': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Supernova  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 842.62ms


Failed to generate or parse content for keyword 'Supernova': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Quark  started
Genome  started
Chromosome  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 520.32ms


Failed to generate or parse content for keyword 'Chromosome': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
DNA  started


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 838.06ms


Failed to generate or parse content for keyword 'DNA': 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
RNA  started
Protein  started
Enzyme  started
________________________Completed processing for row 1_________________


In [None]:
dataset.head()

Unnamed: 0,Subreddit,Keyword,Question
0,askdocs_train,Physician,"Hey doc, I've been getting these weird heart p..."
1,askdocs_train,Physician,My son has been complaining of joint pain and ...
2,askdocs_train,Physician,"I'm a healthy 30-year-old, but I've been havin..."
3,askdocs_train,Physician,Can you explain the difference between viral a...
4,askdocs_train,Physician,My blood sugar levels have been running a bit ...


In [None]:
# Write `dataset` to an Excel workbook in the current working directory, then
# pass that file to files.download (presumably google.colab.files — confirm the
# import in an earlier cell) so a copy is saved outside the runtime.
excel_filename = "dataset5.xlsx"
dataset.to_excel(excel_filename, index=False)  # index=False: the row index is not written as a column
files.download(excel_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Summary statistics for `dataset`; the frame is grouped by subreddit once and
# the grouping is reused for both per-subreddit figures.
by_subreddit = dataset.groupby('Subreddit')

total_rows = len(dataset)
rows_per_subreddit = by_subreddit.size()
unique_keywords_per_subreddit = by_subreddit['Keyword'].nunique()

for label, value in (
    ("Total number of rows:", total_rows),
    ("Number of rows per subreddit:\n", rows_per_subreddit),
    ("Number of unique keywords per subreddit:\n", unique_keywords_per_subreddit),
):
    print(label, value)

Total number of rows: 26885
Number of rows per subreddit:
 Subreddit
askacademia_train          1503
askanthropology_train      1497
askbaking_train            1484
askcarguys_train           1515
askculinary_train          1501
askdocs_train              1727
askengineers_train         1380
askhistorians_train        1326
askhr_train                1426
askphilosophy_train        2776
askphysics_train           1291
askscience_train           1006
asksciencefiction_train    1369
asksocialscience_train     1275
askvet_train               1352
changemyview_train         1754
explainlikeimfive_train    1320
legaladvice_train          1383
dtype: int64
Number of unique keywords per subreddit:
 Subreddit
askacademia_train           94
askanthropology_train      100
askbaking_train             99
askcarguys_train           101
askculinary_train           96
askdocs_train               87
askengineers_train          52
askhistorians_train         87
askhr_train                 95
askphilosop

# Combine Datasets and Perform Clustering

In [None]:
!pip install sentence-transformers scikit-learn

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [None]:
!pip install sentence-transformers



In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer
import torch
from sentence_transformers import SentenceTransformer

In [None]:
# The five partial workbooks sit in the same Drive folder and differ only by
# their number, so the paths are derived from one folder string.
drive_folder = "/content/drive/MyDrive/Colab DataSets/NLP Project"
path1, path2, path3, path4, path5 = (
    f"{drive_folder}/dataset{part}.xlsx" for part in range(1, 6)
)
# Read the workbooks in order; df1..df5 keep their original names for later cells.
df1, df2, df3, df4, df5 = (
    pd.read_excel(workbook) for workbook in (path1, path2, path3, path4, path5)
)

In [None]:
df5.head()

Unnamed: 0,Subreddit,Keyword,Question
0,askscience_train,Experiment,"Hey y'all! I'm curious, could we genetically e..."
1,askscience_train,Experiment,Is there a scientific explanation for why some...
2,askscience_train,Experiment,How do deep-sea creatures survive in environme...
3,askscience_train,Experiment,Can we use artificial intelligence to predict ...
4,askscience_train,Experiment,"I've heard about ""quantum entanglement."" Is it..."


In [None]:
dataset = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
dataset.head()

Unnamed: 0,Subreddit,Keyword,Question
0,askdocs_train,Physician,"Hey doc, I've been getting these weird heart p..."
1,askdocs_train,Physician,My son has been complaining of joint pain and ...
2,askdocs_train,Physician,"I'm a healthy 30-year-old, but I've been havin..."
3,askdocs_train,Physician,Can you explain the difference between viral a...
4,askdocs_train,Physician,My blood sugar levels have been running a bit ...


In [None]:
df5.shape[0]

1006

In [None]:
#Number of rows
dataset.shape[0]

26885

In [None]:
#Number of categories
dataset['Subreddit'].nunique()

18

In [None]:
dataset['Subreddit'].unique()

array(['askdocs_train', 'askanthropology_train', 'askculinary_train',
       'askbaking_train', 'askcarguys_train', 'askacademia_train',
       'askphilosophy_train', 'askphysics_train',
       'asksciencefiction_train', 'askengineers_train',
       'askhistorians_train', 'askhr_train', 'asksocialscience_train',
       'askvet_train', 'legaladvice_train', 'changemyview_train',
       'explainlikeimfive_train', 'askscience_train'], dtype=object)

In [None]:
# Identifier of the pretrained checkpoint whose tokenizer is loaded below.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
# NOTE(review): the embedding cells that follow go through SentenceTransformer and
# do not reference `tokenizer` — confirm whether this load is still needed.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Load the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(texts):
    """Encode `texts` with the notebook-level `embedding_model`.

    Args:
        texts: The sentences to embed; passed unchanged to `embedding_model.encode`.

    Returns:
        The result of `encode(..., convert_to_tensor=True)`, i.e. a tensor rather
        than a NumPy array. Callers that need NumPy call `.cpu().numpy()` on it.
    """
    return embedding_model.encode(
        texts,
        show_progress_bar=True,
        convert_to_tensor=True,
    )

In [None]:
embeddings = generate_embeddings(dataset['Question'].tolist())

Batches:   0%|          | 0/841 [00:00<?, ?it/s]

In [None]:
type(embeddings)

torch.Tensor

In [None]:
np.shape(embeddings)

torch.Size([26885, 384])

In [None]:
# Dimensionality Reduction
# Project the 384-dimensional sentence embeddings onto 50 principal components
# before clustering. random_state is pinned because PCA's default 'auto' solver
# may select the randomized SVD for a matrix of this size; without a seed the
# reduced embeddings (and therefore the K-Means clusters built on them) can
# differ between runs.
pca = PCA(n_components=50, random_state=0)
reduced_embeddings = pca.fit_transform(embeddings.cpu().numpy())

In [None]:
# Clustering: fit K-Means on the PCA-reduced embeddings and tag each row of
# `dataset` with the label of the cluster it was assigned to.
kmeans = KMeans(random_state=0, n_clusters=11).fit(reduced_embeddings)
clusters = kmeans.labels_
dataset['Cluster'] = clusters

# Print cluster sizes
cluster_sizes = dataset['Cluster'].value_counts()
print(cluster_sizes)



Cluster
0     3180
6     3147
3     3024
9     2852
2     2670
10    2660
1     2323
7     2151
4     1750
8     1735
5     1393
Name: count, dtype: int64


In [None]:
# Removing similar sentences
unique_dataset = pd.DataFrame()

In [None]:
"""
for cluster in range(11):
    cluster_data = dataset[dataset['Cluster'] == cluster]
    cluster_embeddings = generate_embeddings(cluster_data['Question'].tolist())
    cos_sim_matrix = cosine_similarity(cluster_embeddings.cpu().numpy())

    # Set the similarity of each item to itself to zero
    np.fill_diagonal(cos_sim_matrix, 0)

    to_remove = set()
    for i in range(len(cos_sim_matrix)):
        for j in range(i + 1, len(cos_sim_matrix)):
            if cos_sim_matrix[i][j] > 0.95:  # Threshold for similarity
                to_remove.add(j)

    # Drop similar sentences
    cluster_data = cluster_data.drop(cluster_data.index[list(to_remove)])
    unique_dataset = pd.concat([unique_dataset, cluster_data], ignore_index=True)

    # Early exit if target size is reached
    if len(unique_dataset) <= 20000:
        break

unique_dataset = unique_dataset.iloc[:20000]  # Ensure exactly 20000 entries
"""

Batches:   0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Adjust the similarity threshold
"""
similarity_threshold = 0.98

# Process each cluster
for cluster in range(11):
    cluster_data = dataset[dataset['Cluster'] == cluster]
    if len(unique_dataset) + len(cluster_data) > 20000:
        # Calculate how many more rows are needed to reach 20,000
        needed = 20000 - len(unique_dataset)
        cluster_data = cluster_data.sample(n=needed, random_state=42)
    elif len(unique_dataset) >= 20000:
        break

    cluster_embeddings = generate_embeddings(cluster_data['Question'].tolist()).cpu().numpy()
    cos_sim_matrix = cosine_similarity(cluster_embeddings)

    # Set the similarity of each item to itself to zero
    np.fill_diagonal(cos_sim_matrix, 0)

    to_remove = set()
    for i in range(len(cos_sim_matrix)):
        for j in range(i + 1, len(cos_sim_matrix)):
            if cos_sim_matrix[i][j] > similarity_threshold:
                to_remove.add(j)

    # Drop similar sentences
    cluster_data = cluster_data.drop(cluster_data.index[list(to_remove)])
    unique_dataset = pd.concat([unique_dataset, cluster_data], ignore_index=True)

    # Early exit if target size is reached
    if len(unique_dataset) >= 20000:
        break

unique_dataset = unique_dataset.iloc[:20000]  # Ensure exactly 20,000 entries
"""

Batches:   0%|          | 0/100 [00:00<?, ?it/s]

Batches:   0%|          | 0/73 [00:00<?, ?it/s]

Batches:   0%|          | 0/84 [00:00<?, ?it/s]

Batches:   0%|          | 0/95 [00:00<?, ?it/s]

Batches:   0%|          | 0/55 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/99 [00:00<?, ?it/s]

Batches:   0%|          | 0/68 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
#unique_dataset.shape[0]

20000

In [None]:
#unique_dataset['Subreddit'].unique()

array(['askdocs_train', 'askanthropology_train', 'askculinary_train',
       'askacademia_train', 'askphilosophy_train', 'askphysics_train',
       'asksciencefiction_train', 'askengineers_train',
       'askhistorians_train', 'askhr_train', 'asksocialscience_train',
       'askvet_train', 'legaladvice_train', 'changemyview_train',
       'explainlikeimfive_train', 'askscience_train', 'askbaking_train',
       'askcarguys_train'], dtype=object)

In [None]:
#unique_dataset.to_csv('Method1_dataset.csv', index=False)
#files.download('Method1_dataset.csv')

In [None]:
"""
subreddit_counts = unique_dataset.groupby('Subreddit').size()
# Print the count of entries for each subreddit
print(subreddit_counts)
"""

Subreddit
askacademia_train          1479
askanthropology_train      1482
askbaking_train              32
askcarguys_train           1460
askculinary_train           126
askdocs_train              1695
askengineers_train         1328
askhistorians_train        1311
askhr_train                1418
askphilosophy_train        2634
askphysics_train            604
askscience_train            733
asksciencefiction_train    1243
asksocialscience_train     1256
askvet_train               1342
changemyview_train          392
explainlikeimfive_train      98
legaladvice_train          1367
dtype: int64


In [None]:
# Removing similar sentences
# unique_dataset = pd.DataFrame()

In [None]:
# Process each cluster
"""
for cluster in range(11):
    cluster_data = dataset[dataset['Cluster'] == cluster]
    cluster_embeddings = generate_embeddings(cluster_data['Question'].tolist()).cpu().numpy()
    cos_sim_matrix = cosine_similarity(cluster_embeddings)

    # Only consider the upper triangular part of the matrix, excluding the diagonal
    upper_tri_indices = np.triu_indices_from(cos_sim_matrix, k=1)

    to_remove = set()
    for i, j in zip(*upper_tri_indices):
        if cos_sim_matrix[i][j] > similarity_threshold:  # Check only upper triangular entries
            to_remove.add(j)  # Choose to remove the second sentence of the pair

    # Drop similar sentences
    cluster_data = cluster_data.drop(cluster_data.index[list(to_remove)])
    unique_dataset = pd.concat([unique_dataset, cluster_data], ignore_index=True)

    # Early exit if target size is reached
    if len(unique_dataset) >= 20000:
        break

unique_dataset = unique_dataset.iloc[:20000]  # Ensure exactly 20,000 entries
"""

Batches:   0%|          | 0/100 [00:00<?, ?it/s]

Batches:   0%|          | 0/73 [00:00<?, ?it/s]

Batches:   0%|          | 0/84 [00:00<?, ?it/s]

Batches:   0%|          | 0/95 [00:00<?, ?it/s]

Batches:   0%|          | 0/55 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/99 [00:00<?, ?it/s]

Batches:   0%|          | 0/68 [00:00<?, ?it/s]

Batches:   0%|          | 0/55 [00:00<?, ?it/s]

In [None]:
#unique_dataset.shape[0]

20000

In [None]:
#unique_dataset.groupby('Subreddit').size()

Subreddit
askacademia_train          1479
askanthropology_train      1482
askbaking_train              20
askcarguys_train           1460
askculinary_train           117
askdocs_train              1695
askengineers_train         1328
askhistorians_train        1311
askhr_train                1418
askphilosophy_train        2634
askphysics_train            604
askscience_train            733
asksciencefiction_train    1243
asksocialscience_train     1256
askvet_train               1342
changemyview_train          413
explainlikeimfive_train      98
legaladvice_train          1367
dtype: int64


In [None]:
#dataset.groupby('Subreddit').size()

Subreddit
askacademia_train          1503
askanthropology_train      1497
askbaking_train            1484
askcarguys_train           1515
askculinary_train          1501
askdocs_train              1727
askengineers_train         1380
askhistorians_train        1326
askhr_train                1426
askphilosophy_train        2776
askphysics_train           1291
askscience_train           1006
asksciencefiction_train    1369
asksocialscience_train     1275
askvet_train               1352
changemyview_train         1754
explainlikeimfive_train    1320
legaladvice_train          1383
dtype: int64

# Similarity scores across clusters

In [None]:
similarity_threshold = 0.98

In [None]:
# Number of questions to keep once near-duplicates have been removed.
TARGET_SIZE = 20000

# Use the cluster labels actually present in the data instead of a hard-coded
# range(11), so this cell stays correct if n_clusters is changed in the K-Means cell.
cluster_ids = sorted(int(label) for label in dataset['Cluster'].unique())

# --- 1. Collect candidate near-duplicate pairs, one cluster at a time ---------
# Each entry is (cosine similarity, i, j, cluster), where i < j are row
# *positions* inside that cluster's slice of `dataset` (not index labels).
similarity_pairs = []
for cluster in cluster_ids:
    cluster_data = dataset[dataset['Cluster'] == cluster]
    cluster_embeddings = generate_embeddings(cluster_data['Question'].tolist()).cpu().numpy()
    cos_sim_matrix = cosine_similarity(cluster_embeddings)

    # Vectorized replacement for looping over every upper-triangle entry in
    # Python: build the boolean "above threshold" mask, keep only the strict
    # upper triangle (k=1 drops the diagonal and the mirrored lower half), and
    # read off the surviving positions. np.nonzero yields them in row-major
    # order, the same order np.triu_indices_from would produce.
    above_threshold = np.triu(cos_sim_matrix > similarity_threshold, k=1)
    rows, cols = np.nonzero(above_threshold)
    similarity_pairs.extend(
        (cos_sim_matrix[i, j], i, j, cluster)
        for i, j in zip(rows.tolist(), cols.tolist())
    )

# --- 2. Rank all pairs globally, most similar first ----------------------------
# Sorting is done across clusters (not within a cluster); the sort is stable, so
# ties keep their collection order.
similarity_pairs.sort(reverse=True, key=lambda pair: pair[0])

# --- 3. Greedily pick which sentences to drop ----------------------------------
# For each pair, drop the second sentence unless either member has already been
# dropped (in which case the pair is already resolved).
to_remove = set()
for _, i, j, cluster in similarity_pairs:
    if (cluster, i) in to_remove or (cluster, j) in to_remove:
        continue
    to_remove.add((cluster, j))

# --- 4. Rebuild the dataset without the dropped rows ---------------------------
# Collect the per-cluster survivors in a list and concatenate once, rather than
# growing a DataFrame with pd.concat on every iteration.
kept_parts = []
for cluster in cluster_ids:
    cluster_data = dataset[dataset['Cluster'] == cluster]
    indices_to_keep = [idx for idx in range(len(cluster_data)) if (cluster, idx) not in to_remove]
    kept_parts.append(cluster_data.iloc[indices_to_keep])
unique_dataset = pd.concat(kept_parts, ignore_index=True) if kept_parts else pd.DataFrame()

# --- 5. Trim to the target size -------------------------------------------------
# Down-sample with a fixed seed when too many rows survive; say so explicitly
# when too few survive instead of silently returning a short dataset.
if len(unique_dataset) > TARGET_SIZE:
    unique_dataset = unique_dataset.sample(n=TARGET_SIZE, random_state=42)
elif len(unique_dataset) < TARGET_SIZE:
    print(f"Warning: only {len(unique_dataset)} entries remain, fewer than the target of {TARGET_SIZE}")
print(f"Remaining entries: {len(unique_dataset)}")


Batches:   0%|          | 0/100 [00:00<?, ?it/s]

Batches:   0%|          | 0/73 [00:00<?, ?it/s]

Batches:   0%|          | 0/84 [00:00<?, ?it/s]

Batches:   0%|          | 0/95 [00:00<?, ?it/s]

Batches:   0%|          | 0/55 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/99 [00:00<?, ?it/s]

Batches:   0%|          | 0/68 [00:00<?, ?it/s]

Batches:   0%|          | 0/55 [00:00<?, ?it/s]

Batches:   0%|          | 0/90 [00:00<?, ?it/s]

Batches:   0%|          | 0/84 [00:00<?, ?it/s]

Remaining entries: 20000


In [None]:
unique_dataset.shape[0]

20000

In [None]:
dataset.groupby('Subreddit').size() - unique_dataset.groupby('Subreddit').size()

Subreddit
askacademia_train          381
askanthropology_train      345
askbaking_train            394
askcarguys_train           340
askculinary_train          373
askdocs_train              405
askengineers_train         346
askhistorians_train        328
askhr_train                338
askphilosophy_train        634
askphysics_train           302
askscience_train           268
asksciencefiction_train    329
asksocialscience_train     290
askvet_train               335
changemyview_train         622
explainlikeimfive_train    527
legaladvice_train          328
dtype: int64

In [None]:
dataset.groupby('Subreddit').size()

Subreddit
askacademia_train          1503
askanthropology_train      1497
askbaking_train            1484
askcarguys_train           1515
askculinary_train          1501
askdocs_train              1727
askengineers_train         1380
askhistorians_train        1326
askhr_train                1426
askphilosophy_train        2776
askphysics_train           1291
askscience_train           1006
asksciencefiction_train    1369
asksocialscience_train     1275
askvet_train               1352
changemyview_train         1754
explainlikeimfive_train    1320
legaladvice_train          1383
dtype: int64

In [None]:
# Drop the helper columns in one call, then write the remaining columns to a
# CSV file and pass it to files.download.
df1 = unique_dataset.drop(columns=['Keyword', 'Cluster'])
csv_filename = 'Method1_dataset.csv'
df1.to_csv(csv_filename, index=False)
files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# torch.utils.data Class

In [None]:
import pandas as pd
from google.colab import drive
from google.colab import files
# Mount Google Drive at /content/drive so the CSV below can be read from it.
drive.mount('/content/drive')
# NOTE(review): the export cell earlier in this notebook writes Method1_dataset.csv
# to the working directory and calls files.download; nothing visible here copies it
# into this Drive folder — presumably it was uploaded by hand. Confirm before
# relying on a clean top-to-bottom run.
path = "/content/drive/MyDrive/Colab DataSets/NLP Project/Method1_dataset.csv"
dataset = pd.read_csv(path)  # rebinds `dataset` to the frame read from this CSV
dataset.head()

Unnamed: 0,Subreddit,Question
0,askcarguys_train,What are the signs of a failing transmission a...
1,askengineers_train,**I'm stumped:** How can I optimize the design...
2,askacademia_train,What are the key differences between practicum...
3,askengineers_train,I'm curious about the latest advances in machi...
4,changemyview_train,CMV: Modern art is subjective and its value is...


In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [None]:
# Wrap the DataFrame in a torch.utils.data.Dataset
class SubredditQuestionDataset(Dataset):
    """Dataset that serves rows of a DataFrame as dicts.

    Each item is a dict holding the row's 'Subreddit' and 'Question' values.
    """

    def __init__(self, dataframe):
        """Keep a reference to `dataframe`; it must have 'Subreddit' and 'Question' columns."""
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.dataframe.shape[0]

    def __getitem__(self, index):
        """Return the row at position `index` as {'Subreddit': ..., 'Question': ...}."""
        record = self.dataframe.iloc[index]  # positional lookup, independent of index labels
        return {field: record[field] for field in ('Subreddit', 'Question')}

In [None]:
subreddit_question_dataset = SubredditQuestionDataset(dataset)

In [None]:
#Can use data_loader in training loop
data_loader = DataLoader(subreddit_question_dataset, batch_size=10, shuffle=True)

In [None]:
type(data_loader)

In [None]:
import os

import google.generativeai as genai
from google.colab import userdata

# SECURITY: API keys must never be written into the notebook. Any key that was
# previously committed in this cell should be treated as leaked and revoked.
# The key is read from the GOOGLE_API_KEY environment variable when set;
# otherwise it comes from Colab's secret store, using the same secret name
# ('GOOGL_API_KEY') that the setup cell at the top of this notebook reads.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY') or userdata.get('GOOGL_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Quick smoke test: send one prompt to the model and print the text of the reply.
model = genai.GenerativeModel('gemini-1.5-pro-latest')
response = model.generate_content("Where does the sun rise?")
print(response.text)

The sun always rises in the **east**. 

While the exact location on the horizon where it rises can vary slightly throughout the year due to the Earth's tilt, it will always rise in the general direction of east. 

