In [1]:
import pandas as pd

# Load CSV
df = pd.read_csv('techcrunch.csv')

# Preprocess content (if needed)
df['Content'] = df['Content'].fillna('')  # Handle missing content if any
combined_texts = df['Content'].tolist()



In [3]:
print(combined_texts)

['Comment Back in 2019, Synex Medical founder Ben Nashman spent the night detained by US customs. Nashman tried to explain he was simply transporting materials from Buffalo to Toronto for his homemade MRI. Customs, however, took issue with the label on the package: “nuclear magnetic resonance.” Nashman spent hours in a bright waiting room before he finally convinced them that he was really just a run-of-the-mill 18-year-old scientist with an obsession with MRI technology. They let him take his roughly 80-pound magnet and he zoomed back to Toronto. “I got back at like 3 or 4 am and got a few hours of sleep before classes,” he said. Nashman, now 24, might have landed himself on a list of suspicious individuals, but he insists it was worth it: that one very long night was part of his years-long journey to build a portable MRI capable of testing glucose and other important molecules without the need to extract blood. Today, the company is one step closer to that goal, announcing a $21.8 mi

In [18]:
from sentence_transformers import SentenceTransformer

# Load pre-trained transformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Create embeddings for each article's content
embeddings = model.encode(combined_texts)



In [17]:
import faiss
import numpy as np

# Create a FAISS index
d = embeddings.shape[1]  # Embedding dimension
index = faiss.IndexFlatL2(d)  # L2 distance for similarity

# Convert embeddings to a NumPy array and add them to the index
embeddings_np = np.array(embeddings).astype('float32')
index.add(embeddings_np)

# Check the number of vectors in the index
print(f"Total articles indexed: {index.ntotal}")

Total articles indexed: 105


In [19]:
def search_articles(query, top_n=5):
    # Embed the query
    query_embedding = model.encode([query])

    # Search the FAISS index
    distances, indices = index.search(np.array(query_embedding).astype('float32'), top_n)

    # Return the results
    results = []
    for idx in indices[0]:
        result = {
            'Title': df.iloc[idx]['Title'],
            'Author': df.iloc[idx]['Author'],
            'URL': df.iloc[idx]['URL'],
            'Content': df.iloc[idx]['Content']
        }
        results.append(result)
    
    return results

# Example search
# query = input("how can I help you today?: ")
# results = search_articles(query)

# for res in results:
#     print(f"Title: {res['Title']}\nAuthor: {res['Author']}\nURL: {res['URL']}\nContent: {res['Content'][:200]}...\n")

In [20]:
from openai import OpenAI
client = OpenAI()
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Set OpenAI API key
client.api_key = os.getenv("OPENAI_API_KEY")
systemPrompt = "you are the best assistant ever and you answers are full with enthousiasm"

# Function to enhance search results using GPT
def enhance_with_gpt(articles):
    enhanced_results = []
    
    for article in articles:
        # Format content for GPT
        prompt = f"Please summarize the following article details in a more conversational tone:\n\n"
        prompt += f"Title: {article['Title']}\n"
        prompt += f"Author: {article['Author']}\n"
        prompt += f"Content Snippet: {article['Content'][:500]}\n\n"
        prompt += "Make the summary user-friendly and informative."
        
        # Use client.chat.completions
        response = client.chat.completions.create(
            model="gpt-4o-mini",  
            messages=[
                {"role": "system", "content": systemPrompt},
                {"role": "user", "content": prompt}
            ],
            max_tokens=200,
            temperature=0.7
        )
        
        # Extract the response content
        
        enhanced_content = response.choices[0].message.content
        
        # Append enhanced content to the result
        enhanced_results.append({
            'Title': article['Title'],
            'Author': article['Author'],
            'URL': article['URL'],
            'Enhanced Content': enhanced_content
        })
        
    return enhanced_results

In [21]:
# Example enhanced search
query = input("how can i help you today")
faiss_results = search_articles(query)

# Enhance with GPT
enhanced_results = enhance_with_gpt(faiss_results)

# Display the results
for res in enhanced_results:
    print(f"Title: {res['Title']}\nAuthor: {res['Author']}\nURL: {res['URL']}\nEnhanced Summary: {res['Enhanced Content']}\n")

Title: Fal.ai, which hosts media-generating AI models, raises $23M from a16z and others
Author: Kyle Wiggers
URL: https://techcrunch.com/2024/09/18/fal-ai-which-hosts-media-generating-ai-models-raises-23m-from-a16z-and-others/
Enhanced Summary: Hey there! Exciting news in the world of AI! Fal.ai, a cool platform that focuses on generating audio, video, and images using AI, just announced it has raised a whopping $23 million in funding! 🎉 Their investors include big names like Andreessen Horowitz (a16z), Robin Rombach from Black Forest Labs, and Aravind Srinivas, the CEO of Perplexity.

This funding is part of a two-round deal: they secured $14 million in a Series A round led by Kindred Ventures, and the remaining $9 million came from a previously unannounced seed round led by a16z. The founders, Burkay Gur and Gorkem Yurtseven, are clearly making waves in the tech scene! Can't wait to see what they do next! 🌟

Title: Virtuous, a fundraising CRM for nonprofits, raises $100M from Susqueh

In [3]:
import os
from dotenv import load_dotenv
import openai
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
import pandas as pd

# Load environment variables from .env file
load_dotenv()

# Initialize OpenAI API
openai.api_key = os.getenv("OPENAI_API_KEY")

# Initialize Pinecone
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key, environment="us-east-1")  # Change environment based on your region

# Create or connect to a Pinecone index
index_name = "index-article"
index = pc.Index(index_name)

# Initialize the model for generating embeddings
model = SentenceTransformer('paraphrase-MiniLM-L12-v2')  

# Load CSV
df = pd.read_csv('techcrunch.csv')
# Preprocess content (if needed)
df['Content'] = df['Content'].fillna('')  # Handle missing content if any
articles = df.to_dict(orient='records')

# Function to add articles to Pinecone
def add_articles_to_pinecone(articles):
    for idx, article in enumerate(articles):
        # Create the embedding for the article content
        embedding = model.encode(article['Content'])
        # Upload the vector to Pinecone
        index.upsert(vectors=[(str(idx), embedding)])

# Function to search articles using Pinecone
def search_articles(query):
    # Create the query vector
    query_embedding = model.encode(query)
    
    # Convert the NumPy array to a list
    query_embedding = query_embedding.tolist()
    # Query Pinecone for similar articles
    result = index.query(queries=[query_embedding], top_k=5)
    # Retrieve matched article IDs
    matched_articles = result['matches']
    return matched_articles

# Function to enhance search results using GPT
def enhance_with_gpt(articles):
    enhanced_results = []
    for article in articles:
        # Format content for GPT
        prompt = f"Please summarize the following article details in a more conversational tone:\n\n"
        prompt += f"Title: {article['Title']}\n"
        prompt += f"Author: {article['Author']}\n"
        prompt += f"Content Snippet: {article['Content'][:500]}\n\n"
        prompt += "Make the summary user-friendly and informative."
        # Use client.chat.completions
        response = openai.Completion.create(
            model="gpt-4o-mini",
            prompt=prompt,
            max_tokens=200,
            temperature=0.7
        )
        # Extract the response content
        enhanced_content = response.choices[0].text.strip()
        # Append enhanced content to the result
        enhanced_results.append({
            'Title': article['Title'],
            'Author': article['Author'],
            'URL': article['URL'],
            'Enhanced Content': enhanced_content
        })
    return enhanced_results

# Example workflow
# Add articles to Pinecone
add_articles_to_pinecone(articles)

# Example search query
query = input("how can I help today: ")
# Search in Pinecone
pinecone_results = search_articles(query)

# Retrieve full articles based on Pinecone search results
matched_articles = [articles[int(match['id'])] for match in pinecone_results]

# Enhance with GPT
enhanced_results = enhance_with_gpt(matched_articles)

# Print the enhanced results
for res in enhanced_results:
    print(f"Title: {res['Title']}\nAuthor: {res['Author']}\nURL: {res['URL']}\nEnhanced Content: {res['Enhanced Content']}\n")




PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Wed, 25 Sep 2024 01:34:49 GMT', 'Content-Type': 'application/json', 'Content-Length': '103', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '38', 'x-pinecone-request-id': '4881108236369496423', 'x-envoy-upstream-service-time': '38', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Vector dimension 384 does not match the dimension of the index 1536","details":[]}


In [9]:
import os
import openai
from pinecone import Pinecone, ServerlessSpec
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
client = OpenAI()

# Load environment variables from .env file
load_dotenv()

# Initialize OpenAI API
client.api_key = os.getenv("OPENAI_API_KEY")

# Initialize Pinecone
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key, environment="us-east-1")  # Change environment based on your region

# Create or connect to a Pinecone index
index_name = "index-article"
index = pc.Index(index_name)

# Load CSV
df = pd.read_csv('techcrunch.csv')
df['Content'] = df['Content'].fillna('')  # Handle missing content if any
articles = df.to_dict(orient='records')

# Function to generate embeddings using OpenAI
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

# Function to add articles to Pinecone
def add_articles_to_pinecone(articles):
    vectors = []
    for idx, article in enumerate(articles):
        # Create the embedding for the article content using OpenAI
        embedding = get_embedding(article['Content'])
        # Add to list of vectors
        vectors.append((str(idx), embedding))
    # Upload the vector batch to Pinecone
    index.upsert(vectors=vectors, namespace="ns1")

def search_articles(query):
    # Create the query embedding using OpenAI
    query_embedding = get_embedding(query)
    
    # # Query Pinecone for similar articles
    # result = index.query(queries=[query_embedding], top_k=5)
    # # Retrieve matched article IDs
    # matched_articles = result['matches']
    pinecone_results = index.query(
    namespace="ns1",
    vector=query_embedding[0],
    top_k=5)
    return pinecone_results

# Function to enhance search results using GPT
def enhance_with_gpt(articles):
    enhanced_results = []
    for article in articles:
        # Format content for GPT
        prompt = f"Please summarize the following article details in a more conversational tone:\n\n"
        prompt += f"Title: {article['Title']}\n"
        prompt += f"Author: {article['Author']}\n"
        prompt += f"Content Snippet: {article['Content'][:500]}\n\n"
        prompt += "Make the summary user-friendly and informative."
        
        # Use OpenAI's Chat API to enhance the results
        response = client.Completion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=200,
            temperature=0.7
        )
        enhanced_content = response['choices'][0]['message']['content'].strip()
        
        # Append enhanced content to the result
        enhanced_results.append({
            'Title': article['Title'],
            'Author': article['Author'],
            'URL': article['URL'],
            'Enhanced Content': enhanced_content
        })
    return enhanced_results

# Example workflow
# Add articles to Pinecone
add_articles_to_pinecone(articles)

# Example search query
query = input("How can I help today: ")
embedded_query = search_articles(query)

# Search in Pinecone
#pinecone_results = search_articles(query)

# Function to search articles using Pinecone

# Retrieve full articles based on Pinecone search results
matched_articles = [articles[int(match['id'])] for match in pinecone_results]

# Enhance with GPT
enhanced_results = enhance_with_gpt(matched_articles)

# Print the enhanced results
for res in enhanced_results:
    print(f"Title: {res['Title']}\nAuthor: {res['Author']}\nURL: {res['URL']}\nEnhanced Content: {res['Enhanced Content']}\n")

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Wed, 25 Sep 2024 02:07:56 GMT', 'Content-Type': 'application/json', 'Content-Length': '101', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '47', 'x-pinecone-request-id': '4562747807993570371', 'x-envoy-upstream-service-time': '48', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Vector dimension 1 does not match the dimension of the index 1536","details":[]}
