# Read in CSV of tweets and get GPT analysis

### Run all cells:
  - `ctrl`+`shift`+`P` -> "run all cells in notebook"

    Or
  - Runtime > Run all

    Or
  - `ctrl` + `F9`
### Once complete, final excel files will be saved to the local runtime environment

# Read Scraped Tweets and Do Some Cleanup

## Download the CSV from the scraped epidural tweets

In [None]:
import requests

# URL of the scraped-tweets CSV.
url = 'https://media.githubusercontent.com/media/kswanjitsu/epidural/main/filtered_df.csv'

# A browser-like User-Agent header is sent with the request. The timeout keeps
# the notebook from hanging indefinitely on a stalled connection.
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=60)

# Fail here on an HTTP error status (404, 5xx, ...) instead of letting a later
# cell try to parse an error page as CSV.
resp.raise_for_status()
resp

### Read data to pandas DataFrame

In [None]:
import io
import pandas as pd

# Wrap the downloaded text in an in-memory file object so pandas can parse it
# the same way it would parse a CSV file on disk.
df = pd.read_csv(filepath_or_buffer=io.StringIO(resp.text))
df

## Remove Dupes and Retweets

In [None]:
# Rows without tweet text cannot be analysed, and a missing value would put
# NaN into the retweet mask below, which cannot be negated with `~`.
df_with_text = df.dropna(subset=['cleaned_tweet'])

# Keep the first occurrence of each distinct tweet text.
df_no_dupes = df_with_text.drop_duplicates(subset=['cleaned_tweet'])

# Retweets are the rows whose text starts with 'rt @'. `na=False` makes any
# remaining non-string value count as "not a retweet" rather than NaN.
is_retweet = df_no_dupes['cleaned_tweet'].str.startswith('rt @', na=False)

# `.copy()` yields an independent frame, so adding columns to `df` later does
# not raise SettingWithCopyWarning.
df_no_dupes_or_rts = df_no_dupes[~is_retweet].copy()
df = df_no_dupes_or_rts
df

# GPT Analysis Section

## Count tokens per tweet

In [None]:
!pip install --upgrade tiktoken

In [None]:
import tiktoken


# Name of the OpenAI model used by this notebook. To switch models, swap which
# of the two assignments below is commented out.
# gpt_model = 'gpt-3.5-turbo'
gpt_model = 'gpt-4'

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` is split into by the named tiktoken encoding."""
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)


# The encoding name depends only on the model, so resolve it once here instead
# of once per row inside the applied function.
encoding_name = tiktoken.encoding_for_model(gpt_model).name

# Token count of every cleaned tweet under that encoding.
df['tok_per_twt'] = df['cleaned_tweet'].apply(num_tokens_from_string, args=(encoding_name,))
df

### Install OpenAI and Langchain libraries

In [None]:
!pip install --upgrade openai langchain

## Set up the OpenAI key and instantiate a chat model to make the API calls

In [None]:
import openai
import os

# Paste your key in place of the placeholder, or leave the placeholder as-is
# and provide the key through the OPENAI_API_KEY environment variable.
pasted_key = '<your_open_ai_key>'

if pasted_key == '<your_open_ai_key>':
    # Placeholder untouched: use a key already present in the environment
    # rather than overwriting it with the placeholder text.
    openai.api_key = os.environ.get('OPENAI_API_KEY', pasted_key)
else:
    openai.api_key = pasted_key

# Mirror the chosen key into the environment as well.
os.environ['OPENAI_API_KEY'] = openai.api_key

In [None]:
from langchain.chat_models import ChatOpenAI

# Chat client for the model named in `gpt_model`, with sampling temperature 0.0.
chat = ChatOpenAI(
    model_name=gpt_model,
    temperature=0.0,
)
chat

### Prompt template

Give the model instructions to interpret the message and format a response.

**_text_** is a single string containing the list of tweets, built in a subsequent cell

**_format_instructions_** is the formatting-instruction text generated by a LangChain output parser, also built in a subsequent cell

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt skeleton. `{text}` and `{format_instructions}` are placeholders that
# are filled in when `prompt.format_messages(...)` is called for each batch.
# The tweets are wrapped in triple backticks so the model can tell them apart
# from the instructions.
template_string = '''There is a list of posts demarcated by triple backticks (```).

For each item in the list, determine:
(1) if the author had an epidural or not
(2) if their opinion of epidurals is positive, negative, or neutral
(3) if their opinion on natural childbirth is positive, negative, or neutral

```{text}```

Respond with a list of python dictionaries corresponding to each message in the list.
{format_instructions}

'''

#### Format instructions:
- We want the model to return a list of JSON objects.
- This code describes each of the fields in those JSON objects, so that we can parse the responses and convert them more easily into usable data.

In [None]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser


# One ResponseSchema per field the model is asked to return for each tweet.
# NOTE: the multi-line descriptions use a backslash line continuation inside
# the string literal, so the leading spaces of each continued line are part of
# the description text.

# Number printed in front of each tweet in the prompt.
index_schema = ResponseSchema(name="index",
                            description="The index number in front of the list entry")

# Main topic of the tweet: epidurals (1), natural (0), or undetermined (NaN).
about_epi_schema = ResponseSchema(name="about_epi",
                            description="Is the author mainly talking about epidurals or natural?\
                            1 for 'epidurals', 0 for 'natural', NaN if you could not determine.\
                            Response should ONLY be one of: [1, 0, NaN]")
# Whether the author had an epidural: yes (1), no (0), or undetermined (NaN).
had_epi_schema = ResponseSchema(name="had_epi",
                            description="Did the author have an epidural?\
                            1 for 'yes', 0 for 'no', NaN if you could not determine.\
                            Response should ONLY be one of: [1, 0, NaN]")
# Sentiment towards epidurals: positive (1), neutral (0), negative (-1), or NaN.
epi_pos_schema = ResponseSchema(name="epi_pos",
                            description="Does the message display a positive sentiment towards epidurals?\
                            1 for 'yes', 0 for 'neutral', -1 for 'negative sentiment', NaN if you could not determine.\
                            Response should ONLY be one of: [1, 0, -1, NaN]")

# Sentiment towards natural births: positive (1), neutral (0), negative (-1), or NaN.
nat_pos_schema = ResponseSchema(name="nat_pos",
                            description="Does the message display a positive sentiment towards natural births?\
                            1 for 'yes', 0 for 'neutral', -1 for 'negative sentiment', NaN if you could not determine.\
                            Response should ONLY be one of: [1, 0, -1, NaN]")

response_schemas = [index_schema, about_epi_schema, had_epi_schema, epi_pos_schema, nat_pos_schema]

# The parser built from the schemas supplies the text for the prompt's
# `{format_instructions}` placeholder; the prompt is built from `template_string`.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
prompt = ChatPromptTemplate.from_template(template=template_string)


## Sample the full dataset and remove no-longer-needed columns

In [None]:
import numpy as np

num_samples = 40000

# `DataFrame.sample` raises ValueError when asked for more rows than exist
# (it samples without replacement by default), so cap the request at the number
# of rows available.
sample_size = min(num_samples, len(df))
sampled_rows = df.sample(n=sample_size)

############## Only keep the cleaned_tweets for this section ############
# Each row's label in `df` is stored as `old_index`, so it will be easy to merge
# the results DF and the original. `assign` returns a new frame, which avoids
# assigning a column into a selection of another frame.
sample_of_tweets_df = sampled_rows[['cleaned_tweet']].assign(old_index=sampled_rows.index)

# Reset the index to 0..n-1 so it will be easier to group the tweets and send
# them as batches. The previous index is kept as an 'index' column.
sample_of_tweets_df = sample_of_tweets_df.reset_index()
sample_of_tweets_df

## Send the tweets to the model

### The groupby takes `group_size` rows from `sample_of_tweets_df` at a time, then builds a list of the tweets preceded by their original index. The format instructions are already embedded into the prompt. Here we are creating the `text` component.
- For each batch, get and parse the responses. The response dictionaries are appended to a list of response dictionaries.
- Lots of debugging prints commented out. Feel free to uncomment to see what's going on, but it may add significant amounts of text.

In [None]:
import json
import regex as re


all_responses_list_of_dicts = list()

# Safety switch: while False, the prompts are only built and collected in
# `all_messages`; nothing is sent to the chat model.
send_message = False
all_messages = []


group_size = 20

# Captures the inside of one flat `{...}` object in the model's reply.
# Compiled once here instead of being looked up on every iteration.
json_object_pattern = re.compile(r'\{([^{]*?)\}')

for i, g in sample_of_tweets_df.groupby(sample_of_tweets_df.index // group_size):
    try:
        # print('-'*50)
        print(f'\rGroup #{i}', end='')
        msg_list = []
        # print(g.to_dict())
        for old_index, tweet in zip(g['old_index'], g['cleaned_tweet']):
            # print(f'Adding tweet {old_index}: {tweet}')
            # Send the tweet text itself rather than the repr of its UTF-8
            # bytes (b'...' with escaped non-ASCII characters). Whitespace is
            # collapsed so every tweet stays on a single line of the list.
            one_line_tweet = ' '.join(tweet.split())
            msg_list.append(f'{old_index}) {one_line_tweet}')
        messages = prompt.format_messages(text='\n'.join(msg_list),
                                          format_instructions=format_instructions)
        all_messages.append(messages[0].content)
        # print(messages[0].content)
        if send_message:
            response = chat(messages)
            # Parse each object on its own, so one malformed item does not
            # discard the rest of the batch.
            for item in json_object_pattern.findall(response.content):
                try:
                    all_responses_list_of_dicts.append(json.loads(f'{{{item}}}'))
                except json.JSONDecodeError as e:
                    print(f'\nGroup #{i}: skipping unparseable item ({e}): {item!r}')
    except Exception as e:
        # Best effort: a failing batch is reported, with its group number, and
        # the loop moves on to the next batch.
        print(f'\nGroup #{i} failed: {e!r}')

# print('='*50)
# print(all_responses_list_of_dicts)

In [None]:
# Report how many response dicts were collected, then show the first two.
print(len(all_responses_list_of_dicts), 'Sample...:', sep='\n')
all_responses_list_of_dicts[:2]

### Put the list of response dictionaries into a DataFrame

In [None]:
# Show full cell contents instead of truncating long strings.
pd.set_option('display.max_colwidth', None)


temp_df = pd.DataFrame(all_responses_list_of_dicts)

if 'index' not in temp_df.columns:
    # Nothing usable was collected (for example `send_message` was left False).
    # Create the join key anyway so the merges below produce empty results
    # instead of failing with a KeyError.
    print('No model responses with an "index" field were collected; results will be empty.')
    temp_df['index'] = pd.Series(dtype='float64')

# The index comes back from the model and may not always be numeric. Coerce
# bad values to NaN and drop those rows, rather than aborting the whole run.
temp_df['index'] = pd.to_numeric(temp_df['index'], errors='coerce')
unusable_index = temp_df['index'].isna()
if unusable_index.any():
    print(f'Dropping {int(unusable_index.sum())} response(s) without a numeric index.')
temp_df = temp_df[~unusable_index]
temp_df

## Merge the results with the original `sample_of_tweets_df` and the original full DataFrame

In [None]:
# Inner join: keep only sampled tweets whose `old_index` matches the `index`
# of a parsed response.
merged_results = pd.merge(
    sample_of_tweets_df,
    temp_df,
    how='inner',
    left_on='old_index',
    right_on='index',
)
merged_results

In [None]:
# Expose df's row labels as a regular column so they can serve as the join key.
df['old_index'] = df.index

# Inner join of the full tweet table with the parsed responses.
filtered_df_merged_results = pd.merge(
    df,
    temp_df,
    how='inner',
    left_on='old_index',
    right_on='index',
)
filtered_df_merged_results

## Save and Download results

In [None]:
merged_results_file = 'merged_results.xlsx'
filtered_df_merged_results_file = 'filtered_df_merged_results.xlsx'

# Write each result frame to its own Excel workbook, in the same order as before.
for _frame, _path in (
    (merged_results, merged_results_file),
    (filtered_df_merged_results, filtered_df_merged_results_file),
    (sample_of_tweets_df, 'sample_of_tweets_df.xlsx'),
):
    _frame.to_excel(_path)

