# Compare the sentiment analysis from GPT and RoBERTa
## Break it down into 3 dataframes:
1. full csv with everything
1. one filtered on `had_epi = True`
1. one filtered on `had_epi = False`

In [None]:
import pandas as pd


# Workbook with the merged results that the rest of the notebook analyses.
data_url = r'https://github.com/kswanjitsu/epidural/raw/main/data/filtered_df_merged_results.xlsx' # or results from OpenAI_Epidural_Search_v2
df = pd.read_excel(data_url)
# Take a real copy: a plain `backup_df = df` is only a second name for the same
# object, so the in-place column rewrites in later cells would alter the backup too.
backup_df = df.copy()
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,User,Time,Likes,Source,Tweet,cleaned_tweet,sent_analysis,topic,tok_per_twt,old_index,index,about_epi,had_epi,epi_pos,nat_pos
0,0,417,dpedini,2009-09-07 18:53:24+00:00,0,Twitter Web Client,For the last time labor day is not a holiday f...,for the last time labor day is not a holiday f...,"{'label': 'negative', 'score': 0.5397067666053...",1,18,13,13,0.0,,,0.0
1,1,705,trishlrees,2009-07-10 03:17:34+00:00,0,Twitter Web Client,"@ehasselbeck don't know if I should tell you, ...","@user don't know if i should tell you, but the...","{'label': 'negative', 'score': 0.8739022016525...",1,36,24,24,0.0,0.0,-1.0,0.0
2,2,755,KristaSilveria,2009-06-09 02:47:39+00:00,0,Twitter Web Client,@ehasselbeck if you haven't had good luck don'...,@user if you haven't had good luck don't bothe...,"{'label': 'positive', 'score': 0.9046367406845...",1,30,29,29,0.0,,,1.0
3,3,780,NaturalBirthPro,2009-05-17 17:55:21+00:00,0,Twitter Web Client,Natural childbirth refers to a labor and birth...,natural childbirth refers to a labor and birth...,"{'label': 'neutral', 'score': 0.68845134973526}",1,21,33,33,0.0,,,1.0
4,4,875,_Momster_,2009-02-03 06:34:01+00:00,0,Twitter Web Client,"I would give birth 20 times if I could, just t...","i would give birth 20 times if i could, just t...","{'label': 'positive', 'score': 0.9370241761207...",1,29,43,43,0.0,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72994,72994,658343,vlm1victoria,2018-01-01 23:35:19+00:00,6,Twitter for iPhone,@coollyndz @WhistlingDixie4 @aroradrn @ICUltra...,@user @user @user @user @user @user @user @use...,"{'label': 'positive', 'score': 0.5854973793029...",1,98,431498,431498,1.0,,1.0,
72995,72995,658351,EllisMarianne,2018-01-01 21:59:09+00:00,0,Twitter for Android,@Laneycakes @TomPettinger @millihill A million...,@user @user @user a million times this. lost c...,"{'label': 'negative', 'score': 0.731343150138855}",1,61,431503,431503,1.0,,1.0,-1.0
72996,72996,658364,lucybball,2018-01-01 18:32:35+00:00,1,Twitter Web Client,@gridironbaby So sorry you are and have been g...,@user so sorry you are and have been going thr...,"{'label': 'negative', 'score': 0.8958117365837...",1,68,431506,431506,1.0,1.0,-1.0,
72997,72997,658367,KISS_myMorgs,2018-01-01 16:32:18+00:00,0,Twitter for iPhone,I got an epidural ... but I would never tell s...,i got an epidural ... but i would never tell s...,"{'label': 'neutral', 'score': 0.5093790292739868}",1,43,431509,431509,1.0,1.0,0.0,0.0


# Dataframe Cleanup

## Confirm that all rows in the columns we are looking at are valid

In [None]:
import numpy as np

# Each result column may only contain -1, 0, 1 or a missing value (NaN).
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
allowed_values = [0.0, 1.0, -1.0, np.nan]
for col in ('epi_pos', 'nat_pos', 'about_epi', 'had_epi'):
  invalid_rows = df[~df[col].isin(allowed_values)]
  # Name the offending column so a failure is actionable.
  assert len(invalid_rows) == 0, f'{col}: {len(invalid_rows)} row(s) hold an unexpected value'

## Translate the numerical codes into words to make the results clearer

In [None]:
import numpy as np


# Column groups: the two flag-style columns and the two sentiment columns.
bool_res_cols = ['about_epi', 'had_epi']
sent_res_cols = ['epi_pos', 'nat_pos']
res_cols = bool_res_cols + sent_res_cols

# Lookup tables keyed by the *stringified* numeric code ('nan' is what a
# missing value becomes after astype(str)). Codes without an entry are left
# as their string form.
about_epi_labels = {'0.0': 'natural', '1.0': 'epidural', 'nan': 'unknown'}
# had_epi keeps real booleans for known answers and the string 'unknown' otherwise.
had_epi_labels = {'0.0': False, '1.0': True, 'nan': 'unknown'}
sentiment_labels = {'0.0': 'neutral', '1.0': 'positive', '-1.0': 'negative', 'nan': 'unknown'}

df['about_epi'] = df['about_epi'].astype(str).replace(about_epi_labels)
df['had_epi'] = df['had_epi'].astype(str).replace(had_epi_labels)

# Both sentiment columns share the same code -> label table.
for col in sent_res_cols:
  df[col] = df[col].astype(str).replace(sentiment_labels)

df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,User,Time,Likes,Source,Tweet,cleaned_tweet,sent_analysis,topic,tok_per_twt,old_index,index,about_epi,had_epi,epi_pos,nat_pos
0,0,417,dpedini,2009-09-07 18:53:24+00:00,0,Twitter Web Client,For the last time labor day is not a holiday f...,for the last time labor day is not a holiday f...,"{'label': 'negative', 'score': 0.5397067666053...",1,18,13,13,natural,unknown,unknown,neutral
1,1,705,trishlrees,2009-07-10 03:17:34+00:00,0,Twitter Web Client,"@ehasselbeck don't know if I should tell you, ...","@user don't know if i should tell you, but the...","{'label': 'negative', 'score': 0.8739022016525...",1,36,24,24,natural,False,negative,neutral
2,2,755,KristaSilveria,2009-06-09 02:47:39+00:00,0,Twitter Web Client,@ehasselbeck if you haven't had good luck don'...,@user if you haven't had good luck don't bothe...,"{'label': 'positive', 'score': 0.9046367406845...",1,30,29,29,natural,unknown,unknown,positive
3,3,780,NaturalBirthPro,2009-05-17 17:55:21+00:00,0,Twitter Web Client,Natural childbirth refers to a labor and birth...,natural childbirth refers to a labor and birth...,"{'label': 'neutral', 'score': 0.68845134973526}",1,21,33,33,natural,unknown,unknown,positive
4,4,875,_Momster_,2009-02-03 06:34:01+00:00,0,Twitter Web Client,"I would give birth 20 times if I could, just t...","i would give birth 20 times if i could, just t...","{'label': 'positive', 'score': 0.9370241761207...",1,29,43,43,natural,unknown,unknown,positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72994,72994,658343,vlm1victoria,2018-01-01 23:35:19+00:00,6,Twitter for iPhone,@coollyndz @WhistlingDixie4 @aroradrn @ICUltra...,@user @user @user @user @user @user @user @use...,"{'label': 'positive', 'score': 0.5854973793029...",1,98,431498,431498,epidural,unknown,positive,unknown
72995,72995,658351,EllisMarianne,2018-01-01 21:59:09+00:00,0,Twitter for Android,@Laneycakes @TomPettinger @millihill A million...,@user @user @user a million times this. lost c...,"{'label': 'negative', 'score': 0.731343150138855}",1,61,431503,431503,epidural,unknown,positive,negative
72996,72996,658364,lucybball,2018-01-01 18:32:35+00:00,1,Twitter Web Client,@gridironbaby So sorry you are and have been g...,@user so sorry you are and have been going thr...,"{'label': 'negative', 'score': 0.8958117365837...",1,68,431506,431506,epidural,True,negative,unknown
72997,72997,658367,KISS_myMorgs,2018-01-01 16:32:18+00:00,0,Twitter for iPhone,I got an epidural ... but I would never tell s...,i got an epidural ... but i would never tell s...,"{'label': 'neutral', 'score': 0.5093790292739868}",1,43,431509,431509,epidural,True,neutral,neutral


## Same deal for the roberta column (sent_analysis column --> roberta column)

In [None]:
import json

# Imported here so this cell runs on its own.
import ast

# Each `sent_analysis` cell is the text of a Python dict, e.g.
# "{'label': 'negative', 'score': 0.53}". ast.literal_eval parses that form
# directly; rewriting quotes to feed json.loads would break on any value that
# contains an apostrophe or double quote.
df['roberta'] = [ast.literal_eval(sa)['label'] for sa in df['sent_analysis']]

df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,User,Time,Likes,Source,Tweet,cleaned_tweet,sent_analysis,topic,tok_per_twt,old_index,index,about_epi,had_epi,epi_pos,nat_pos,roberta
0,0,417,dpedini,2009-09-07 18:53:24+00:00,0,Twitter Web Client,For the last time labor day is not a holiday f...,for the last time labor day is not a holiday f...,"{'label': 'negative', 'score': 0.5397067666053...",1,18,13,13,natural,unknown,unknown,neutral,negative
1,1,705,trishlrees,2009-07-10 03:17:34+00:00,0,Twitter Web Client,"@ehasselbeck don't know if I should tell you, ...","@user don't know if i should tell you, but the...","{'label': 'negative', 'score': 0.8739022016525...",1,36,24,24,natural,False,negative,neutral,negative
2,2,755,KristaSilveria,2009-06-09 02:47:39+00:00,0,Twitter Web Client,@ehasselbeck if you haven't had good luck don'...,@user if you haven't had good luck don't bothe...,"{'label': 'positive', 'score': 0.9046367406845...",1,30,29,29,natural,unknown,unknown,positive,positive
3,3,780,NaturalBirthPro,2009-05-17 17:55:21+00:00,0,Twitter Web Client,Natural childbirth refers to a labor and birth...,natural childbirth refers to a labor and birth...,"{'label': 'neutral', 'score': 0.68845134973526}",1,21,33,33,natural,unknown,unknown,positive,neutral
4,4,875,_Momster_,2009-02-03 06:34:01+00:00,0,Twitter Web Client,"I would give birth 20 times if I could, just t...","i would give birth 20 times if i could, just t...","{'label': 'positive', 'score': 0.9370241761207...",1,29,43,43,natural,unknown,unknown,positive,positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72994,72994,658343,vlm1victoria,2018-01-01 23:35:19+00:00,6,Twitter for iPhone,@coollyndz @WhistlingDixie4 @aroradrn @ICUltra...,@user @user @user @user @user @user @user @use...,"{'label': 'positive', 'score': 0.5854973793029...",1,98,431498,431498,epidural,unknown,positive,unknown,positive
72995,72995,658351,EllisMarianne,2018-01-01 21:59:09+00:00,0,Twitter for Android,@Laneycakes @TomPettinger @millihill A million...,@user @user @user a million times this. lost c...,"{'label': 'negative', 'score': 0.731343150138855}",1,61,431503,431503,epidural,unknown,positive,negative,negative
72996,72996,658364,lucybball,2018-01-01 18:32:35+00:00,1,Twitter Web Client,@gridironbaby So sorry you are and have been g...,@user so sorry you are and have been going thr...,"{'label': 'negative', 'score': 0.8958117365837...",1,68,431506,431506,epidural,True,negative,unknown,negative
72997,72997,658367,KISS_myMorgs,2018-01-01 16:32:18+00:00,0,Twitter for iPhone,I got an epidural ... but I would never tell s...,i got an epidural ... but i would never tell s...,"{'label': 'neutral', 'score': 0.5093790292739868}",1,43,431509,431509,epidural,True,neutral,neutral,neutral


## Go through each of the three sub-DFs to determine fraction of agreement between roberta and GPT

## All rows

In [None]:
# A row counts as agreement when the RoBERTa label equals the label in either
# the epidural-sentiment column or the natural-birth-sentiment column.
matches_epi = df['roberta'] == df['epi_pos']
matches_nat = df['roberta'] == df['nat_pos']
df['either_sentiment_matches'] = matches_epi | matches_nat

agreeing_rows = df[df['either_sentiment_matches'] == True]
print('Fraction of rows with sentiment agreement:')
print(f'{len(agreeing_rows) / len(df)}')
print('=' * 50)
df

Fraction of rows with sentiment agreement:
0.5679803832929218


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,User,Time,Likes,Source,Tweet,cleaned_tweet,sent_analysis,topic,tok_per_twt,old_index,index,about_epi,had_epi,epi_pos,nat_pos,roberta,either_sentiment_matches
0,0,417,dpedini,2009-09-07 18:53:24+00:00,0,Twitter Web Client,For the last time labor day is not a holiday f...,for the last time labor day is not a holiday f...,"{'label': 'negative', 'score': 0.5397067666053...",1,18,13,13,natural,unknown,unknown,neutral,negative,False
1,1,705,trishlrees,2009-07-10 03:17:34+00:00,0,Twitter Web Client,"@ehasselbeck don't know if I should tell you, ...","@user don't know if i should tell you, but the...","{'label': 'negative', 'score': 0.8739022016525...",1,36,24,24,natural,False,negative,neutral,negative,True
2,2,755,KristaSilveria,2009-06-09 02:47:39+00:00,0,Twitter Web Client,@ehasselbeck if you haven't had good luck don'...,@user if you haven't had good luck don't bothe...,"{'label': 'positive', 'score': 0.9046367406845...",1,30,29,29,natural,unknown,unknown,positive,positive,True
3,3,780,NaturalBirthPro,2009-05-17 17:55:21+00:00,0,Twitter Web Client,Natural childbirth refers to a labor and birth...,natural childbirth refers to a labor and birth...,"{'label': 'neutral', 'score': 0.68845134973526}",1,21,33,33,natural,unknown,unknown,positive,neutral,False
4,4,875,_Momster_,2009-02-03 06:34:01+00:00,0,Twitter Web Client,"I would give birth 20 times if I could, just t...","i would give birth 20 times if i could, just t...","{'label': 'positive', 'score': 0.9370241761207...",1,29,43,43,natural,unknown,unknown,positive,positive,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72994,72994,658343,vlm1victoria,2018-01-01 23:35:19+00:00,6,Twitter for iPhone,@coollyndz @WhistlingDixie4 @aroradrn @ICUltra...,@user @user @user @user @user @user @user @use...,"{'label': 'positive', 'score': 0.5854973793029...",1,98,431498,431498,epidural,unknown,positive,unknown,positive,True
72995,72995,658351,EllisMarianne,2018-01-01 21:59:09+00:00,0,Twitter for Android,@Laneycakes @TomPettinger @millihill A million...,@user @user @user a million times this. lost c...,"{'label': 'negative', 'score': 0.731343150138855}",1,61,431503,431503,epidural,unknown,positive,negative,negative,True
72996,72996,658364,lucybball,2018-01-01 18:32:35+00:00,1,Twitter Web Client,@gridironbaby So sorry you are and have been g...,@user so sorry you are and have been going thr...,"{'label': 'negative', 'score': 0.8958117365837...",1,68,431506,431506,epidural,True,negative,unknown,negative,True
72997,72997,658367,KISS_myMorgs,2018-01-01 16:32:18+00:00,0,Twitter for iPhone,I got an epidural ... but I would never tell s...,i got an epidural ... but i would never tell s...,"{'label': 'neutral', 'score': 0.5093790292739868}",1,43,431509,431509,epidural,True,neutral,neutral,neutral,True


## Rows where `had_epi` is `True`

In [None]:
# Rows whose `had_epi` value is True. The column also holds False and the
# string 'unknown', so compare against True explicitly instead of using the
# column itself as a mask.
# .copy() makes the subset an independent frame; without it, adding a column
# below triggers pandas' SettingWithCopyWarning.
df_had_epi = df[df['had_epi'] == True].copy()
# Renumber rows from 0; the previous row labels are kept as a column.
df_had_epi.reset_index(inplace=True)
# For this subset, agreement compares RoBERTa with the epidural sentiment.
df_had_epi['sent_agreement'] = df_had_epi['epi_pos'] == df_had_epi['roberta']


print('Fraction of rows with sentiment agreement:')
print(f'{len(df_had_epi[df_had_epi["sent_agreement"]==True]) / len(df_had_epi)}')
print('='*50)
df_had_epi

Fraction of rows with sentiment agreement:
0.6344483339815895


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_had_epi['sent_agreement'] = pd.Series(df_had_epi['epi_pos'] == df_had_epi['roberta'])


Unnamed: 0.2,level_0,Unnamed: 0.1,Unnamed: 0,User,Time,Likes,Source,Tweet,cleaned_tweet,sent_analysis,...,tok_per_twt,old_index,index,about_epi,had_epi,epi_pos,nat_pos,roberta,either_sentiment_matches,sent_agreement
0,44,44,4876,MollyMakowsky,2013-01-28 01:57:46+00:00,0,Twitter for iPhone,Hahahaha for real tho labor and child birth is...,hahahaha for real tho labor and child birth is...,"{'label': 'positive', 'score': 0.841778576374054}",...,39,290,290,epidural,True,neutral,neutral,positive,False,False
1,60,60,6077,DarleneMacAuley,2015-09-15 14:43:55+00:00,1,TweetDeck,"@Mummuddlingthru @SmartMomCo W/ 1st, I had an ...","@user @user w/ 1st, i had an epi &amp; forceps...","{'label': 'neutral', 'score': 0.49350351095199...",...,47,366,366,epidural,True,neutral,positive,neutral,True,True
2,72,72,7970,gidgey08,2015-03-24 01:02:02+00:00,5,Twitter for iPhone,"""oh yeah I'm having a natural birth"" *two hour...","""oh yeah i'm having a natural birth"" *two hour...","{'label': 'positive', 'score': 0.5466676354408...",...,37,443,443,natural,True,positive,negative,positive,True,True
3,116,116,13601,dreamyprotag,2022-10-09 21:30:02+00:00,4,Twitter for iPhone,@briaananax @Tiniwana @Wait10001 Back labor ha...,@user @user @user back labor had me throwing u...,"{'label': 'negative', 'score': 0.8549533486366...",...,44,706,706,epidural,True,positive,negative,negative,True,False
4,168,168,21315,BazookaTX,2022-03-14 14:18:12+00:00,1,Twitter for Android,@ClintFiore @jaminball Started natural but he ...,@user @user started natural but he just got st...,"{'label': 'neutral', 'score': 0.697364866733551}",...,55,1048,1048,natural,True,unknown,positive,neutral,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30847,72985,72985,658290,Az49erGirl,2018-01-02 16:35:23+00:00,3,Twitter for iPhone,Epidural time....so thankful for it too.,epidural time....so thankful for it too.,"{'label': 'positive', 'score': 0.9869111776351...",...,11,431470,431470,epidural,True,positive,unknown,positive,True,True
30848,72987,72987,658299,LizInFallsCity,2018-01-02 15:10:05+00:00,0,Twitter for iPhone,@SarahT_in_Prov @phx_nolagirl @Hannah_N_Denver...,@user @user @user @user iâm with sarah on al...,"{'label': 'negative', 'score': 0.6633009314537...",...,57,431472,431472,epidural,True,positive,unknown,negative,False,False
30849,72992,72992,658332,jusT_STEWit,2018-01-02 03:29:47+00:00,0,Twitter for iPhone,@breaanaaaaa Girl i was so scared to get my ep...,@user girl i was so scared to get my epidural ...,"{'label': 'negative', 'score': 0.8322213292121...",...,49,431491,431491,epidural,True,neutral,unknown,negative,False,False
30850,72996,72996,658364,lucybball,2018-01-01 18:32:35+00:00,1,Twitter Web Client,@gridironbaby So sorry you are and have been g...,@user so sorry you are and have been going thr...,"{'label': 'negative', 'score': 0.8958117365837...",...,68,431506,431506,epidural,True,negative,unknown,negative,True,True


## Rows where `had_epi` is `False`

In [None]:
# Rows whose `had_epi` value is False. The column also holds True and the
# string 'unknown', so compare against False explicitly.
# .copy() makes the subset an independent frame; without it, adding a column
# below triggers pandas' SettingWithCopyWarning.
df_no_epi = df[df['had_epi'] == False].copy()
# Renumber rows from 0; the previous row labels are kept as a column.
df_no_epi.reset_index(inplace=True)
# For this subset, agreement compares RoBERTa with the natural-birth sentiment.
df_no_epi['sent_agreement'] = df_no_epi['nat_pos'] == df_no_epi['roberta']


print('Fraction of rows with sentiment agreement:')
print(f'{len(df_no_epi[df_no_epi["sent_agreement"]==True]) / len(df_no_epi)}')
print('='*50)

df_no_epi

Fraction of rows with sentiment agreement:
0.3198887343532684


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_epi['sent_agreement'] = pd.Series(df_no_epi['nat_pos'] == df_no_epi['roberta'])


Unnamed: 0.2,level_0,Unnamed: 0.1,Unnamed: 0,User,Time,Likes,Source,Tweet,cleaned_tweet,sent_analysis,...,tok_per_twt,old_index,index,about_epi,had_epi,epi_pos,nat_pos,roberta,either_sentiment_matches,sent_agreement
0,1,1,705,trishlrees,2009-07-10 03:17:34+00:00,0,Twitter Web Client,"@ehasselbeck don't know if I should tell you, ...","@user don't know if i should tell you, but the...","{'label': 'negative', 'score': 0.8739022016525...",...,36,24,24,natural,False,negative,neutral,negative,True,False
1,6,6,1139,InfluenceHQ,2013-12-04 18:22:43+00:00,3,Twubs,Wow! RT @merts75 @MomCentral q3a I was on tim...,wow! rt @user @user q3a i was on time. natura...,"{'label': 'positive', 'score': 0.9623589515686...",...,28,58,58,natural,False,unknown,positive,positive,True,True
2,8,8,1430,AmbyyTweets,2013-11-07 11:52:15+00:00,2,Twitter for iPhone,"My @neyney517 is strong. Labor for 30 min, no ...","my @user is strong. labor for 30 min, no epidu...","{'label': 'positive', 'score': 0.9493963718414...",...,35,74,74,natural,False,unknown,positive,positive,True,True
3,9,9,1448,thewaybot,2013-11-05 23:06:40+00:00,0,thewaybot,The way that sudden labor natural birth with j...,the way that sudden labor natural birth with j...,"{'label': 'neutral', 'score': 0.5056585669517517}",...,18,76,76,natural,False,unknown,positive,neutral,False,False
4,25,25,2982,hanuhtee,2013-07-22 19:53:05+00:00,0,Twitter for iPhone,Props to the Duchess for enduring over 10 hour...,props to the duchess for enduring over 10 hour...,"{'label': 'positive', 'score': 0.8454124927520...",...,24,159,159,natural,False,unknown,positive,positive,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8623,72977,72977,658246,iam_Destini,2018-01-03 01:58:53+00:00,0,Twitter for iPhone,i love telling people how i went into labor ð...,i love telling people how i went into labor ð...,"{'label': 'positive', 'score': 0.9718281626701...",...,43,431442,431442,epidural,False,unknown,positive,positive,True,True
8624,72983,72983,658278,yadaamaee,2018-01-02 18:38:15+00:00,2,Twitter for iPhone,Iâm not getting an epidural... Iâm finna f...,iâm not getting an epidural... iâm finna f...,"{'label': 'negative', 'score': 0.7182402014732...",...,26,431462,431462,epidural,False,negative,unknown,negative,True,False
8625,72988,72988,658302,Xiomykinss,2018-01-02 12:23:16+00:00,0,Twitter for Android,They will seek the easy way out .... And after...,they will seek the easy way out .... and after...,"{'label': 'negative', 'score': 0.4398196041584...",...,28,431474,431474,epidural,False,unknown,positive,negative,False,False
8626,72989,72989,658303,Xiomykinss,2018-01-02 12:21:44+00:00,0,Twitter for Android,Is it sad that the RN attending me through lab...,is it sad that the rn attending me through lab...,"{'label': 'negative', 'score': 0.8193557858467...",...,34,431475,431475,epidural,False,negative,positive,negative,True,False


# Save dataframes to excel files
- If running in Colab --> Download files

In [None]:
# google.colab is only importable inside Colab. Guard the import so the
# workbooks are still written when the notebook runs elsewhere; only the
# browser download step is skipped.
try:
  from google.colab import files
except ImportError:
  files = None

# Output file name -> DataFrame to write.
datasets_to_files = {
    'cleaned_dataset.xlsx': df,
    'epidural_sentiments.xlsx': df_had_epi,
    'natural_sentiments.xlsx': df_no_epi,
}

for f, d in datasets_to_files.items():
  print(f'Saving {f}')
  d.to_excel(f)
  if files is None:
    # Not in Colab: the file is saved locally and there is nothing to download.
    continue
  try:
    files.download(f)
  except Exception as e:
    # Download is best-effort: report the problem and continue with the next file.
    print(e)

Saving cleaned_dataset.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saving epidural_sentiments.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saving natural_sentiments.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>