In [1]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from insight_utils import bar_groups_chart, line_plot_column
from utils import clean_transcript, clean_summary
from constants import FORMAT_MODIFIERS, LENGTH_MODIFIERS, DENSENESS_MODIFIERS, QUALITY_MODIFIERS, STRUCTURE_MODIFIERS
import textstat
import nltk

# Semicolon-separated CSV with one row per talk: the transcript, its metadata,
# and the manually written summaries.
transcript_path = "data/manual_summaries2.csv"
data = pd.read_csv(
    transcript_path,
    sep=";",
)

#### Format

In [2]:
# Display the loaded frame as this cell's output for a visual sanity check.
data

Unnamed: 0,transcript,summary,title,speaker,description,topic,summary2,summary3,summary4
0,"When I think about the rise of AI, I'm reminde...",AI is controlled by big tech companies\n- Most...,How AI could empower any business,Andrew Ng,Expensive to build and often needing highly sk...,"technology,engineering,business,entrepreneur,s...",AI could be transformational if everyone can u...,AI Control by Big Tech Companies\n- Skilled en...,The Rise of AI\n- AI is in the hands of big te...
1,Everyone needs a coach. It doesn't matter whet...,Teaching and systematic feedback\n- 98% of tea...,Teachers need real feedback,Bill Gates,"Until recently, many teachers only got one wor...","culture,global issues,education,teaching",Teachers need useful feedback \n- 98% of US te...,Current teacher Feedback systems\n- 98% of US ...,Teachers Lack Feedback Systematically\n- 98% o...
2,"Hi, I'm Jeff. I lead AI Research and Health at...",What caused the recent progress of AI systems?...,AI isn't as smart as you think -- but it could be,Jeff Dean,"What is AI, really? Jeff Dean, the head of Goo...","technology,future,AI,algorithm,machine learnin...",AI is progressing quickly\n- Two keys to AI pr...,AI Progress and Current Challenges\n- AI is us...,What AI Can Do Right Now\n- AI helps computers...
3,Buildings are not only what they seem. They ar...,Buildings from the perspective of energy\n- Co...,What if buildings created energy instead of co...,Ksenia Petrichenko,Buildings are bad news for the climate -- but ...,"climate change,technology,design,innovation,fu...",Conventional Buildings are Consumers of Electr...,Why buildings are a problem from the perspecti...,Buildings as Energy System Components\n- Build...
4,"My journey to become a polar specialist, photo...",Close-ups of the Arctic\n- Disappearing sea ic...,Animal tales from icy wonderlands,Paul Nicklen,Diving under the Antarctic ice to get close to...,"culture,design,entertainment,animals,creativit...",Sea ice is disappearing at faster rates than e...,Life in the Polar Regions\n- Spent childhood i...,Ice Loss Destroys an Ecosystem\n- Polar bears ...
5,"""Ashley Judd, stupid fucking slut. ""You can't ...","Abuse on girls and women\n- Personally, econom...",How online abuse of women has spiraled out of ...,Ashley Judd,"Enough with online hate speech, sexual harassm...","global issues,computers,gender,communication,a...",Online abuse of women and girls is gender righ...,The Impact of Online Misogyny\n- Online misogy...,Women of all ages suffer abuse and attacks onl...
6,"How many of us have ever seen something, thoug...",Employees Reluctance to report wrongdoing\n- 4...,How whistle-blowers shape history,Kelly Richmond Pope,Fraud researcher and documentary filmmaker Kel...,"politics,corruption,social change,history,soci...",People regularly see things and don’t report t...,The Challenge of Whistle-blowers\n- Retaliatio...,Whistle-blowers Make Valuable Contributions at...
7,"I'm a bit of a perfectionist. Now, how many ti...",Pervasive Celebration of Perfectionism\n- Indi...,Our dangerous obsession with perfectionism is ...,Thomas Curran,Social psychologist Thomas Curran explores how...,"culture,social change,psychology,depression,me...",Perfectionism is a favorite flaw\n- Socially a...,Perfectionism's Pervasive Influence:\n- It is ...,Perfectionism Is Not a Symbol of Success\n- Pe...
8,"Please, stand up. If you're able. Thank you. I...",Personal Journey and dreams\n- Escape and the ...,Pussy Riot's powerful message to Vladimir Putin,Nadya Tolokonnikova,"Nadya Tolokonnikova, founding member of the an...","politics,corruption,social change,art,activism...",Protesting despite the consequences\n- Protest...,Challenging the System: Prisoner to Activist\n...,Nadya's Arrest and Trial Experience\n- Arreste...
9,"We've evolved with tools, and tools have evolv...",Evolution of Tools and specialization\n- Evolu...,Shape-shifting tech will change work as we kno...,Sean Follmer,What will the world look like when we move bey...,"technology,design,engineering,industrial desig...",Tools evolve and become more specialized over ...,Evolving Tools and Need for New Interfaces\n- ...,Tools Evolved With Humanity\n- Tools are more ...


In [3]:
# List the column labels of the loaded frame.
data.columns

Index(['transcript', 'summary', 'title', 'speaker', 'description', 'topic',
       'summary2', 'summary3', 'summary4'],
      dtype='object')

In [4]:
# Stack the four summary columns into long format: one row per
# (source column, summary text) pair. The source column name lands in
# "variable" and the text in "summary_text".
summary_columns = ['summary', 'summary2', 'summary3', 'summary4']
summaries = data[summary_columns]
melted_df = summaries.melt(
    value_vars=summary_columns,
    value_name="summary_text",
)
melted_df

Unnamed: 0,variable,summary_text
0,summary,AI is controlled by big tech companies\n- Most...
1,summary,Teaching and systematic feedback\n- 98% of tea...
2,summary,What caused the recent progress of AI systems?...
3,summary,Buildings from the perspective of energy\n- Co...
4,summary,Close-ups of the Arctic\n- Disappearing sea ic...
5,summary,"Abuse on girls and women\n- Personally, econom..."
6,summary,Employees Reluctance to report wrongdoing\n- 4...
7,summary,Pervasive Celebration of Perfectionism\n- Indi...
8,summary,Personal Journey and dreams\n- Escape and the ...
9,summary,Evolution of Tools and specialization\n- Evolu...


In [8]:
# Descriptive statistics over every summary text in `melted_df`, printed as a
# LaTeX table (count / mean / std / min / quartiles / max).
#
# Mode switches (leave both False for whole-summary statistics):
#   subheadings = True -> measure only the sub-heading sentences
#   bullets     = True -> measure only the remaining (bullet) sentences
# In either mode, character and word counts are collected per sentence
# instead of per summary, and sentence / unique-word counts are skipped.
all_summaries = melted_df['summary_text']
textstat.set_lang("en_US")

char_counts = []
word_counts = []
sent_counts = []
sent_lengths = []        # one list of per-sentence character lengths per summary
words_in_sents = []      # NOTE(review): never filled in this cell
unique_words_count = []
all_words = []

subheadings = False
bullets = False

# Positions of the sub-heading sentences in the output of nltk.sent_tokenize.
# Assumes each cleaned summary tokenizes into 12 sentences laid out as
# 3 x (heading + 3 bullets) -- TODO confirm against clean_summary.
SUBHEADING_POSITIONS = (0, 4, 8)

if subheadings and bullets:
    # The two modes select disjoint sentences; combining them previously
    # failed with an opaque IndexError part-way through the loop.
    raise ValueError("subheadings and bullets are mutually exclusive; enable at most one")

per_sentence_mode = bullets or subheadings

for summary in all_summaries:
    cleaned = clean_summary(summary)
    sents = nltk.sent_tokenize(cleaned)

    if subheadings:
        sents = [sents[pos] for pos in SUBHEADING_POSITIONS]
    elif bullets:
        # Filter by ORIGINAL position. Successive `del sents[i]` calls shift
        # the remaining indices, so `del [0]; del [4]; del [8]` removed the
        # sentences originally at 0, 5 and 10 instead of the headings.
        sents = [sent for pos, sent in enumerate(sents) if pos not in SUBHEADING_POSITIONS]

    sent_lengths.append([len(sent) for sent in sents])

    if per_sentence_mode:
        # One entry per selected sentence.
        word_counts.extend(len(nltk.word_tokenize(sent)) for sent in sents)
        char_counts.extend(textstat.char_count(sent) for sent in sents)
    else:
        # One entry per summary.
        words = nltk.word_tokenize(cleaned)

        sent_counts.append(len(sents))
        word_counts.append(len(words))
        all_words.append(words)
        unique_words_count.append(len(set(words)))
        char_counts.append(textstat.char_count(cleaned))

# Vocabulary across all summaries. Computed once after the loop instead of
# being rebuilt from scratch on every iteration; only defined in
# whole-summary mode with at least one summary, as before.
if all_words:
    total_unique_words = list(set(np.concatenate(all_words)))

if per_sentence_mode:
    df = pd.DataFrame({"char_count": char_counts, "word_count": word_counts})
else:
    # "words_in_sents" is declared but never assigned, so it stays an all-NaN
    # object column that describe() leaves out of the numeric table. Kept so
    # the frame's columns are unchanged for any later cell.
    df = pd.DataFrame(columns=["char_count", "word_count", "words_in_sents", "unique_words"])
    df["word_count"] = word_counts
    df["sent_count"] = sent_counts
    df["unique_words"] = unique_words_count
    df["char_count"] = char_counts

print(df.describe().to_latex())


# get latex tabular code for mean, median, max, min, std of char, word, sent counts
# df["summary"] = all_summaries






# print(df.to_latex(columns=["title", "char_count", "word_count", "unique_words", "sent_count"], ))







\begin{tabular}{lrrrr}
\toprule
 & char_count & word_count & unique_words & sent_count \\
\midrule
count & 60.000000 & 60.000000 & 60.000000 & 60.000000 \\
mean & 529.000000 & 105.433333 & 70.950000 & 12.000000 \\
std & 68.016449 & 13.825016 & 8.999859 & 0.000000 \\
min & 413.000000 & 75.000000 & 48.000000 & 12.000000 \\
25% & 472.250000 & 94.000000 & 64.000000 & 12.000000 \\
50% & 522.500000 & 106.000000 & 72.000000 & 12.000000 \\
75% & 578.250000 & 116.500000 & 77.000000 & 12.000000 \\
max & 733.000000 & 133.000000 & 88.000000 & 12.000000 \\
\bottomrule
\end{tabular}

