### Import Data and Merge 

In [1]:
import pandas as pd
import numpy as np

em_s, em_e = "\033[1m", "\033[0;0m"

In [2]:
insider_decks = pd.read_csv("./data/insider_pitch_decks.csv").drop(columns = ["Unnamed: 0"]).dropna()
best_decks = pd.read_csv("./data/best_pitch_decks.csv").dropna(subset = ["Industry"])

display(insider_decks.head())
display(best_decks.head())

Unnamed: 0,Local Link,Stage,Industry,Region
0,./InsiderPitchPDFs/0xLabs.pdf,SeriesB,Fintech,N. America
1,./InsiderPitchPDFs/100plus.pdf,Seed,Healthcare,N. America
2,./InsiderPitchPDFs/10KC.pdf,Growth,Online learning,N. America
3,./InsiderPitchPDFs/1906.pdf,SeriesA,Cannabis,N. America
4,./InsiderPitchPDFs/1inchNetwork.pdf,SeriesB,Fintech,N. America


Unnamed: 0,Name,Industry,Description,Short Description,Tags,Business Model,Customer Model,Website,Raised,Year Raised,Stage,Investors,PDF URL
0,Bolt,SaaS,Bolt is an innovative payment processing compa...,One-click checkout Bolt used this 12-page deck...,SaaS,SaaS,B2B2C,bolt.com,$393M,2021,Series D,VC,./BestPitchPDFs/bolt.pdf
1,Spotify,Music,"Spotify Technology S.A., together with its sub...",Spotify is a premium streaming service that gi...,"Music, Entertainment, Audio","SaaS, Subscription",B2C,spotify.com,$100M,2012,Late Stage,Angel Investors,./BestPitchPDFs/spotify.pdf
2,WeWork,Proptech,WeWork Companies Inc. operates as a real estat...,WeWork is a commercial real estate company tha...,"Proptech, Collaboration, Real Estate",Marketplace,B2B,wework.com,$335M,2014,Late Stage,Corporate,./BestPitchPDFs/wework.pdf
3,AirBnB,Travel & Events,"Airbnb, Inc., together with its subsidiaries, ...",Airbnb is one of the world's largest marketpla...,Travel & Events,"SaaS, E-commerce",,airbnb.com,$600k,2008,Pre-Seed,,./BestPitchPDFs/airbnb.pdf
4,Facebook,Social Media,"Facebook, Inc. operates a social networking we...",Facebook is a social network that connects you...,"Social Media, IT",Marketplace,B2B2C,facebook.com,$500k,2004,Early Stage,"VC, Corporate",./BestPitchPDFs/facebook.pdf


In [3]:
import re
def standardize_text(x):
    return re.sub('[^0-9a-zA-Z]+', '', x.lower())

best_decks["Industry"] = best_decks["Industry"].astype(str).apply(standardize_text)
insider_decks["Industry"] = insider_decks["Industry"].astype(str).apply(standardize_text)

all_unique_industries = np.unique(np.concatenate((best_decks["Industry"], insider_decks["Industry"])))

In [4]:
insider_decks.rename(columns = {'Local Link': 'PDF URL'}, inplace = True)

common_columns = list(set(best_decks.columns).intersection(insider_decks.columns))

master_df = pd.concat((best_decks[common_columns], insider_decks[common_columns]))
display(master_df)

Unnamed: 0,Industry,Stage,PDF URL
0,saas,Series D,./BestPitchPDFs/bolt.pdf
1,music,Late Stage,./BestPitchPDFs/spotify.pdf
2,proptech,Late Stage,./BestPitchPDFs/wework.pdf
3,travelevents,Pre-Seed,./BestPitchPDFs/airbnb.pdf
4,socialmedia,Early Stage,./BestPitchPDFs/facebook.pdf
...,...,...,...
982,enterprisesoftware,Seed,./InsiderPitchPDFs/Zestful.pdf
983,fintech,SeriesC,./InsiderPitchPDFs/Zeta.pdf
984,fintech,SeriesA,./InsiderPitchPDFs/Zevoy.pdf
985,fintech,Seed,./InsiderPitchPDFs/ZoeFinancial.pdf


In [5]:
twenty_sectors = ['Consumer', 'Cybersecurity', 'Data Analytics and Management', 'Education',
                  'Energy', 'Entertainment', 'Environment', 'Finance and Banking', 'Food',
                  'Health Care', 'Legal Tech', 'Marketing', 'Media and Advertising', 'Other',
                  'Real Estate', 'Tech', 'Transportation', 'Travel and Hospitality',
                  'Construction and Manufacturing', 'Enterprise']

def question_builder(industry):
    return "The twenty sectors to choose from are the following: " + \
           ', '.join([str(sector) for sector in twenty_sectors]) + ". \n" + \
           "Chose a single sector that maps best the sub-industry: " + industry + "? " + \
           "Reply with no explanation, just the name of the chosen sector."

In [8]:
import openai
def askGPT(text):
    openai.api_key = "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"
    
    response = openai.Completion.create(
        engine = "gpt-3.5-turbo",
        prompt = text,
        temperature = 0.6,
        max_tokens = 100
    )

    return response


In [9]:
import time

gpt_generated_industry_mapping = {'3dtechnology': 'Tech', 'abtesting': 'Marketing', 'accounting': 'Finance and Banking', 'adnetwork': 'Media and Advertising', 'adventuretravel': 'Travel and Hospitality', 'advertising': 'Media and Advertising', 'agriculture': 'Other', 'agtech': 'Agriculture', 'ai': 'Data Analytics and Management', 'airtransportation': 'Transportation', 'analytics': 'Data Analytics and Management', 'android': 'Tech', 'app': 'Tech', 'apps': 'Tech', 'art': 'Entertainment', 'assetmanagement': 'Finance and Banking', 'auctions': 'Finance and Banking', 'audio': 'Entertainment', 'augmentedreality': 'Entertainment', 'automotive': 'Transportation', 'b2b': 'Enterprise', 'banking': 'Finance and Banking', 'beauty': 'Consumer', 'bigdata': 'Data Analytics and Management', 'billing': 'Finance and Banking', 'bioinformatics': 'Health Care', 'biotechnology': 'Health Care', 'bitcoin': 'Cybersecurity', 'blockchain': 'Tech', 'brandmarketing': 'Marketing', 'broadcasting': 'Media and Advertising', 'buildingmaterial': 'Construction and Manufacturing', 'businessdevelopment': 'Enterprise', 'cad': 'Data Analytics and Management', 'cannabis': 'Other', 'careerplanning': 'Education', 'charity': 'Other', 'childeducation': 'Education', 'cleantech': 'Environment', 'clinicaltrials': 'Health Care', 'cloud': 'Tech', 'cloudcomputing': 'Tech', 'clouddataservices': 'Data Analytics and Management', 'coaching': 'Education', 'coding': 'Tech', 'coffee': 'Food', 'collaboration': 'Enterprise', 'collaborativeconsumption': 'Sharing Economy', 'commerciallending': 'Finance and Banking', 'computer': 'Tech', 'concerts': 'Entertainment', 'construction': 'Construction', 'consulting': 'Other', 'consumer': 'Consumer', 'consumerapp': 'Consumer', 'consumerelectronics': 'Consumer', 'consumergoods': 'Consumer', 'consumerlending': 'Finance and Banking', 'consumertech': 'Consumer', 'content': 'Media and Advertising', 'cooking': 'Food', 'corporatetraining': 'Education', 'coupons': 'Marketing', 'crm': 'Marketing', 'crowdfunding': 'Finance and Banking', 'crowdsourcing': 'Other', 'crypto': 'Cybersecurity', 'cryptocurrency': 'Finance and Banking', 'customerservice': 'Consumer', 'cybersecurity': 'Cybersecurity', 'database': 'Data Analytics and Management', 'datamanagement': 'Data Analytics and Management', 'datavisualization': 'Data Analytics and Management', 'dating': 'Entertainment', 'delivery': 'Transportation', 'developerapis': 'Tech', 'digitalentertainment': 'Entertainment', 'digitalmedia': 'Media and Advertising', 'digitalsignage': 'Entertainment', 'drone': 'Tech', 'ecommerce': 'Consumer', 'edtech': 'Education', 'education': 'Education', 'eldercare': 'Health Care', 'elderly': 'Health Care', 'elearning': 'Education', 'electronichealthrecord': 'Health Care', 'email': 'Data Analytics and Management', 'emailmarketing': 'Marketing', 'employment': 'Employment', 'energy': 'Energy', 'enterprise': 'Enterprise', 'enterprisear': 'Enterprise', 'enterprisesoftware': 'Enterprise', 'enterprisetech': 'Enterprise', 'entertainment': 'Entertainment', 'europe': 'Other', 'eventmanagement': 'Entertainment', 'events': 'Entertainment', 'fashion': 'Entertainment', 'finance': 'Finance and Banking', 'financialservices': 'Finance and Banking', 'fintech': 'Finance and Banking', 'fintechsecurity': 'Finance and Banking', 'fitness': 'Health Care', 'food': 'Food', 'foodandbeverage': 'Food', 'foodbeverages': 'Food', 'fooddelivery': 'Food', 'foodprocessing': 'Food', 'gaming': 'Entertainment', 'govtech': 'Other', 'guides': 'Education', 'hardware': 'Construction and Manufacturing', 'healthcare': 'Health Care', 'healthdiagnostics': 'Health Care', 'healthinsurance': 'Health Care', 'hospitality': 'Travel and Hospitality', 'hr': 'Human Resources', 'insurance': 'Finance and Banking', 'insurtech': 'Finance and Banking', 'intellectualproperty': 'Legal Tech', 'internet': 'Tech', 'iot': 'Tech', 'it': 'Real Estate', 'languagelearning': 'Education', 'legaltech': 'Legal Tech', 'locationbasedservices': 'Transportation', 'logistics': 'Transportation', 'machinelearning': 'Data Analytics and Management', 'manufacturing': 'Construction and Manufacturing', 'marketing': 'Marketing', 'marketplaces': 'Tech', 'media': 'Media and Advertising', 'mediaandentertainment': 'Media and Entertainment', 'messaging': 'Entertainment', 'mobileapps': 'Tech', 'mobilepayments': 'Finance and Banking', 'music': 'Entertainment', 'networking': 'Tech', 'onlinelearning': 'Education', 'opensource': 'Other', 'other': 'Other', 'payments': 'Finance and Banking', 'pitchdecktemplate': 'Marketing', 'productivity': 'Enterprise', 'proptech': 'Real Estate', 'realestate': 'Real Estate', 'recruiting': 'Legal Tech', 'restaurants': 'Food', 'retail': 'Consumer', 'ridesharing': 'Transportation', 'saas': 'Tech', 'searchengine': 'Tech', 'security': 'Cybersecurity', 'sensor': 'Tech', 'socialimpact': 'Other', 'socialmedia': 'Media and Advertising', 'software': 'Tech', 'sports': 'Entertainment', 'supplychainmanagement': 'Transportation', 'sustainability': ' Environment', 'tech': 'Tech', 'transportation': 'Transportation', 'travel': 'Travel and Hospitality', 'travelevents': 'Travel and Hospitality', 'venturecapital': 'Finance and Banking', 'video': 'Entertainment', 'virtualreality': 'Entertainment'}


## UNCOMMENT THIS WHEN ACTUALLY USING GPT TO REGENERATE MAPPING ##

# gpt_generated_industry_mapping = {}
# for industry in all_unique_industries:
#     question = question_builder(industry)
#     response = askGPT(question)
#     target_industry = response.choices[0].text.replace("\n", "")
#     gpt_generated_industry_mapping[industry] = target_industry
    
#     time.sleep(1)

In [10]:
# print(gpt_generated_industry_mapping)

In [11]:
master_df["Industry"] = master_df["Industry"].apply(lambda x : gpt_generated_industry_mapping[x])
broken_pdfs = []

In [12]:
import pytesseract
from PIL import Image
import io
import pdfplumber
from pathlib import Path
from wand.exceptions import WandError

In [13]:
def extract_txt_files(pdf_link):
    try:
        company_name = pdf_link.split("/")[-1][:-4]
        print(company_name)

        text_file_path = "./ExtractionsInTXT/" + company_name + ".txt"
        if (Path(text_file_path).is_file()):
            return

        with pdfplumber.open(pdf_link) as pdf:
            full_text = ""
            for page in pdf.pages:
                image = page.to_image().original
                page_text = pytesseract.image_to_string(image)
                full_text += page_text

            with open(text_file_path, "w") as text_file:
                text_file.write(full_text)
    
    except WandError:
        broken_pdfs.append(text_file_path)
        print(f"Error: Unable to extract text from {pdf_link}")

In [14]:
## UNCOMMENT THIS IF YOU WANT TO RERUN TEXT EXTRACTION
# master_df["PDF URL"].apply(extract_txt_files)

In [15]:
display(master_df)

Unnamed: 0,Industry,Stage,PDF URL
0,Tech,Series D,./BestPitchPDFs/bolt.pdf
1,Entertainment,Late Stage,./BestPitchPDFs/spotify.pdf
2,Real Estate,Late Stage,./BestPitchPDFs/wework.pdf
3,Travel and Hospitality,Pre-Seed,./BestPitchPDFs/airbnb.pdf
4,Media and Advertising,Early Stage,./BestPitchPDFs/facebook.pdf
...,...,...,...
982,Enterprise,Seed,./InsiderPitchPDFs/Zestful.pdf
983,Finance and Banking,SeriesC,./InsiderPitchPDFs/Zeta.pdf
984,Finance and Banking,SeriesA,./InsiderPitchPDFs/Zevoy.pdf
985,Finance and Banking,Seed,./InsiderPitchPDFs/ZoeFinancial.pdf


In [32]:
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize

Lem = WordNetLemmatizer()
stop_words = stopwords.words("english")
english_words = set(words.words())
unigram_freq_df = pd.read_csv("unigram_freq.csv")
unigram_freq_dict = pd.Series(unigram_freq_df["count"].values, index = unigram_freq_df["word"]).to_dict()
frequency_threshold = 5 * 10e6

def preprocessing_lambda(text):
    # Make everything lowercase.
    out_text = str(text).lower()
    # Remove mentions, hashtags and urls.
    out_text = re.sub(r"(?:\@|\#|https?\://)\S+", "", out_text)
    # Keep alphabetic sequences only
    out_text = re.sub(r'[^a-zA-Z]', ' ', out_text)
    
    # tokenize sentence
    token_out = word_tokenize(out_text)
    
    
    # Lemmatize words using both settings from nltk as:
    # 'v' works for bombing -> bomb
    # 'n' works for years-> year
    out_text = " ".join([Lem.lemmatize(Lem.lemmatize(w, 'v'), 'n') for w in token_out])

    # Keep only words longer than two characters
    # ignore stopwords and words that are not in the english vocabulary
    token_out = [w for w in token_out if (not w in stop_words) and (w in english_words) and 
                 ((w in unigram_freq_dict) and unigram_freq_dict[w] > frequency_threshold) \
                 and (len(w) > 2)]
    
    
    
    return " ".join([w for w in token_out])

In [34]:
def write_processed_text_to_csv(pdf_link):
    company_name = pdf_link.split("/")[-1][:-4]
    print(em_s + company_name.upper() + em_e)

    text_file_path = "./ExtractionsInTXT/" + company_name + ".txt"
    if not(Path(text_file_path).is_file()):
        return None
    
    with open(text_file_path) as txt_file:
        raw_text = txt_file.read()
        processed_text = preprocessing_lambda(raw_text)
        print(processed_text, "\n")
        time.sleep(0.01)
        return processed_text
    

In [35]:
master_df["Processed Text"] = master_df["PDF URL"].apply(write_processed_text_to_csv)

[1mBOLT[0;0m
global network around powered one account single site see one login one click player wide core impact network sea network hit culture one account 

[1mSPOTIFY[0;0m
computer science language idea like running water million active within previous days million come fast always working much possible fast fast always working nothing build talk error test code fire cut cable long distance getting right tool job availability availability user search memory daily read search sort search index memory storage storage long storage storage better use memory storage high choose level source master yes please come work well 

[1mWEWORK[0;0m
space service community people new work fee powered technology every first value value added capital deal opportunity membership mobile experience growth mobile social provide strong interest member base physical making city opportunity 

[1mAIRBNB[0;0m
welcome book rather problem price important travel leave city culture easy way book local 

In [36]:
display(master_df.reset_index().drop(columns = ["index"]))

Unnamed: 0,Industry,Stage,PDF URL,Processed Text
0,Tech,Series D,./BestPitchPDFs/bolt.pdf,global network around powered one account sing...
1,Entertainment,Late Stage,./BestPitchPDFs/spotify.pdf,computer science language idea like running wa...
2,Real Estate,Late Stage,./BestPitchPDFs/wework.pdf,space service community people new work fee po...
3,Travel and Hospitality,Pre-Seed,./BestPitchPDFs/airbnb.pdf,welcome book rather problem price important tr...
4,Media and Advertising,Early Stage,./BestPitchPDFs/facebook.pdf,see directory user basis social school system ...
...,...,...,...,...
1794,Enterprise,Seed,./InsiderPitchPDFs/Zestful.pdf,membership card real time track real time proc...
1795,Finance and Banking,SeriesC,./InsiderPitchPDFs/Zeta.pdf,face added bank bank added bank network first ...
1796,Finance and Banking,SeriesA,./InsiderPitchPDFs/Zevoy.pdf,management financial authority business licens...
1797,Finance and Banking,Seed,./InsiderPitchPDFs/ZoeFinancial.pdf,financial want financial rather management dat...


In [37]:
master_df.to_csv("./data/new_processed_pitch_data.csv")