In [1]:
def remove_numbers(text):
    """
    strips every digit character out of text
    text: string
    returns the input with all digit characters dropped, as a string
    """
    kept_chars = []
    for symbol in text:
        if symbol.isdigit():
            # digits are discarded; everything else is kept in order
            continue
        kept_chars.append(symbol)
    return ''.join(kept_chars)

In [2]:
import string

def remove_punctuation(text):
    """
    deletes every character listed in string.punctuation from text
    text: string
    returns the input with those characters removed, as a string
    """
    # Translation table that maps each punctuation character to "deleted".
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

In [4]:
! pyenv versions

  system
  3.8.5
  3.8.5/envs/data
  3.8.5/envs/deep4deep
* NLP (set by PYENV_VERSION environment variable)
  data
  data/envs/NLP
  deep4deep


In [3]:
from nltk.corpus import stopwords
from nltk import word_tokenize

def remove_stopwords(text):
    """
    tokenizes text and removes stopwords from the tokens
    text: string
    returns the remaining tokens, in their original order, as a list of
    strings (the tokens are NOT joined back into a single string)
    """
    # NLTK English stopwords, plus special characters found in the
    # Hello Tomorrow reports.
    my_stopwords = set(stopwords.words('english')) | {'•', '’'}

    tokens = word_tokenize(text)  # plays the role of a split
    return [word for word in tokens if word not in my_stopwords]

In [4]:
from nltk.stem import WordNetLemmatizer

def lemmatize(text):
    """
    lemmatizes each token with WordNetLemmatizer
    text: list (or other iterable) of word tokens, such as the list returned
          by remove_stopwords; a plain string is split on whitespace first
    returns the lemmatized tokens as a list of strings
    """
    if isinstance(text, str):
        # Iterating over a string yields single characters, so lemmatizing it
        # directly would return a list of letters rather than words.
        text = text.split()
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in text]

In [5]:
def text_preprocessing(text):
    """
    runs the whole cleaning pipeline on a single document
    text: string
    returns the result of lemmatize applied to the cleaned tokens

    steps, in order: lowercase, remove_numbers, remove_punctuation,
    remove_stopwords, lemmatize
    """
    lowered = text.lower()
    cleaned = remove_punctuation(remove_numbers(lowered))
    tokens = remove_stopwords(cleaned)
    return lemmatize(tokens)

In [8]:
ls ../raw_data/ 

[1m[36mHello_tomorrow_files_EN[m[m/ NLP_requirements.txt
[1m[36mHello_tomorrow_files_FR[m[m/ dummy_NLP.csv


In [6]:
import pandas as pd

# Read only the first 6 rows of the dummy CSV and keep its 'description' column.
X_test_unprocessed = pd.read_csv('../raw_data/dummy_NLP.csv', nrows=6)['description']
# Known issue: the '–' character in line 1 or 2 was causing a UTF-8 encoding error,
# and the Tibber line (copy-pasted from WhatsApp) was a problem too.
# TODO: the encoding problem is still to be solved.
# NOTE(review): nrows=6 presumably limits the read to rows that load cleanly — confirm.
X_test_unprocessed

0    OpenGamma creates a derivates analytics platfo...
1    Exscientia Limited uses AI driven systems to a...
2    Novelda develops their advanced Xethru sensor ...
3    Tibber develops a mobile application to help i...
4    eSmart Systems builds a SaaS platform using bi...
5    Carbon Waters is a graphene company. We have d...
Name: description, dtype: object

In [7]:
X_test_processed = [text_preprocessing(doc) for doc in X_test_unprocessed]


In [37]:
# Adding Hello_Tomorrow pdfs using pdfminer on command-line interface
# using them as 'deeptech' source

df = pd.DataFrame(columns=['text','is_deep_tech'])

l = !ls ../raw_data/Hello_tomorrow_files_EN

for i, file_name in enumerate(l):
    file_path = '../raw_data/Hello_tomorrow_files_EN/' + file_name
    file_text = !pdf2txt.py {file_path}
    df.loc[file_name,'text'] = ' '.join(file_text)
    df.loc[file_name,'is_deep_tech'] = 1

In [38]:
#Now doing the same for non-deeptech documents

l_2 = !ls ../raw_data/Non_deeptech_EN

for file_name in (l_2):
    file_path = '../raw_data/Non_deeptech_EN/' + file_name
    file_text = !pdf2txt.py {file_path}
    df.loc[file_name,'text'] = ' '.join(file_text)
    df.loc[file_name,'is_deep_tech'] = 0

In [39]:
df

Unnamed: 0,text,is_deep_tech
HT-BCG-The-Dawn-of-the-Deep-Tech-Ecosystem-Mar-2019.pdf,The Dawn of the Deep Tech Ecosystem Boston...,1
Hello-Tomorrow-BCG-FROM-TECH-TO-DEEP-TECH.pdf,From Tech to Deep Tech Fostering collaborat...,1
How-to-build-a-succesful-deep-tech-acceleration-program-Hello-Tomorrow-Bpifrance-1.pdf,HOW TO BUILD A SUCC...,1
BCG-After-the-Honeymoon-Ends-July-2019-R2_tcm108-222810.pdf,After the Honeymoon Ends MAKING CORPORATE-S...,0
The-next-normal-the-recovery-will-be-digital.pdf,The Next Normal The recovery will be digita...,0
tech-for-good-summit-progress-report.pdf,TECH FOR GOOD SUMMIT Progress report July 20...,0


In [46]:
# For the time being, the texts extracted from those PDF files serve as X_train.
from gensim.models import Word2Vec
import csv

# Preprocess every document: each entry of 'processed_text' is the value
# returned by text_preprocessing for that row's 'text'.
# NOTE(review): this adds a column to the existing df in place.
df['processed_text'] = df['text'].map(text_preprocessing)

# Train word embeddings on the preprocessed documents.
# Per the gensim documentation, min_count=7 ignores words whose total frequency
# is below 7, and window=5 is the maximum distance between the current and the
# predicted word.
# NOTE(review): with gensim's default worker count, training is presumably not
# exactly reproducible from run to run — confirm if reproducibility matters.
my_embedding = Word2Vec(sentences=df['processed_text'], min_count=7, window=5)