In [1]:
def remove_numbers(text):
    """
    removes numbers from text 
    text: string
    returns text without numbers as a string
    """
    return ''.join(char for char in text if not char.isdigit())

In [3]:
import string

def remove_punctuation(text):
    """
    removes punctuation from text 
    text: string
    returns text without punctuation as a string
    """
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

In [4]:
! pyenv versions

  system
  3.8.5
  3.8.5/envs/data
  3.8.5/envs/deep4deep
* NLP (set by PYENV_VERSION environment variable)
  data
  data/envs/NLP
  deep4deep


In [5]:
from nltk.corpus import stopwords
from nltk import word_tokenize

def remove_stopwords(text):
    """
    removes stopwords from text 
    text: string
    returns text without stopwords as a string
    """
    my_stopwords = set(stopwords.words('english')) 
    my_stopwords.add('•')
    my_stopwords.add('’')
    #adding special characters found in hello tomorrow reports
    
    tokens = word_tokenize(text) # correspond à un split
    return [word for word in tokens if word not in my_stopwords]

In [6]:
from nltk.stem import WordNetLemmatizer

def lemmatize(text):
    """
    text: string
    returns lemmatized text 
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in text]

In [7]:
def text_preprocessing(text):
    """
    applies preprocessing steps 
    text: string
    returns preprocessed, tokenized text
    """
    text = text.lower()
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    
    return lemmatize(text)

In [8]:
ls ../raw_data/ 

[1m[36mHello_tomorrow_files_EN[m[m/ NLP_requirements.txt
[1m[36mHello_tomorrow_files_FR[m[m/ dummy_NLP.csv


In [9]:
import pandas as pd

X_test_unprocessed = pd.read_csv('../raw_data/dummy_NLP.csv', nrows=6)['description']
# this character in line 1 or 2 was causing a utf 8 encoding issue: '–'. 
# Tibber line was a problem too (copy-pasted in whatsapp)
#### =========== ENCODING PROBLEM TO SOLVE 
X_test_unprocessed

0    OpenGamma creates a derivates analytics platfo...
1    Exscientia Limited uses AI driven systems to a...
2    Novelda develops their advanced Xethru sensor ...
3    Tibber develops a mobile application to help i...
4    eSmart Systems builds a SaaS platform using bi...
5    Carbon Waters is a graphene company. We have d...
Name: description, dtype: object

In [10]:
#X_unprocessed = ["bla!bla; 52 stop talking to me my dear friend, I cannot hear anymore", "Leader in humanoid robotics, SoftBank Robotics Europe (SBRE) is headquartered in Paris and regroups 400 employees. Creator of the robots NAO, Pepper and Romeo, used today in more  than  70  countries  worldwide,  in  various  fields,  such  as  research,  education,  retail, healthcare, tourism, hospitality or entertainment, SoftBank Robotics Europe is a subsidiary of SoftBank Robotics Holdings Corp."]
X_test_processed = [text_preprocessing(doc) for doc in X_test_unprocessed]
#X_processed

In [28]:
#Adding Hello_Tomorrow pdfs using pdfminer on command-line interface
# using them as 'deeptech' source
pdf_files = {}

l = !ls ../raw_data/Hello_tomorrow_files_EN

for i, file_name in enumerate(l):
    file_path = '../raw_data/Hello_tomorrow_files_EN/' + file_name
    file_text = !pdf2txt.py {file_path}
    pdf_files[i] = ' '.join(file_text)

In [29]:
#for the timebeing, using those 3 pdf files as X_train for is_deep_tech = 1

X_train_processed = [text_preprocessing(doc) for doc in pdf_files.values()]

from gensim.models import Word2Vec

my_embedding = Word2Vec(sentences=X_train_processed, min_count=3, max_count=100, window=1)

my_embedding.wv.vocab.keys()

my_embedding.wv.vocab['data']
my_embedding.wv.most_similar("process")
my_embedding.wv.get_vector('data').shape

type(my_embedding.wv)
my_embedding.wv.

# to use tensor flow 'auto-tSNE', need a TSV (tab-separated values) and a separate file of names

# persist word vector to disk
from gensim.models import KeyedVectors

my_embedding.wv.save('vectors.kv')
reloaded_word_vectors = KeyedVectors.load('vectors.kv')

# building TSV for use in Embedding Projector
import csv

with open('my_embedding.tsv', 'w') as vectors_tsvfile:
    with open('my_metadata.tsv', 'w') as metadata_tsvfile:
        writer1 = csv.writer(vectors_tsvfile, delimiter='\t')
        writer2 = csv.writer(metadata_tsvfile, delimiter='\t')
        writer2.writerow(['header:word','is_deep_tech'])
        
        words = my_embedding.wv.vocab.keys()
        for word in words:
            vector = my_embedding.wv.get_vector(word).tolist()
            writer1.writerow(vector)
            print(word)
            writer2.writerow([word, 1]) # writing 1 in 'is_deep_tech'

SyntaxError: invalid syntax (<ipython-input-29-c515a06b907d>, line 16)

In [25]:
#Now doing the same for non-deeptech documents

pdf_files_2 = {}

l_2 = !ls ../raw_data/Non_deeptech_EN

for i, file_name in enumerate(l_2):
    file_path = '../raw_data/Non_deeptech_EN/' + file_name
    file_text = !pdf2txt.py {file_path}
    pdf_files_2[i] = ' '.join(file_text)

In [26]:
pdf_files_2

{0: "After the  Honeymoon  Ends  MAKING CORPORATE-STARTUP RELATIONSHIPS  WORK   Boston Consulting Group partners with leaders in business and society to tackle their most  important challenges and capture their greatest opportunities. BCG was the pioneer in business  strategy when it was founded in 1963. Today, we help clients with total transformation—inspiring  complex change, enabling organizations to grow, building competitive advantage, and driving  bottom-line impact.  To succeed, organizations must blend digital and human capabilities. Our diverse, global teams  bring deep industry and functional expertise and a range of perspectives to spark change. BCG  delivers solutions through leading-edge management consulting along with technology and  design, corporate and digital ventures—and business purpose. We work in a uniquely  collaborative model across the firm and throughout all levels of the client organization,  generating results that allow our clients to thrive.  BCG Digital

In [30]:
#for the timebeing, using those pdf files as X_train for is_deep_tech = 0

X_train_processed_2 = [text_preprocessing(doc) for doc in pdf_files_2.values()]

from gensim.models import Word2Vec

my_embedding_2 = Word2Vec(sentences=X_train_processed_2, min_count=3, max_count=100, window=1)

# to use tensor flow 'auto-tSNE', need a TSV (tab-separated values) and a separate file of names

# persist word vector to disk
from gensim.models import KeyedVectors
my_embedding_2.wv.save('vectors_2.kv')

# building TSV for use in Embedding Projector
with open('my_embedding.tsv', 'a') as vectors_tsvfile:
    with open('my_metadata.tsv', 'a') as metadata_tsvfile:
        writer1 = csv.writer(vectors_tsvfile, delimiter='\t')
        writer2 = csv.writer(metadata_tsvfile, delimiter='\t')
        #writer2.writerow(['header:word','is_deep_tech']) # no need to write the header again
        
        words = my_embedding_2.wv.vocab.keys()
        for word in words:
            vector = my_embedding_2.wv.get_vector(word).tolist()
            writer1.writerow(vector)
            print(word)
            writer2.writerow([word, 0]) # writing 1 in 'is_deep_tech'

TypeError: __init__() got an unexpected keyword argument 'max_count'