In [1]:
import numpy
import json
import re
import spacy
import pandas as pd
from tqdm import tqdm
from pathlib import Path

In [2]:
def tokenize_comment(comment, nlp_model=None ) -> list:
    """
    Tokenize a `comment` by removing hyperlinks, punctuation and extra spaces.
    
    Parameter
    ---------
    comment  : str
        String to tokenize
    nlp_model: optional (default: None)
        NLP model to load for special tokenization (in particular stop words 
        removal during the tokenization filtering phase).

    Returns
    -------
    list(str)
        The list of tokens extracted from the original comment
    """
    re_punctuation = re.compile(r"[^\d|(a-z)|!||#|@|è|é|à|ù|ô|ü|ë|ä|û|î|ê|â|ç\s]")
    re_hyperlink   = re.compile(r"http\S+")
    re_extra_space = re.compile(r"\s+")
    re_repetition  = re.compile(r'(.+?)\1\1+')
    
    tokens = re_hyperlink.sub(' ', comment.lower())
    tokens = re_punctuation.sub(' ', tokens)
    tokens = re_extra_space.sub(' ', tokens)
    tokens = re_repetition.sub(r'\1\1\1', tokens)
    tokens = [ token for token in tokens.split() if len(token) > 1 ]

    # Filtering out stop words from the `tmp` list
    
    if nlp_model is not None:
        tokens = [ token for token in tokens if not token in nlp_model.Defaults.stop_words ]

    return tokens

In [4]:
json_file = Path("../data/json/train.json")

In [5]:
# Loading raw json file
with open(json_file, 'r', encoding='utf8') as file:   
    raw_dataset = json.load(file)

In [6]:
# Tokenization and comments registration
num_part   = 0
num_elts   = len(raw_dataset)
num_digits = len(f'{num_elts:_d}')

In [7]:
# Loading SpaCy model from tokenization utilities
print(f'\033[0;37mLoading SpaCy en_core_web_sm model..\033[0m', end=' ')
spacy.prefer_gpu()
nlp = spacy.load("fr_core_news_sm")

# Loading stop words
stop_words_path = Path('/home/jarod/git/allocine-sentiment-analysis/data/json/stopwords.json')
with open(stop_words_path, 'r', encoding='utf8') as file:   
    stop_words = json.load(file)
nlp.Defaults.stop_words = set(stop_words)
print('\033[0;34mDone!\033[0m')

[0;37mLoading SpaCy en_core_web_sm model..[0m [0;34mDone![0m


In [60]:
for idx, review in enumerate(raw_dataset):
    review['lst_mots']  = tokenize_comment(review['commentaire'], nlp)
    print(f'\033[0;37mProgress: \033[1;30m{idx:>{num_digits}_d}\033[0m /{num_elts:_d}', end='\r')
    
    if idx == 1500:
        break
    
print(f'\033[0;34mDone!\033[0m [ \033[0;37mnum_items: {num_elts:_d}\033[0m ]')

[0;34mDone![0m [ [0;37mnum_items: 665_962[0m ]


In [61]:
tokenize_comment("hellllo !!!!", nlp)

['helllo', '!!!']

In [62]:
df = pd.DataFrame(raw_dataset)

In [69]:
toy = df.head(1500)

In [70]:
filtered = toy[toy['lst_mots'].apply(lambda x: len(x)>0)]

In [71]:
filtered = toy[toy['lst_mots'].apply(lambda x: len(''.join(x))<2000)]

In [72]:
filtered = filtered.drop(columns=['name', 'user_id', 'commentaire', 'movie'])

In [74]:
filtered = filtered.head(1000)

In [75]:
filtered.to_csv('../data/csv/train_1000.csv', index=False)

In [31]:
lst = filtered.iloc[0]['lst_mots']

In [32]:
''.join(lst)

'pastrouvefacetientcôtéphrasetaglineveutrésumeparfaitementpassciencefictionjosephkosinskiintéressepompierséliteprofessionnelshommesdressetrèsbeauportraitbandepotesprêtssacrifierprotégerenvironnementflammesdavantagecentréquotidienhérosfacedilemmesempêchepasscènesterrainsoientsuperbesterriblesvraimentimpressionnantestémoigneexcellentfinalhistoireinspiréefaitsréelsconnaissezpasmieuxtempsdécouvrirtraversexpériencebienforteefficacetrèsbonhistoirepuissantepleinehumanitétraitementsobretrèsjamaisforcétandisplantechniquetrèssolidescènesgrandréalismesommetrèsbeaudivertissement'

In [81]:
tokens = nlp("Je mange du paain")

TypeError: Argument 'string' has incorrect type (expected str, got list)

In [82]:
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, len(token.vector))

Je True 46.5811 96
mange True 41.34777 96
du True 44.64348 96
paain True 45.513462 96
