## Modèle avancé

In [6]:
import pandas as pd
import os  
import json  
import re  
import string
import demoji
from tqdm import tqdm

import tensorflow as tf
import numpy as np
from joblib import Parallel, delayed
from gensim.models import Word2Vec  

from sklearn.model_selection import train_test_split  
from sklearn.metrics import accuracy_score
import sklearn
from tensorflow.keras import Sequential  
from tensorflow.keras.layers import Dense

import sys
import sklearn
import gensim
import tqdm as tq
import pickle

In [7]:
print("Python Version:", sys.version)  
print("TensorFlow Version:", tf.__version__)  
print("Scikit-Learn Version:", sklearn.__version__)  
print("Pandas Version:", pd.__version__)  
print("Demoji Version:", demoji.__version__) 
print("tqdm Version:", tq.__version__) 
print("gensim Version:", gensim.__version__) 

Python Version: 3.10.12 (main, Jul  5 2023, 15:02:25) [Clang 14.0.6 ]
TensorFlow Version: 2.15.0
Scikit-Learn Version: 1.2.2
Pandas Version: 1.5.3
Demoji Version: 1.1.0
tqdm Version: 4.66.1
gensim Version: 4.3.2


In [8]:
if tf.config.list_physical_devices('GPU'):  
    print("GPU is enabled!")  
else:  
    print("GPU is not enabled.")  

GPU is enabled!


In [9]:
with open("./google/model.pkl", "rb") as file:  
    model = pickle.load(file)  
  
with open("./google/vector_model.pkl", "rb") as file:  
    vector_model = pickle.load(file)

2024-02-06 15:57:06.482441: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-06 15:57:06.482459: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [10]:
def comment_to_vec(comment, model):
    vec = np.zeros(model.vector_size)
    num_words = 0
    for word in comment:
        if word in model.wv:
            vec += model.wv[word]
            num_words += 1
    if num_words > 0:
        vec /= num_words
    return vec
  
def clean_tweet(doc):  
  # Lower the code
  doc = doc.lower().strip()
  #remove emoji
  text = demoji.replace(doc, '')
  #remove links
  text = re.sub(r'http\S+|www.\S+', '', text)  
  # # Remove mentions
  text = re.sub(r'@\w+', '', text) 
  # Remove hashtag symbol but keep the text  
  text = re.sub(r'#(\w+)', r'\1', text)
  # Keep only alphanumeric characters and spaces  
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  # Remove multiple spaces (replace them with a single space)  
  text = re.sub(r'\s+', ' ', text).strip()
  
  return text
    
def clean_df(dataframe):
  df = dataframe.copy()
  # Keep only comment and sentiment columns
  df = df[["comment","sentiment"]]
  
  # negative field 0 = 0
  # Map positive field 4 = 1
  df.loc[df['sentiment'] == 4, 'sentiment'] = 1  
  
  # Clean the comment
  df['comment_clean'] = parallelize_on_rows(df['comment'], clean_tweet)  
  
  # Count the number of words from comment & comment_cleam
  df['words_nb'] = parallelize_on_rows(df['comment'], lambda x: len(x.split()))  
  df['words_nb_clean'] = parallelize_on_rows(df['comment_clean'], lambda x: len(x.split()))  
  
  # Only keep the clean words
  df = df[df['words_nb_clean'] > 3]
  
  # Remove duplicate
  df.drop_duplicates(subset='comment',inplace=True)
  
  return df


def parallelize_on_rows(data, func):  
    r = Parallel(n_jobs=-1)(delayed(func)(i) for i in tqdm(data, desc="Processing"))  
    return r  

In [11]:
def predict_sentiment(comment, model, vector_model):
  cleaned_comment = comment.split() # placeholder for actual preprocessing
  vec = comment_to_vec(cleaned_comment, vector_model).reshape(1, -1)
  prediction = model.predict(vec)

  return prediction

In [12]:
print("GPU")
with tf.device('/GPU:0'):
    print(predict_sentiment("I am so sad, this is very bad news, terrible!", model, vector_model))
    print(predict_sentiment("I am so happy, this is very good news, congrats!", model, vector_model))

print("\nCPU")
with tf.device('/CPU:0'):
  print(predict_sentiment("I am so sad, this is very bad news, terrible!", model, vector_model))
  print(predict_sentiment("I am so happy, this is very good news, congrats!", model, vector_model))

GPU
[[0.9649106]]
[[0.99831223]]

CPU
[[0.08758987]]
[[0.85165584]]


2024-02-06 15:57:06.867469: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
