## Modèle avancé

In [1]:
import pandas as pd
import os  
import json  
import re  
import string
import demoji
from tqdm import tqdm

import tensorflow as tf
import numpy as np
from joblib import Parallel, delayed
from gensim.models import Word2Vec  

from sklearn.model_selection import train_test_split  
from sklearn.metrics import accuracy_score
import sklearn
from tensorflow.keras import Sequential  
from tensorflow.keras.layers import Dense

import sys
import sklearn
import gensim
import tqdm as tq
import pickle

In [2]:
!curl -O https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/AI+Engineer/Project+7%C2%A0-+D%C3%A9tectez+les+Bad+Buzz+gr%C3%A2ce+au+Deep+Learning/sentiment140.zip
!unzip -o sentiment140.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.9M  100 80.9M    0     0  2558k      0  0:00:32  0:00:32 --:--:-- 4278k--:--:-- --:--:-- --:--:--     0   8 7359k    0     0   898k      0  0:01:32  0:00:08  0:01:24 1112k 0:00:46 2164k
Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [2]:
print("Python Version:", sys.version)  
print("TensorFlow Version:", tf.__version__)  
print("Scikit-Learn Version:", sklearn.__version__)  
print("Pandas Version:", pd.__version__)  
print("Demoji Version:", demoji.__version__) 
print("tqdm Version:", tq.__version__) 
print("gensim Version:", gensim.__version__) 

Python Version: 3.10.12 (main, Jul  5 2023, 15:02:25) [Clang 14.0.6 ]
TensorFlow Version: 2.15.0
Scikit-Learn Version: 1.2.2
Pandas Version: 1.5.3
Demoji Version: 1.1.0
tqdm Version: 4.66.1
gensim Version: 4.3.2


In [3]:
if tf.test.is_gpu_available():  
    print("GPU is enabled!")  
else:  
    print("GPU is not enabled.")  

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU is enabled!


2024-02-06 15:58:47.910541: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-02-06 15:58:47.910577: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2024-02-06 15:58:47.910590: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.00 GB
2024-02-06 15:58:47.910657: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-06 15:58:47.910774: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [4]:
cols = ['sentiment', 'timestamp', 'date', "query", "username", "comment"]  
df = pd.read_csv("./training.1600000.processed.noemoticon.csv", encoding='ISO-8859-1', header=None, names=cols)

# df = df.sample(n=100000, random_state=42)
len_df = len(df)

In [5]:
def clean_tweet(doc):  
  # Lower the code
  doc = doc.lower().strip()
  #remove emoji
  text = demoji.replace(doc, '')
  #remove links
  text = re.sub(r'http\S+|www.\S+', '', text)  
  # # Remove mentions
  text = re.sub(r'@\w+', '', text) 
  # Remove hashtag symbol but keep the text  
  text = re.sub(r'#(\w+)', r'\1', text)
  # Keep only alphanumeric characters and spaces  
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  # Remove multiple spaces (replace them with a single space)  
  text = re.sub(r'\s+', ' ', text).strip()
  
  return text
    
def clean_df(dataframe):
  df = dataframe.copy()
  # Keep only comment and sentiment columns
  df = df[["comment","sentiment"]]
  
  # negative field 0 = 0
  # Map positive field 4 = 1
  df.loc[df['sentiment'] == 4, 'sentiment'] = 1  
  
  # Clean the comment
  df['comment_clean'] = parallelize_on_rows(df['comment'], clean_tweet)  
  
  # Count the number of words from comment & comment_cleam
  df['words_nb'] = parallelize_on_rows(df['comment'], lambda x: len(x.split()))  
  df['words_nb_clean'] = parallelize_on_rows(df['comment_clean'], lambda x: len(x.split()))  
  
  # Only keep the clean words
  df = df[df['words_nb_clean'] > 3]
  
  # Remove duplicate
  df.drop_duplicates(subset='comment',inplace=True)
  
  return df


def parallelize_on_rows(data, func):  
    r = Parallel(n_jobs=-1)(delayed(func)(i) for i in tqdm(data, desc="Processing"))  
    return r  

In [6]:
df = clean_df(df)
print(df.shape)

Processing: 100%|██████████| 1600000/1600000 [01:20<00:00, 19932.33it/s]
Processing: 100%|██████████| 1600000/1600000 [00:02<00:00, 555328.66it/s]
Processing: 100%|██████████| 1600000/1600000 [00:02<00:00, 558699.20it/s]


(1469994, 5)


### Word2Vec Manual

In [7]:
np.random.seed(42)  
tf.random.set_seed(42)  
tf.keras.backend.set_floatx('float32')  
os.environ['TF_DETERMINISTIC_OPS'] = '1'  
os.environ['PYTHONHASHSEED'] = str(1)  

# Function to vectorize a comment based on mean of all word vectors in the comment  
def comment_to_vec(comment, model):  
    vec = np.zeros(model.vector_size)  
    num_words = 0  
    for word in comment:  
        if word in model.wv:  
            vec += model.wv[word]  
            num_words += 1  
    if num_words > 0:  
        vec /= num_words  
    return vec

comments = [row.split() for row in df['comment_clean']]  
word2vec_model = Word2Vec(comments, vector_size=100, window=5, min_count=1, workers=4)

# Vectorize all comments  
vectorized_comments = np.array([comment_to_vec(comment, word2vec_model) for comment in comments])  
# Preparing the labels  
labels = df['sentiment'].values  

# Split the dataset into training and test sets  
X_train, X_test, y_train, y_test = train_test_split(vectorized_comments, labels, test_size=0.2, random_state=42)  

model = Sequential([  
    Dense(32, activation='relu', input_dim=100),  # Reduced the layer sizes for simplicity  
    Dense(1, activation='sigmoid')  # Keeping the output layer same for binary classification  
])  

model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])  

# Train the model  
model.fit(X_train, y_train, epochs=20, batch_size=256, validation_split=0.1)

2024-02-06 16:01:17.158037: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-06 16:01:17.158060: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/20


2024-02-06 16:01:18.449209: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.




KeyboardInterrupt: 

In [None]:
def predict_sentiment(comment, model, vector_model):  
    cleaned_comment = clean_tweet(comment).split() # placeholder for actual preprocessing  
    vec = comment_to_vec(cleaned_comment, vector_model).reshape(1, -1)  
    prediction = model.predict(vec)
    
    return prediction
  
print("GPU")
  # Running with GPU  
with tf.device('/GPU:0'):  
    print(predict_sentiment("I am so sad, this is very bad news, terrible!", model, word2vec_model))
    print(predict_sentiment("I am so happy, this is very good news, congrats!", model, word2vec_model))

print("\nCPU")
    # Running with GPU  
with tf.device('/CPU:0'): 
    print(predict_sentiment("I am so sad, this is very bad news, terrible!", model, word2vec_model))
    print(predict_sentiment("I am so happy, this is very good news, congrats!", model, word2vec_model))

GPU
[[0.01718168]]
[[0.78033316]]

CPU
[[0.9999074]]
[[0.8725918]]
