## Modèle avancé

In [3]:
import pandas as pd
import os  
import json  
import re  
import string
import demoji
from tqdm import tqdm

import tensorflow as tf
import numpy as np
from joblib import Parallel, delayed
from gensim.models import Word2Vec  

from sklearn.model_selection import train_test_split  
from sklearn.metrics import accuracy_score

from tensorflow.keras import Sequential  
from tensorflow.keras.layers import Dense


In [4]:
cols = ['sentiment', 'timestamp', 'date', "query", "username", "comment"]  
df = pd.read_csv("./../input/training.1600000.processed.noemoticon.csv", header=None, names=cols)

df = df.sample(n=100000, random_state=42)
len_df = len(df)

In [7]:
def clean_tweet(doc):  
  # Lower the code
  doc = doc.lower().strip()
  #remove emoji
  text = demoji.replace(doc, '')
  #remove links
  text = re.sub(r'http\S+|www.\S+', '', text)  
  # # Remove mentions
  text = re.sub(r'@\w+', '', text) 
  # Remove hashtag symbol but keep the text  
  text = re.sub(r'#(\w+)', r'\1', text)
  # Keep only alphanumeric characters and spaces  
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  # Remove multiple spaces (replace them with a single space)  
  text = re.sub(r'\s+', ' ', text).strip()
  
  return text
    
def clean_df(dataframe):
  df = dataframe.copy()
  # Keep only comment and sentiment columns
  df = df[["comment","sentiment"]]
  
  # negative field 0 = 0
  # Map positive field 4 = 1
  df.loc[df['sentiment'] == 4, 'sentiment'] = 1  
  
  # Clean the comment
  df['comment_clean'] = parallelize_on_rows(df['comment'], clean_tweet)  
  
  # Count the number of words from comment & comment_cleam
  df['words_nb'] = parallelize_on_rows(df['comment'], lambda x: len(x.split()))  
  df['words_nb_clean'] = parallelize_on_rows(df['comment_clean'], lambda x: len(x.split()))  
  
  # Only keep the clean words
  df = df[df['words_nb_clean'] > 3]
  
  # Remove duplicate
  df.drop_duplicates(subset='comment',inplace=True)
  
  return df


def parallelize_on_rows(data, func):  
    r = Parallel(n_jobs=-1)(delayed(func)(i) for i in tqdm(data, desc="Processing"))  
    return r  

In [8]:
df = clean_df(df)
print(df.shape)

Processing: 100%|██████████| 100000/100000 [00:05<00:00, 19054.91it/s]
Processing: 100%|██████████| 100000/100000 [00:00<00:00, 448879.59it/s]
Processing: 100%|██████████| 100000/100000 [00:00<00:00, 480977.21it/s]

(92469, 5)





## SIMPLE

In [None]:
# Copy original dataframe
df_simple_approach = df.copy()

X = df_simple_approach['comment_clean']
y = df_simple_approach['sentiment']

# Step : Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  

# Step 2: Feature extraction
direct_vectorizer = CountVectorizer()  
X_train_counts = direct_vectorizer.fit_transform(X_train)  
X_test_counts = direct_vectorizer.transform(X_test)  

print("CountVectorizer Feature Extraction")
print("X train before", X_train.shape)
print("X train after", X_train_counts.shape)

print("\nTraining the model...")
direct_model = LogisticRegression(max_iter=1000)  
direct_model.fit(X_train_counts, y_train)  
print("Training done")

y_pred = direct_model.predict(X_test_counts)  

print(f"\nAccuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")  

CountVectorizer Feature Extraction
X train before (73975,)
X train after (73975, 54815)

Training the model...
Training done

Accuracy: 77.48%


In [None]:
def predict_simple(text, model, tokenizer):
    X = vectorizer.transform([text])
    prediction = model.predict(X)
    return prediction[0].item()

In [None]:
# Running with GPU  
with tf.device('/GPU:0'):  
    print(predict_simple("I am so sad, this is very bad news, terrible!", direct_model, direct_vectorizer))
    print(predict_simple("I am so happy, this is very good news, congrats!", direct_model, direct_vectorizer))

0
1


2024-02-06 11:27:52.954056: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-02-06 11:27:52.954134: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2024-02-06 11:27:52.954167: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.00 GB
2024-02-06 11:27:52.954226: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-06 11:27:52.954253: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
# Running with GPU  
with tf.device('/CPU:0'):  
    print(predict_simple("I am so sad, this is very bad news, terrible!", direct_model, direct_vectorizer))
    print(predict_simple("I am so happy, this is very good news, congrats!", direct_model, direct_vectorizer))

0
1


### Word2Vec Manual

In [85]:
# TODO WEIGHT ATTRIBUTION TO FIXED VALUE
np.random.seed(42)  
tf.random.set_seed(42)  
tf.keras.backend.set_floatx('float32')  
os.environ['TF_DETERMINISTIC_OPS'] = '1'  
os.environ['PYTHONHASHSEED'] = str(1)  

# Function to vectorize a comment based on mean of all word vectors in the comment  
def comment_to_vec(comment, model):  
    vec = np.zeros(model.vector_size)  
    num_words = 0  
    for word in comment:  
        if word in model.wv:  
            vec += model.wv[word]  
            num_words += 1  
    if num_words > 0:  
        vec /= num_words  
    return vec

comments = [row.split() for row in df['comment_clean']]  
word2vec_model = Word2Vec(comments, vector_size=100, window=5, min_count=1, workers=4)

# Vectorize all comments  
vectorized_comments = np.array([comment_to_vec(comment, word2vec_model) for comment in comments])  
# Preparing the labels  
labels = df['sentiment'].values  

# Split the dataset into training and test sets  
X_train, X_test, y_train, y_test = train_test_split(vectorized_comments, labels, test_size=0.2, random_state=42)  

model = Sequential([  
    Dense(32, activation='relu', input_dim=100),  # Reduced the layer sizes for simplicity  
    Dense(1, activation='sigmoid')  # Keeping the output layer same for binary classification  
])  

model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.00001), loss='binary_crossentropy', metrics=['accuracy'])  

# Train the model  
model.fit(X_train, y_train, epochs=20, batch_size=256, validation_data=(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x174160650>

In [87]:
def predict_sentiment(comment, model, vector_model):  
    cleaned_comment = comment.split() # placeholder for actual preprocessing  
    vec = comment_to_vec(cleaned_comment, vector_model).reshape(1, -1)  
    prediction = model.predict(vec)
    
    return prediction 
  
print("GPU")
  # Running with GPU  
with tf.device('/GPU:0'):  
    print(predict_sentiment("I am so sad, this is very bad news, terrible!", model, word2vec_model))
    print(predict_sentiment("I am so happy, this is very good news, congrats!", model, word2vec_model))
    

print("\nCPU")
    # Running with GPU  
with tf.device('/CPU:0'): 
    print(predict_sentiment("I am so sad, this is very bad news, terrible!", model, word2vec_model))
    print(predict_sentiment("I am so happy, this is very good news, congrats!", model, word2vec_model))

GPU
[[0.19233872]]
[[0.359747]]

CPU
[[0.29876477]]
[[0.50094825]]


### Word2Vec From Helper

In [None]:
df_w2vec = df.copy()

config = {
    "vector_size": 100,
    # Set input length to be the max from the number of words
    "input_length": df_w2vec["words_nb_clean"].max(),
    # Word before & after for word2vec
    "window": 5,
    # Max number of workers
    "workers": 8,
    # Vocal length
    "vocab_length": 14000
}

(model_w2vec, tokenizer_w2vec, callbacks_w2vec, X_w2vec, y_w2vec) = get_w2vec_data(config, df_w2vec)


W2vec Vocabulary Length (you can adjust the vocal length): 8196
Embedding Matrix Shape: (14000, 100)


2024-02-06 11:12:44.815860: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-02-06 11:12:44.815919: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2024-02-06 11:12:44.815927: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.00 GB
2024-02-06 11:12:44.815961: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-06 11:12:44.815980: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
model_w2vec.summary()

Model: "Sentiment_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 100)           1400000   
                                                                 
 bidirectional (Bidirection  (None, 40, 200)           160800    
 al)                                                             
                                                                 
 conv1d (Conv1D)             (None, 36, 100)           100100    
                                                                 
 global_max_pooling1d (Glob  (None, 100)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 16)                1616      
                                                                 
 dense_1 (Dense)             (None, 1)             

In [None]:
space = {
    "input_length": config["input_length"],
    'experiment_name': f"Tweet Sentiment - {len_df} - w2vec",
    'batch_size': hp.choice('batch_size', [256]),  
    'epochs': hp.choice('epochs', [8]),  
    'learning_rate': hp.uniform('learning_rate', 0.0001, 0.001),  
}

In [None]:
trials = Trials()
fmin(  
    fn=lambda params: objective(params, X_w2vec, y_w2vec, model_w2vec, callbacks_w2vec, tokenizer_w2vec, False),  
    space=space,  
    algo=tpe.suggest,  
    max_evals=1,
    trials=trials  
)

Running one fit with the params:                     
{'batch_size': 256, 'epochs': 8, 'experiment_name': 'Tweet Sentiment - 100000 - w2vec', 'input_length': 40, 'learning_rate': 0.0009507858259624248}
Epoch 1/8                                            

  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]

2024-02-06 11:12:46.328052: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


  1/228 [..............................] - ETA: 6:26 - loss: 0.6936 - accuracy: 0.5156
  2/228 [..............................] - ETA: 18s - loss: 0.6851 - accuracy: 0.5469 
  4/228 [..............................] - ETA: 12s - loss: 0.6750 - accuracy: 0.5654
  6/228 [..............................] - ETA: 11s - loss: 0.6614 - accuracy: 0.5924
  8/228 [>.............................] - ETA: 11s - loss: 0.6571 - accuracy: 0.6016
  9/228 [>.............................] - ETA: 11s - loss: 0.6569 - accuracy: 0.6020
 10/228 [>.............................] - ETA: 11s - loss: 0.6530 - accuracy: 0.6066
 12/228 [>.............................] - ETA: 11s - loss: 0.6498 - accuracy: 0.6152
 13/228 [>.............................] - ETA: 11s - loss: 0.6474 - accuracy: 0.6178
 15/228 [>.............................] - ETA: 11s - loss: 0.6444 - accuracy: 0.6221
 17/228 [=>............................] - ETA: 10s - loss: 0.6422 - accuracy: 0.6264
 18/228 [=>............................] - ETA: 10s 

{'batch_size': 0, 'epochs': 0, 'learning_rate': 0.0009507858259624248}

In [None]:
def pad_sequences(sequence, maxlen, value=0):  
    return np.array([np.pad(s[:maxlen], (max(0, maxlen-len(s)), 0), 'constant', constant_values=value) if len(s) < maxlen else s[:maxlen] for s in sequence])  

def predict(text_to_predict, model, tokenizer):
    # Tokenizing and padding  
    sequence = tokenizer.texts_to_sequences(text_to_predict)
    # We need to pad sequences to ensure uniform input size  
    padded_sequence = pad_sequences(sequence, maxlen=int(space["input_length"]))
    
    prediction = model.predict(padded_sequence)
    sentiment = prediction.astype(float)[0][0]
    return sentiment.item()

## DIRECT TEST

### Testing model not loaded with GPU

In [None]:
# Running with GPU  
with tf.device('/GPU:0'):  
    print(predict(["I am so sad, this is very bad news, terrible!"], model_w2vec, tokenizer_w2vec))
    print(predict(["I am so happy, this is very good news, congrats!"], model_w2vec, tokenizer_w2vec))

0.11283314228057861
0.8007263541221619


### Testing model not loaded with CPU

In [None]:
# Running with GPU
with tf.device('/CPU:0'):  
    print(predict(["I am so sad, this is very bad news, terrible!"], model_w2vec, tokenizer_w2vec))
    print(predict(["I am so happy, this is very good news, congrats!"], model_w2vec, tokenizer_w2vec))

0.9790797233581543
0.9946383833885193


## PICKLE LOAD

In [None]:
with open("./model.pkl", "rb") as file:  
    model = pickle.load(file)  
  
with open("./tokenizer.pkl", "rb") as file:  
    tokenizer = pickle.load(file)

### Testing model loaded with GPU

In [None]:
# Running with GPU  
with tf.device('/GPU:0'):  
    print(predict(["I am so sad, this is very bad news, terrible!"], model, tokenizer))
    print(predict(["I am so happy, this is very good news, congrats!"], model, tokenizer))

0.11283314228057861
0.8007263541221619


### Testing model loaded with CPU

In [None]:
# Running with GPU  
with tf.device('/CPU:0'):  
    print(predict(["I am so sad, this is very bad news, terrible!"], model, tokenizer))
    print(predict(["I am so happy, this is very good news, congrats!"], model, tokenizer))

0.9790797233581543
0.9946383833885193
