# TD 5 Aurelien Pouxviel - First language model

In [30]:
import os

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [29]:
def load_reviews_from_directory(directory):
    """Read every ``.txt`` file in ``directory`` and label it from the directory name.

    Args:
        directory: Path of a folder containing one review per ``.txt`` file.
            A path ending in ``pos`` yields the label ``"positive"``, a path
            ending in ``neg`` yields ``"negative"``. A trailing path separator
            is ignored when the ending is checked.

    Returns:
        A ``(reviews, labels)`` tuple of lists. ``reviews`` holds the file
        contents in sorted filename order. ``labels`` has one entry per review
        when the directory is recognised, and is empty otherwise.
    """
    # Normalise only for the suffix check so ".../pos/" is treated like ".../pos".
    normalized = os.path.normpath(directory)
    if normalized.endswith("pos"):
        label = "positive"
    elif normalized.endswith("neg"):
        label = "negative"
    else:
        label = None  # unrecognised folder: reviews are returned unlabeled

    reviews = []
    # os.listdir gives no ordering guarantee; sorting keeps the corpus order
    # (and therefore any later seeded split) reproducible across machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), "r", encoding="utf-8") as file:
            reviews.append(file.read())

    labels = [label] * len(reviews) if label is not None else []
    return reviews, labels

# Root folder of the review corpus; it must contain "pos" and "neg" subfolders.
# Set the REVIEWS_DIR environment variable to run on another machine; the
# default is the path used when the notebook was first written.
reviews_dir = os.environ.get(
    "REVIEWS_DIR",
    "C:/Users/aurel/OneDrive - De Vinci/ONE DRIVE PC/A5/NLP/TD2/txt_sentoken",
)

# Load positive and negative reviews
positive_reviews, positive_labels = load_reviews_from_directory(f"{reviews_dir}/pos")
negative_reviews, negative_labels = load_reviews_from_directory(f"{reviews_dir}/neg")

# Fail early with a readable message instead of an IndexError in the prints below.
assert positive_reviews and negative_reviews, f"no .txt reviews found under {reviews_dir}"

# Combine positive and negative reviews and labels (positives first)
all_reviews = positive_reviews + negative_reviews
all_labels = positive_labels + negative_labels
assert len(all_reviews) == len(all_labels), "every review needs exactly one label"

# Example: print the first positive and negative reviews
print("Positive Review:")
print(positive_reviews[0])
print("Label:", positive_labels[0])

print("\nNegative Review:")
print(negative_reviews[0])
print("Label:", negative_labels[0])


Positive Review:
films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . 
for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . 
to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . 
the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . 
in other words , don't dismiss this film because of its source . 
if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes . 
getting the hughes brothers to direct this

## Tokenize

In [31]:
# Learn the vocabulary on the full corpus, then turn each review into a list
# of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_reviews)
sequences = tokenizer.texts_to_sequences(all_reviews)

# Force every review to exactly max_len ids. Both padding and truncation act
# on the front of the sequence, so only the last max_len tokens are kept.
max_len = 50  # choose an appropriate max_len
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_len,
    padding="pre",
    truncating="pre",
)


## Load Pre-trained Word Embeddings (GloVe):

In [32]:
embedding_dim = 50
# Location of the GloVe vectors. Set the GLOVE_PATH environment variable to
# run on another machine; the default is the original local download path.
glove_path = os.environ.get("GLOVE_PATH", 'C:/Users/aurel/Downloads/glove.6B.50d.txt')

# Maps each token to its embedding_dim-long float32 vector.
embeddings_index = {}
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        # Split from the right: the last embedding_dim fields are the numbers,
        # whatever is left is the token (which may itself contain whitespace).
        values = line.rsplit(None, embedding_dim)
        if len(values) != embedding_dim + 1:
            continue  # blank line, header line, or vector of the wrong size
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

## Create Embedding Matrix:

In [33]:
word_index = tokenizer.word_index
# One extra row: Tokenizer ids start at 1, and row 0 is left as zeros for the
# id 0 that pad_sequences uses as filler.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

# Copy the pre-trained vector of every vocabulary word that GloVe knows.
# Words without a GloVe entry keep their all-zero row.
# (No bounds check is needed: every id is <= len(word_index) < num_words.)
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


## Build Model:

In [34]:
# Architecture: frozen pre-trained embedding lookup -> flatten the
# (max_len, embedding_dim) output -> one sigmoid unit for the binary decision.
model = Sequential([
    Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        input_length=max_len,
        weights=[embedding_matrix],
        trainable=False,  # keep the GloVe vectors fixed during training
    ),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [40]:
# Hold out 20% of the reviews for validation; the fixed seed makes the split
# repeatable. The returned X arrays are already NumPy arrays, so no further
# conversion is needed.
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences, all_labels, test_size=0.2, random_state=42
)

# Turn the string labels into integers. LabelEncoder numbers classes in sorted
# order, so 'negative' -> 0 and 'positive' -> 1. Fit on the training labels
# only, then reuse the same mapping for validation.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train).astype(int)
y_val = label_encoder.transform(y_val).astype(int)

In [44]:
# Train the model
# Keep the returned History object so the per-epoch loss/accuracy can be
# inspected later; assigning it also stops the cell from echoing a bare repr.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1e3d3e50eb0>

In [51]:
# y_val is already integer-encoded by LabelEncoder (sorted classes, so
# 0 = negative and 1 = positive); no further conversion is needed here.
y_pred_prob = model.predict(X_val)

# Sanity check on the raw probabilities: a NaN here would mean training diverged.
# (The thresholded integer predictions can never contain NaN.)
print(np.isnan(y_pred_prob).any())

# Threshold at 0.5 and flatten the (n_samples, 1) model output to 1-D so it
# lines up with y_val and with the boolean masks below.
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

accuracy = accuracy_score(y_val, y_pred)
print("Overall Accuracy: {:.2f}%".format(accuracy * 100))

# Per-class accuracy: among the reviews whose true label is a given class,
# the share predicted correctly (i.e. the recall of that class).
positive_mask = y_val == 1
negative_mask = y_val == 0
accuracy_positive = accuracy_score(y_val[positive_mask], y_pred[positive_mask])
accuracy_negative = accuracy_score(y_val[negative_mask], y_pred[negative_mask])

print("Accuracy for Positive Class: {:.2f}%".format(accuracy_positive * 100))
print("Accuracy for Negative Class: {:.2f}%".format(accuracy_negative * 100))

False
Overall Accuracy: 62.25%
Accuracy for Positive Class: 64.82%
Accuracy for Negative Class: 59.70%


#### On Wp2, we had:

- Positive Reviews Accuracy: 90.60%
- Negative Reviews Accuracy: 31.90%

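### So our model is now much better balanced: both classes are recognised at similar rates (64.82% vs 59.70%), instead of 90.60% vs 31.90% on Wp2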