In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import re
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression

#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')
def transform_sentiment_to_number(row):
    if row == 'positive':
        return 2
    elif row == 'neutral':
        return 1
    elif row == 'negative':
        return 0
    return None

def map_sentiment(row):
    if row == 'positive':
        return 'positive'
    elif row == 'neutral':
        return 'neutral'
    elif row == 'negative':
        return 'negative'
    elif row == 'Positive':
        return 'positive'
    elif row == 'Neutral':
        return 'neutral'
    elif row == 'Negative':
        return 'negative'
    elif row == 'Irrelevant':
        return 'neutral'
    return None

def preprocess_text(text):
    # Elimină caracterele non-alfabetice
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Transformă textul în litere mici
    text = text.lower()
    # Tokenizare și eliminarea stopwords + lematizare
    text = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    text = ' '.join(text)
    return text


KeyboardInterrupt: 

In [2]:
# Citirea dataseturilor și extragerea textului și a sentimentelor
data_tweets = pd.read_csv('https://raw.githubusercontent.com/Dackohn/ExcaliburAiProject/refs/heads/main/datasets/Tweets.csv')
data_tweets=data_tweets[['text','airline_sentiment']]

data_sentiment = pd.read_csv('https://raw.githubusercontent.com/Dackohn/ExcaliburAiProject/refs/heads/main/datasets/sentimentdataset.csv')
data_sentiment=data_sentiment[['Text','Sentiment']]

data_imdb = pd.read_csv('https://raw.githubusercontent.com/Dackohn/ExcaliburAiProject/refs/heads/main/datasets/IMDB-Dataset.csv')

data_text2 = pd.read_csv('https://raw.githubusercontent.com/Dackohn/ExcaliburAiProject/refs/heads/main/datasets/train.csv', encoding='unicode_escape')
data_text2 = data_text2[['text','sentiment']]

data_text1 = pd.read_csv('https://raw.githubusercontent.com/Dackohn/ExcaliburAiProject/refs/heads/main/datasets/test.csv', encoding='unicode_escape')
data_text1 = data_text1[['text','sentiment']]

data_titter = pd.read_csv('https://raw.githubusercontent.com/Dackohn/ExcaliburAiProject/refs/heads/main/datasets/twitter_training.csv', encoding='unicode_escape')
nume_col = ['Coloana1', 'Coloana2','sentiment', 'text']
data_titter.columns = nume_col
data_titter = data_titter[['text','sentiment']]


In [30]:
# Redenumirea coloanelor pentru consistență
data_imdb = data_imdb.rename(columns={
    'review': 'text',
    'sentiment': 'sentiment'
})

data_tweets = data_tweets.rename(columns={
    'review': 'text',
    'airline_sentiment': 'sentiment'
})

data_sentiment = data_sentiment.rename(columns={
    'Text': 'text',
    'Sentiment': 'sentiment'
})

In [53]:
# Combinarea tuturor dataseturilor într-un singur DataFrame
data_combined = pd.concat([data_titter, data_imdb, data_sentiment, data_text1, data_text2, data_tweets], ignore_index=True)
# Normalizarea sentimentelor
data_combined['sentiment'] = data_combined['sentiment'].apply(map_sentiment)
# Eliminarea valorilor lipsă
data_combined = data_combined.dropna(subset=['sentiment'])
data_combined = data_combined.dropna(subset=['text'])

# Balansare dataset
X = data_combined['text']
y = data_combined['sentiment']

rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_res, y_res = rus.fit_resample(X.values.reshape(-1, 1), y)

data_combined_balanced = pd.DataFrame({
    'text': X_res.flatten(),
    'sentiment': y_res
})
# Actualizați 'data_combined' cu dataset-ul echilibrat
data_combined = data_combined_balanced

In [54]:
data_combined['sentiment']


0         negative
1         negative
2         negative
3         negative
4         negative
            ...   
139879    positive
139880    positive
139881    positive
139882    positive
139883    positive
Name: sentiment, Length: 139884, dtype: object

In [55]:
# Definirea stopwords și lematizatorului
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocesarea textului
data_combined['text'] = data_combined['text'].apply(preprocess_text)
data_combined['sentiment']=data_combined['sentiment'].apply(transform_sentiment_to_number)

X_res = data_combined['text']
y_res = data_combined['sentiment']

In [6]:
data_combined['sentiment']=data_combined['sentiment'].apply(transform_sentiment_to_number)

In [40]:
data_combined['sentiment'].apply(transform_sentiment_to_number).values

array([None, None, None, ..., None, None, None], dtype=object)

In [56]:
data_combined['sentiment']

0         0
1         0
2         0
3         0
4         0
         ..
139879    2
139880    2
139881    2
139882    2
139883    2
Name: sentiment, Length: 139884, dtype: int64

In [57]:
y_res

0         0
1         0
2         0
3         0
4         0
         ..
139879    2
139880    2
139881    2
139882    2
139883    2
Name: sentiment, Length: 139884, dtype: int64

In [58]:
y_res

0         0
1         0
2         0
3         0
4         0
         ..
139879    2
139880    2
139881    2
139882    2
139883    2
Name: sentiment, Length: 139884, dtype: int64

In [59]:
# Vectorizare text
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(X_res)

# Transformarea sentimentelor în numere
y = y_res

# Implementare K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = LogisticRegression(max_iter=1000)
scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
print("K-Fold Cross Validation Accuracy Scores:", scores)
print("Mean Accuracy:", scores.mean())

K-Fold Cross Validation Accuracy Scores: [0.68055903 0.67866462 0.67534046 0.67580513 0.67933228]
Mean Accuracy: 0.6779403033369182


In [70]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

# 1. Pregătirea datelor
# Tokenizare și transformare în secvențe
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(data_combined['text'])
sequences = tokenizer.texts_to_sequences(data_combined['text'])

# Padding pentru a uniformiza lungimea secvențelor
max_length = 18
X = pad_sequences(sequences, maxlen=max_length, padding='post')

# Transformare etichete (y) în format numeric
y = y_res

# Împărțire în seturi de antrenare și testare
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Crearea modelului GRU
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_length),
    GRU(64, return_sequences=False),  # GRU cu 64 de unități
    Dropout(0.5),                     # Dropout pentru regularizare
    Dense(32, activation='relu'),     # Fully connected layer
    Dense(3, activation='softmax')    # 3 clase (pozitiv, negativ, neutru)
])

# 3. Compilarea modelului
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# 4. Antrenarea modelului
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,  # Ajustați în funcție de resurse și date
    batch_size=64,
    verbose=1
)

# 5. Evaluarea performanței
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 1/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 17ms/step - accuracy: 0.5606 - loss: 0.8901 - val_accuracy: 0.6964 - val_loss: 0.6848
Epoch 2/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 18ms/step - accuracy: 0.7375 - loss: 0.6225 - val_accuracy: 0.7252 - val_loss: 0.6391
Epoch 3/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 18ms/step - accuracy: 0.7789 - loss: 0.5372 - val_accuracy: 0.7356 - val_loss: 0.6282
Epoch 4/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 16ms/step - accuracy: 0.8184 - loss: 0.4563 - val_accuracy: 0.7368 - val_loss: 0.6465
Epoch 5/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 17ms/step - accuracy: 0.8464 - loss: 0.3913 - val_accuracy: 0.7497 - val_loss: 0.6646
Epoch 6/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 18ms/step - accuracy: 0.8684 - loss: 0.3394 - val_accuracy: 0.7462 - val_loss: 0.7120
Epoc

In [None]:
#workin

In [61]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

# 1. Pregătirea datelor
num_words = 30000
max_length = 180
embedding_dim = 100

tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
tokenizer.fit_on_texts(data_combined['text'])
sequences = tokenizer.texts_to_sequences(data_combined['text'])
X = pad_sequences(sequences, maxlen=max_length, padding='post')
y = y_res

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 2. Pregătirea embedding-ului GloVe
embedding_index = {}
with open('glove.6B/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.array(values[1:], dtype='float32')
        embedding_index[word] = vectors

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < num_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# 3. Crearea modelului
model = Sequential([
    Embedding(input_dim=num_words, 
              output_dim=embedding_dim, 
              weights=[embedding_matrix], 
              input_length=max_length, 
              trainable=False),
    GRU(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])

# 4. Compilarea și antrenarea modelului
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, verbose=1)

# 5. Evaluarea performanței
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 1/10




[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 53ms/step - accuracy: 0.4118 - loss: 1.0317 - val_accuracy: 0.5612 - val_loss: 0.8758
Epoch 2/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 66ms/step - accuracy: 0.5605 - loss: 0.8806 - val_accuracy: 0.5705 - val_loss: 0.8606
Epoch 3/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 65ms/step - accuracy: 0.5868 - loss: 0.8086 - val_accuracy: 0.6536 - val_loss: 0.7409
Epoch 4/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 64ms/step - accuracy: 0.6650 - loss: 0.7267 - val_accuracy: 0.6901 - val_loss: 0.6865
Epoch 5/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 71ms/step - accuracy: 0.6980 - loss: 0.6756 - val_accuracy: 0.7042 - val_loss: 0.6614
Epoch 6/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 67ms/step - accuracy: 0.7197 - loss: 0.6452 - val_accuracy: 0.7128 - val_loss: 0.6489
Epoch 7/10

int32 int32
object object


In [66]:
from sklearn.model_selection import KFold
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Numărul de folds
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Pregătește datele
X = pad_sequences(sequences, maxlen=max_length, padding='post')
y = y_res  # Asigură-te că `y_res` este corect pregătit

# Pentru stocarea rezultatelor
fold_accuracies = []

# K-Fold Cross-Validation
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Processing fold {fold + 1}/{num_folds}...")
    
    # Împarte datele în train și test
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Creează modelul pentru fiecare fold
    model = Sequential([
        Embedding(input_dim=num_words, 
                  output_dim=embedding_dim, 
                  weights=[embedding_matrix], 
                  input_length=max_length, 
                  trainable=False),
        GRU(64, return_sequences=False),
        Dropout(0.5),
        Dense(32, activation='relu'),
        Dense(3, activation='softmax')
    ])
    
    # Compilează modelul
    model.compile(optimizer=Adam(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    
    # Antrenează modelul
    model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)
    
    # Evaluează modelul
    loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
    print(f"Fold {fold + 1} - Test Accuracy: {accuracy:.4f}")
    fold_accuracies.append(accuracy)

# Media acurateței pe toate fold-urile
average_accuracy = np.mean(fold_accuracies)
print(f"Average Accuracy across {num_folds} folds: {average_accuracy:.4f}")


Processing fold 1/5...
Epoch 1/10




[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 67ms/step - accuracy: 0.3909 - loss: 1.0503
Epoch 2/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 66ms/step - accuracy: 0.5835 - loss: 0.8195
Epoch 3/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 67ms/step - accuracy: 0.6066 - loss: 0.7874
Epoch 4/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 70ms/step - accuracy: 0.6207 - loss: 0.7599
Epoch 5/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 65ms/step - accuracy: 0.6318 - loss: 0.7370
Epoch 6/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 63ms/step - accuracy: 0.6871 - loss: 0.6914
Epoch 7/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 66ms/step - accuracy: 0.7306 - loss: 0.6236
Epoch 8/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 65ms/step - accuracy: 0.7531 - loss: 0.5898
Epoch 9/10




[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 62ms/step - accuracy: 0.4203 - loss: 1.0225
Epoch 2/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 62ms/step - accuracy: 0.5845 - loss: 0.8253
Epoch 3/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 70ms/step - accuracy: 0.6856 - loss: 0.7011
Epoch 4/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 66ms/step - accuracy: 0.7170 - loss: 0.6528
Epoch 5/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 65ms/step - accuracy: 0.7385 - loss: 0.6143
Epoch 6/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 63ms/step - accuracy: 0.7603 - loss: 0.5751
Epoch 7/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 58ms/step - accuracy: 0.7749 - loss: 0.5458
Epoch 8/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 59ms/step - accuracy: 0.7889 - loss: 0.5164
Epoch 9/10




[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 57ms/step - accuracy: 0.3891 - loss: 1.0504
Epoch 2/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 56ms/step - accuracy: 0.5667 - loss: 0.8683
Epoch 3/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 57ms/step - accuracy: 0.6057 - loss: 0.7841
Epoch 4/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 59ms/step - accuracy: 0.6895 - loss: 0.6972
Epoch 5/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 58ms/step - accuracy: 0.7245 - loss: 0.6401
Epoch 6/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 58ms/step - accuracy: 0.7448 - loss: 0.6038
Epoch 7/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 58ms/step - accuracy: 0.7646 - loss: 0.5681
Epoch 8/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 58ms/step - accuracy: 0.7788 - loss: 0.5399
Epoch 9/10




[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 58ms/step - accuracy: 0.4251 - loss: 1.0201
Epoch 2/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 58ms/step - accuracy: 0.5678 - loss: 0.8663
Epoch 3/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 58ms/step - accuracy: 0.5851 - loss: 0.8208
Epoch 4/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 58ms/step - accuracy: 0.6730 - loss: 0.7128
Epoch 5/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 67ms/step - accuracy: 0.7138 - loss: 0.6556
Epoch 6/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 62ms/step - accuracy: 0.7362 - loss: 0.6110
Epoch 7/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 59ms/step - accuracy: 0.7593 - loss: 0.5735
Epoch 8/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 63ms/step - accuracy: 0.7801 - loss: 0.5342
Epoch 9/10




[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 62ms/step - accuracy: 0.4122 - loss: 1.0298
Epoch 2/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 66ms/step - accuracy: 0.5766 - loss: 0.8558
Epoch 3/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 61ms/step - accuracy: 0.6753 - loss: 0.7139
Epoch 4/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 62ms/step - accuracy: 0.7177 - loss: 0.6496
Epoch 5/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 68ms/step - accuracy: 0.7441 - loss: 0.6044
Epoch 6/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 62ms/step - accuracy: 0.7597 - loss: 0.5701
Epoch 7/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 65ms/step - accuracy: 0.7790 - loss: 0.5351
Epoch 8/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 63ms/step - accuracy: 0.7924 - loss: 0.5060
Epoch 9/10


In [65]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

# 1. Pregătirea datelor
num_words = 5000
max_length = 180
embedding_dim =300

tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
tokenizer.fit_on_texts(data_combined['text'])
sequences = tokenizer.texts_to_sequences(data_combined['text'])
X = pad_sequences(sequences, maxlen=max_length, padding='post')
y = y_res

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 2. Pregătirea embedding-ului GloVe
embedding_index = {}

with open('glove.6B/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.array(values[1:], dtype='float32')
        embedding_index[word] = vectors

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < num_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# 3. Crearea modelului
from tensorflow.keras.regularizers import l2

model = Sequential([
    Embedding(input_dim=num_words, 
              output_dim=embedding_dim, 
              weights=[embedding_matrix], 
              input_length=max_length, 
              trainable=False),
    GRU(128, return_sequences=True),
    Dropout(0.5),
    GRU(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),  # Adaugă regularizator L2
    Dense(32, activation='relu', kernel_regularizer=l2(0.01)),  # Adaugă regularizator L2
    Dense(3, activation='softmax')  # Fără regularizare aici
])



# 4. Compilarea și antrenarea modelului
from tensorflow.keras.optimizers import Adam

model.compile(optimizer=Adam(learning_rate=0.0001),  # Reduce learning rate
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(X_train, y_train, validation_data=(X_test, y_test), 
                    epochs=30, batch_size=64, verbose=1, callbacks=[early_stopping])


# 5. Evaluarea performanței
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {accuracy:.4f}")

from sklearn.metrics import classification_report

y_pred = model.predict(X_test).argmax(axis=1)
print(classification_report(y_test, y_pred))


Epoch 1/30
[1m 732/1749[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m3:04[0m 182ms/step - accuracy: 0.3619 - loss: 1.9359

KeyboardInterrupt: 

In [7]:
import numpy as np
text_lengths = data_combined['text'].apply(lambda x: len(x.split()))
print(np.percentile(text_lengths, [50, 75, 90, 95]))  # Percentile pentru lungimi



[ 12.  42. 114. 179.]


In [8]:
from collections import Counter
all_words = ' '.join(data_combined['text']).split()
word_counts = Counter(all_words)
print(len(word_counts))  # Total cuvinte unice


103481


In [10]:
import numpy as np
embedding_index = {}
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs


In [15]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

# 1. Pregătirea datelor
# Tokenizare și transformare în secvențe
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")


In [17]:
num_words=10000

In [18]:
embedding_dim = 100  # Depinde de versiunea descărcată (ex: 50, 100, 200, 300)
word_index = tokenizer.word_index
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i < num_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


In [20]:
max_length = 100

In [21]:
embedding_layer = Embedding(input_dim=num_words, 
                             output_dim=embedding_dim,
                             weights=[embedding_matrix],
                             input_length=max_length,
                             trainable=False)  # Freezing embedding weights
model = Sequential([
    embedding_layer,
    GRU(64, return_sequences=False),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])




In [9]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import re
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression


#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')
def transform_sentiment_to_number(row):
    if row == 'positive':
        return 2
    elif row == 'neutral':
        return 1
    elif row == 'negative':
        return 0
    return None

def map_sentiment(row):
    if row == 'positive':
        return 'positive'
    elif row == 'neutral':
        return 'neutral'
    elif row == 'negative':
        return 'negative'
    elif row == 'Positive':
        return 'positive'
    elif row == 'Neutral':
        return 'neutral'
    elif row == 'Negative':
        return 'negative'
    elif row == 'Irrelevant':
        return 'neutral'
    return None

def preprocess_text(text):
    # Elimină caracterele non-alfabetice
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Transformă textul în litere mici
    text = text.lower()
    # Tokenizare și eliminarea stopwords + lematizare
    text = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    text = ' '.join(text)
    return text



# Citirea dataseturilor și extragerea textului și a sentimentelor
data_tweets = pd.read_csv('https://raw.githubusercontent.com/Dackohn/ExcaliburAiProject/refs/heads/main/datasets/Tweets.csv')
data_tweets=data_tweets[['text','airline_sentiment']]

data_sentiment = pd.read_csv('https://raw.githubusercontent.com/Dackohn/ExcaliburAiProject/refs/heads/main/datasets/sentimentdataset.csv')
data_sentiment=data_sentiment[['Text','Sentiment']]

data_imdb = pd.read_csv('https://raw.githubusercontent.com/Dackohn/ExcaliburAiProject/refs/heads/main/datasets/IMDB-Dataset.csv')

data_text2 = pd.read_csv('https://raw.githubusercontent.com/Dackohn/ExcaliburAiProject/refs/heads/main/datasets/train.csv', encoding='unicode_escape')
data_text2 = data_text2[['text','sentiment']]

data_text1 = pd.read_csv('https://raw.githubusercontent.com/Dackohn/ExcaliburAiProject/refs/heads/main/datasets/test.csv', encoding='unicode_escape')
data_text1 = data_text1[['text','sentiment']]

data_titter = pd.read_csv('https://raw.githubusercontent.com/Dackohn/ExcaliburAiProject/refs/heads/main/datasets/twitter_training.csv', encoding='unicode_escape')
nume_col = ['Coloana1', 'Coloana2','sentiment', 'text']
data_titter.columns = nume_col
data_titter = data_titter[['text','sentiment']]

# Redenumirea coloanelor pentru consistență
data_imdb = data_imdb.rename(columns={
    'review': 'text',
    'sentiment': 'sentiment'
})

data_tweets = data_tweets.rename(columns={
    'review': 'text',
    'airline_sentiment': 'sentiment'
})

data_sentiment = data_sentiment.rename(columns={
    'Text': 'text',
    'Sentiment': 'sentiment'
})

# Combinarea tuturor dataseturilor într-un singur DataFrame
data_combined = pd.concat([data_titter, data_imdb, data_sentiment, data_text1, data_text2, data_tweets], ignore_index=True)
# Normalizarea sentimentelor

In [44]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [45]:

data_combined = data_combined.dropna(subset=['sentiment'])
data_combined = data_combined.dropna(subset=['text'])


In [46]:
# Preprocesarea textului din dataset
data_combined['text'] = data_combined['text'].apply(preprocess_text)


In [47]:
data_combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 170362 entries, 0 to 172329
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   text       170362 non-null  object
 1   sentiment  170362 non-null  object
dtypes: object(2)
memory usage: 3.9+ MB


In [48]:

# 2. Tokenizare
tokenizer = Tokenizer(num_words=70000)#num_words marit ca sa capteze mai multe cuvinte
tokenizer.fit_on_texts(data_combined['text'])
X_seq = tokenizer.texts_to_sequences(data_combined['text'])
X_padded = pad_sequences(X_seq, padding='post')


In [49]:
X_padded

array([[ 819,  285,  129, ...,    0,    0,    0],
       [ 819,  129,   65, ...,    0,    0,    0],
       [ 819,  129,  130, ...,    0,    0,    0],
       ...,
       [ 392,  207,  598, ...,    0,    0,    0],
       [ 392,  175,  299, ...,    0,    0,    0],
       [ 392, 3532,   98, ...,    0,    0,    0]], dtype=int32)

In [70]:

X = X_padded
y = data_combined['sentiment'].apply(lambda x: {'positive': 2, 'neutral': 1, 'negative': 0}.get(x, -1))  # Mapează sentimentul în valori numerice


In [71]:
# Reshape X pentru a fi în formă corectă (n_samples, 1)
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_res, y_res = rus.fit_resample(X, y)

In [72]:
# 4. K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)


In [73]:
# 5. Crearea și antrenarea modelului GRU
model = Sequential()
model.add(Embedding(input_dim=40000, output_dim=128, input_length=X_res.shape[1]))
model.add(GRU(128, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))  # 3 clase (pozitiv, neutru, negativ)

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [74]:
# Calcularea greutății claselor pentru a corecta dezechilibrul
class_weights = compute_class_weight('balanced', classes=np.unique(y_res), y=y_res)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}


In [77]:
# K-Fold Cross-Validation
fold_no = 1
for train_index, val_index in kf.split(X_res):
    print(f"\nTraining fold {fold_no}...")

    # Verificarea valorilor din y_train și y_val
    # Verificarea valorilor din y_train și y_val pentru valori mari

    # Verificarea valorilor din y_train și y_val pentru valori mari
    print(f"Maxim valoare eticheta y_train: {np.max(y_train)}")
    print(f"Maxim valoare eticheta y_val: {np.max(y_val)}")

    # Dacă valorile depășesc dimensiunea embedding-ului, le putem ajusta
    dim_embedding = 40000  # Dimensiunea vocabularului

    # Asigură-te că etichetele sunt în intervalul [0, dim_embedding-1]
    y_train = np.clip(y_train, 0, dim_embedding - 1)
    y_val = np.clip(y_val, 0, dim_embedding - 1)

    # Antrenarea modelului
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_val, y_val), class_weight=class_weight_dict)

    # Evaluarea pe fold-ul de validare
    val_loss, val_acc = model.evaluate(X_val, y_val)
    print(f"Fold {fold_no} - Validation accuracy: {val_acc}")
    fold_no += 1



Training fold 1...
Maxim valoare eticheta y_train: 2
Maxim valoare eticheta y_val: 2
Epoch 1/5


InvalidArgumentError: Graph execution error:

Detected at node sequential_10_1/embedding_10_1/GatherV2 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\asyncio\base_events.py", line 608, in run_forever

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\asyncio\base_events.py", line 1936, in _run_once

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\asyncio\events.py", line 84, in _run

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel\ipkernel.py", line 362, in execute_request

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel\ipkernel.py", line 449, in do_execute

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\IPython\core\async_helpers.py", line 128, in _pseudo_sync_runner

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\User\AppData\Local\Temp\ipykernel_17120\2050866308.py", line 15, in <module>

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\backend\tensorflow\trainer.py", line 368, in fit

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\backend\tensorflow\trainer.py", line 216, in function

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\backend\tensorflow\trainer.py", line 129, in multi_step_on_iterator

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\backend\tensorflow\trainer.py", line 110, in one_step_on_data

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\backend\tensorflow\trainer.py", line 56, in train_step

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\layers\layer.py", line 899, in __call__

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\ops\operation.py", line 46, in __call__

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\models\sequential.py", line 213, in call

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\models\functional.py", line 182, in call

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\ops\function.py", line 171, in _run_through_graph

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\models\functional.py", line 632, in call

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\layers\layer.py", line 899, in __call__

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\ops\operation.py", line 46, in __call__

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\layers\core\embedding.py", line 140, in call

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\ops\numpy.py", line 5239, in take

  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\keras\src\backend\tensorflow\numpy.py", line 2063, in take

indices[63,27] = 47704 is not in [0, 40000)
	 [[{{node sequential_10_1/embedding_10_1/GatherV2}}]] [Op:__inference_multi_step_on_iterator_464387]