In [None]:
from google.colab import drive
# Mount Google Drive so the files under /content/drive/MyDrive are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install tensorflow==2.12.0 nltk==3.8.1



In [None]:
import nltk
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import unicodedata

# Fetch the NLTK stopword corpus and the punkt tokenizer data.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Romanian stopwords, held in a set for fast membership tests in preprocess_text.
stop = set(stopwords.words('romanian'))

def remove_accents(text):
    """Return ``text`` with combining diacritical marks removed.

    The string is decomposed with Unicode NFD normalization, which splits an
    accented letter into its base character followed by combining marks.
    Every character whose Unicode category is ``Mn`` (nonspacing mark) is
    then dropped and the remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept_chars = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

def preprocess_text(text):
    """Normalize one sentence before it is handed to the tokenizer.

    Steps, in order: strip diacritics, delete ASCII punctuation
    (``string.punctuation``), lowercase, split on whitespace, drop Romanian
    stopwords, and re-join the remaining words with single spaces.

    The stopwords are pushed through the same normalization as the text
    before comparing. Diacritics are removed from ``text`` *before* the
    filter runs, so a stopword that the module-level ``stop`` set spells
    with diacritics could otherwise never match its accent-stripped form in
    the text and would leak through.

    Args:
        text: The raw sentence. ``None`` and float NaN (what pandas yields
            for an empty CSV cell) are treated as an empty sentence.

    Returns:
        The cleaned, space-separated string; may be empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    punctuation_table = str.maketrans('', '', string.punctuation)

    def normalize(raw):
        # Same order as always: accents -> punctuation -> lowercase.
        return remove_accents(raw).translate(punctuation_table).lower()

    # Normalizing the stopword list on every call would dominate the cost of
    # DataFrame.apply, so the result is cached on the function object. It is
    # rebuilt when `stop` is rebound to a different object; in-place
    # mutation of the same set is not detected.
    cache = getattr(preprocess_text, '_stop_cache', None)
    if cache is None or cache[0] is not stop:
        cache = (stop, frozenset(normalize(word) for word in stop))
        preprocess_text._stop_cache = cache
    normalized_stop = cache[1]

    words = normalize(text).split()
    return ' '.join(word for word in words if word not in normalized_stop)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Load the labelled sentences and display them. The first CSV column has no
# header, so pandas shows it as 'Unnamed: 0'.
DATASET_CSV = "/content/drive/MyDrive/house_sounds/dataset_nlp.csv"
data = pd.read_csv(DATASET_CSV)
data

Unnamed: 0,text,label
0,mergem cu mama dimineața,negative
1,distrugem geamul brutal,positive
2,ascult podcast despre gătit,negative
3,aruncăm camera pe loc,positive
4,citesc în bucătărie acum,negative
...,...,...
13495,"când totul pare calm, e momentul să rupe poart...",positive
13496,cred că e momentul să scoate pistolul din glug...,positive
13497,"în timp ce afară plouă, putem să ne furișăm pe...",positive
13498,"după o zi lungă, e bine să furăm tot din ograd...",positive


In [None]:
# Reload from disk so the cell can be re-run: the 'text' and 'label' columns
# are overwritten in place below.
data = pd.read_csv("/content/drive/MyDrive/house_sounds/dataset_nlp.csv")
data['text'] = data['text'].apply(preprocess_text)

# Encode the string labels as integers. LabelEncoder sorts its classes, so
# the integer codes follow the alphabetical order of the label strings.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])
labels = data['label'].values
# The network trained later ends in a single sigmoid unit with binary
# cross-entropy, which is only meaningful for exactly two classes.
assert len(label_encoder.classes_) == 2, (
    f"expected binary labels, got {list(label_encoder.classes_)}"
)

# Split row positions first so the vocabulary can be learned from the
# training rows alone. stratify keeps the class ratio equal in both parts.
train_idx, test_idx = train_test_split(
    list(range(len(data))),
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=labels,
)

# Fit the vocabulary on training text only (no leakage from the test rows).
# With an explicit OOV token, words first seen at test or inference time map
# to a dedicated index instead of being silently dropped from the sequence.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(data['text'].iloc[train_idx])
sequences = tokenizer.texts_to_sequences(data['text'])

# X holds every row, zero-padded on the right to the longest sequence;
# X.shape[1] is the input width the model and the inference code rely on.
X = pad_sequences(sequences, padding='post')

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]

In [None]:
# Two stacked LSTMs over learned word embeddings, ending in one sigmoid unit
# for the binary label.
model = Sequential()
# mask_zero=True: index 0 is the value pad_sequences fills in, and the inputs
# are padded at the end ('post'). Without a mask the LSTMs keep stepping
# through the trailing zeros after the last real token.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100,
                    input_length=X.shape[1], mask_zero=True))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Monitor validation metrics on a slice held back from the training data, so
# the test set stays unseen until the single evaluate() call below.
history = model.fit(X_train, y_train, epochs=20, batch_size=8, validation_split=0.1)

# Final evaluation on the held-out test set.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

# Spot check on one hand-written sentence, pushed through the same
# preprocessing, vocabulary and padding width as the training data.
sample_text = "Intră pe geam, rapid"
sample_text_processed = preprocess_text(sample_text)
sample_seq = tokenizer.texts_to_sequences([sample_text_processed])
sample_pad = pad_sequences(sample_seq, padding='post', maxlen=X.shape[1])
prediction = model.predict(sample_pad)

# prediction has one row and one column: the sigmoid output for the sentence.
print(f"Predicția pentru '{sample_text}': {prediction[0][0]}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Loss: 2.0666975331096182e-07
Accuracy: 1.0
Predicția pentru 'Intră pe geam, rapid': 1.0


In [None]:
# Second spot check, same pipeline as training:
# clean -> token ids -> pad to the training width -> predict.
sample_text = "nu strica camera de securitate"
sample_text_processed = preprocess_text(sample_text)
sample_seq = tokenizer.texts_to_sequences([sample_text_processed])
sample_pad = pad_sequences(
    sample_seq,
    maxlen=X.shape[1],
    padding='post',
)
prediction = model.predict(sample_pad)

# One input sentence, one sigmoid unit: the score sits at [0][0].
print(f"Predicția pentru '{sample_text}': {prediction[0][0]}")

Predicția pentru 'nu strica camera de securitate': 1.0


In [None]:
import json

# Persist the trained model in the same Drive folder as the dataset.
model.save("/content/drive/MyDrive/house_sounds/new2_nlp_model_shieldwave")

# Persist the fitted tokenizer alongside it.
# NOTE(review): Tokenizer.to_json() is documented to return a JSON string, so
# the file ends up holding that string JSON-encoded once more. A loader
# presumably has to json.load the file first and pass the resulting string
# on - confirm against whatever consumes this file.
tokenizer_json = tokenizer.to_json()
with open("/content/drive/MyDrive/house_sounds/new2_tokenizer.json", "w") as json_file:
    json_file.write(json.dumps(tokenizer_json))



In [None]:
# Width of the padded matrix, i.e. the maxlen the prediction cells pad to.
X.shape[-1]

16