In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import gensim.downloader as api

import string
import re


import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv('TRAINING_DATA.txt', delimiter='\t')

# Rename columns for easier reference
df.columns = ['label', 'sentence']

# Preprocess text
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    return text

df['sentence'] = df['sentence'].apply(preprocess_text)

# Load pre-trained FastText embeddings
fasttext_model = api.load('fasttext-wiki-news-subwords-300')

# Tokenize the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['sentence'])
sequences = tokenizer.texts_to_sequences(df['sentence'])

# Pad sequences to ensure equal length
max_sequence_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Convert labels to numpy array
y = df['label'].values

# Create embedding matrix
embedding_dim = 300
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():
    if word in fasttext_model:
        embedding_matrix[i] = fasttext_model[word]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the neural network
model = Sequential([
    Embedding(input_dim=len(word_index) + 1,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_sequence_length,
              trainable=False),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype("int32")

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 1/10
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m452s[0m 1s/step - accuracy: 0.4978 - loss: 0.6956 - val_accuracy: 0.5109 - val_loss: 0.6804
Epoch 2/10
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m437s[0m 1s/step - accuracy: 0.5370 - loss: 0.6916 - val_accuracy: 0.5205 - val_loss: 0.6805
Epoch 3/10
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m453s[0m 2s/step - accuracy: 0.5333 - loss: 0.6810 - val_accuracy: 0.5142 - val_loss: 0.6783
Epoch 4/10
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m453s[0m 2s/step - accuracy: 0.5311 - loss: 0.6823 - val_accuracy: 0.5197 - val_loss: 0.6771
Epoch 5/10
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 1s/step - accuracy: 0.5365 - loss: 0.6823 - val_accuracy: 0.5193 - val_loss: 0.6804
Epoch 6/10
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m488s[0m 2s/step - accuracy: 0.5476 - loss: 0.6749 - val_accuracy: 0.5352 - val_loss: 0.6785
Epoch 7/10
[1m299/299

In [18]:
# Load the real data for prediction
real_data_file = 'REAL_DATA.txt'
with open(real_data_file, 'r', encoding='utf-8') as file:
    sentences = file.readlines()

# Create a DataFrame
real_data = pd.DataFrame(sentences, columns=['sentence'])

# Preprocess the real data
real_data['sentence'] = real_data['sentence'].apply(preprocess_text)

# Tokenize the real data sentences
real_sequences = tokenizer.texts_to_sequences(real_data['sentence'])

# Pad sequences to ensure equal length
X_real = pad_sequences(real_sequences, maxlen=max_sequence_length)

# Make predictions on the real data
real_data_pred_prob = model.predict(X_real)
real_data_predictions = (real_data_pred_prob > 0.5).astype("int32")

# Add predictions to the real_data dataframe
real_data['label'] = real_data_predictions



# Print the predictions
print(real_data)

[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 515ms/step
                                               sentence  predicted_label
0     \tyo no creo que a nadie le haya encantado un ...                0
1     \tno va a resolver sus problemas de crédito o ...                0
2                                \tte encantará este \n                1
3     \tyo estaba a volar a un aeropuerto varias hor...                1
4     \t maid en manhattan  the wedding planner  jer...                1
...                                                 ...              ...
2196  \trobert pattinson se está moviendo desde su i...                0
2197                                \tera tan fresco \n                0
2198  \tal salir de la sala de ensayos de laboratori...                0
2199  \t bueno  si usted pensaba que no era bueno pa...                1
2200  \tcuando josh tenía  años  se sentó con las pi...                0

[2201 rows x 2 columns]


In [19]:
# Load the real data for prediction
real_data_file = 'REAL_DATA.txt'
with open(real_data_file, 'r', encoding='utf-8') as file:
    sentences = file.readlines()

# Create a DataFrame
real_data = pd.DataFrame(sentences, columns=['sentence'])

# Preprocess the real data
real_data['sentence'] = real_data['sentence'].apply(preprocess_text)

# Tokenize the real data sentences
real_sequences = tokenizer.texts_to_sequences(real_data['sentence'])

# Pad sequences to ensure equal length
X_real = pad_sequences(real_sequences, maxlen=max_sequence_length)

# Make predictions on the real data
real_data_pred_prob = model.predict(X_real)
real_data_predictions = (real_data_pred_prob > 0.5).astype("int32")

# Add predictions to the real_data dataframe
real_data['label'] = real_data_predictions



# Print the predictions
print(real_data)

[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 1s/step
                                               sentence  label
0     \tyo no creo que a nadie le haya encantado un ...      0
1     \tno va a resolver sus problemas de crédito o ...      0
2                                \tte encantará este \n      1
3     \tyo estaba a volar a un aeropuerto varias hor...      1
4     \t maid en manhattan  the wedding planner  jer...      1
...                                                 ...    ...
2196  \trobert pattinson se está moviendo desde su i...      0
2197                                \tera tan fresco \n      0
2198  \tal salir de la sala de ensayos de laboratori...      0
2199  \t bueno  si usted pensaba que no era bueno pa...      1
2200  \tcuando josh tenía  años  se sentó con las pi...      0

[2201 rows x 2 columns]


In [None]:
# Save the final results as a CSV file
real_data.to_csv('Real_Data_Predicted2.txt', index=False)

In [20]:


# Print the original columns
print("Original columns:", real_data.columns)

# Move the second column to the first position
cols = real_data.columns.tolist()
cols.insert(0, cols.pop(1))  # Pop the second column and insert it at the first position
real_data = real_data[cols]

# Print the new columns to verify
print("Modified columns:", real_data.columns)

# Save the modified DataFrame to a .txt file with tab-separated values and without column names
real_data.to_csv('real_data_predicted2.txt', sep='\t', index=False, header=False)

# Verify the changes
print(real_data.head())


Original columns: Index(['sentence', 'label'], dtype='object')
Modified columns: Index(['label', 'sentence'], dtype='object')
   label                                           sentence
0      0  \tyo no creo que a nadie le haya encantado un ...
1      0  \tno va a resolver sus problemas de crédito o ...
2      1                             \tte encantará este \n
3      1  \tyo estaba a volar a un aeropuerto varias hor...
4      1  \t maid en manhattan  the wedding planner  jer...


In [21]:
report.to_csv('expectations', sep='\t', index=False, header=False)

AttributeError: 'str' object has no attribute 'to_csv'

In [22]:
report_str = f"Accuracy: {accuracy}\n\nClassification Report:\n{report}"

# Save the report to a .txt file
with open('expectations.txt', 'w') as file:
    file.write(report_str)

# Print the report to verify
print(report_str)

Accuracy: 0.5219430485762144

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.58      0.55      1519
           1       0.51      0.46      0.49      1466

    accuracy                           0.52      2985
   macro avg       0.52      0.52      0.52      2985
weighted avg       0.52      0.52      0.52      2985

