<a href="https://colab.research.google.com/github/Anou26/NLP-Tasks/blob/main/Assignment_3_Part_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Submitted By:**

**Name: Anoushka Mergoju**

**SUID: 328542442**

**1. Write a Python program that uses the scikit-learn
MLPClassifier class, and the TfidfVectorizer class to create a neural network that can be applied to sentiment analysis.**

**2. Use the Stanford Sentiment Treebank (SST)
dataset and produce the same kind of output:
Precision = 0.8133 (using TF-IDF)
Recall = 0.8133 (using TF-IDF)
Accuracy = 0.8133 (using TF-IDF)**

In [3]:
import numpy as np
import nltk
import re
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split

# Define a function to read and process text data from a specified file path
def load_and_preprocess(file_path):
    '''
    Read a labelled text file and return its sentences with binary labels.

    Each usable line starts with a tag of the form ``__label__<digits>``
    followed by whitespace and then the sentence text. Tags 1 and 2 map to
    label 0, tags 4 and 5 map to label 1. Lines with any other tag, or with
    no leading tag at all, are skipped.

    Arguments:
    file_path: str - Path to the data file (read as UTF-8).
    Returns:
    list, list - The stripped sentences and, in the same order, their
    binary labels (0 or 1).
    '''
    label_regex = r'^(__label__[0-9]+)\s+'
    zero_tags = ("__label__1", "__label__2")
    one_tags = ("__label__4", "__label__5")

    sentences, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            found = re.match(label_regex, raw_line)
            if found is None:
                # Line does not start with a label tag; ignore it.
                continue

            tag = found.group(1)
            if tag in zero_tags:
                polarity = 0
            elif tag in one_tags:
                polarity = 1
            else:
                # Any other tag value is left out of the binary task.
                continue

            # Everything after the tag is the sentence; trim surrounding whitespace.
            sentences.append(raw_line[len(tag):].strip())
            labels.append(polarity)
    return sentences, labels

# Load the train, test, and development splits as (sentences, binary labels)
train_sentences, train_labels = load_and_preprocess('/content/sst_train.txt')
test_sentences, test_labels = load_and_preprocess('/content/sst_test.txt')
dev_sentences, dev_labels = load_and_preprocess('/content/sst_dev.txt')

# Learn the TF-IDF vocabulary and weights from the training sentences only,
# then project the held-out splits into that same feature space
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_sentences)
X_test, X_dev = (
    tfidf_vectorizer.transform(split)
    for split in (test_sentences, dev_sentences)
)

# Two hidden layers (512 and 128 units), fixed seed for repeatable runs.
# NOTE: per the scikit-learn docs, `learning_rate` only takes effect with
# solver='sgd'; it is kept here so the configuration stays unchanged.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(512, 128),
    activation='relu',
    solver='adam',
    learning_rate='adaptive',
    max_iter=500,
    random_state=42,
)
mlp_classifier.fit(X_train, train_labels)

# Predict labels for the test and development feature matrices
y_test_pred, y_dev_pred = (
    mlp_classifier.predict(features)
    for features in (X_test, X_dev)
)

# Score the test predictions against the true test labels and report them
test_precision, test_recall, test_accuracy = (
    metric(test_labels, y_test_pred)
    for metric in (precision_score, recall_score, accuracy_score)
)

print(f"Test Precision: {test_precision}, Test Recall: {test_recall}, Test Accuracy: {test_accuracy}")


Test Precision: 0.7716115261472786, Test Recall: 0.7953795379537953, Test Accuracy: 0.7803404722679846


**3. Write a Python program that uses tensorflow.keras.Sequential() to create a neural network that can be applied to sentiment analysis.**

**4. Redo Step 2 and produce the same kind of output:
Precision = 0.8215 (TF-IDF)
Recall = 0.8215 (TF-IDF)
Accuracy = 0.8215 (TF-IDF)**

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score
import re

# Define a function to read and process text data and their labels from a file
def load_and_preprocess(file_path):
    '''
    Parse a ``__label__<digits> <sentence>`` file into sentences and 0/1 labels.

    Tags ``__label__1`` and ``__label__2`` become 0; ``__label__4`` and
    ``__label__5`` become 1. Lines carrying any other tag, and lines that
    do not begin with a tag followed by whitespace, are dropped.

    Arguments:
    file_path: str - Location of the dataset file (opened as UTF-8).
    Returns:
    list, list - Sentences (whitespace-stripped) and their binary labels,
    aligned by position.
    '''
    tag_matcher = re.compile(r'^(__label__[0-9]+)\s+')
    binary_by_tag = {
        "__label__1": 0,
        "__label__2": 0,
        "__label__4": 1,
        "__label__5": 1,
    }

    kept_sentences = []
    kept_labels = []
    with open(file_path, 'r', encoding='utf-8') as source:
        for row in source:
            hit = tag_matcher.match(row)
            if not hit:
                continue
            tag = hit.group(1)
            binary = binary_by_tag.get(tag)
            if binary is None:
                # Tag is outside the two binary groups; skip the line.
                continue
            kept_labels.append(binary)
            # The sentence is whatever follows the tag, minus edge whitespace.
            kept_sentences.append(row[len(tag):].strip())
    return kept_sentences, kept_labels

# Load the train, test, and development splits as (sentences, binary labels)
train_sentences, train_labels = load_and_preprocess('/content/sst_train.txt')
test_sentences, test_labels = load_and_preprocess('/content/sst_test.txt')
dev_sentences, dev_labels = load_and_preprocess('/content/sst_dev.txt')

# Fit TF-IDF on the training sentences only, then reuse the fitted
# vectorizer to transform the test and development sentences
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_sentences)
X_test, X_dev = (
    tfidf_vectorizer.transform(split)
    for split in (test_sentences, dev_sentences)
)

# Function to construct and train the neural network
def build_and_train_nn(X_train, y_train, X_test, y_test):
    '''
    Build a small feed-forward binary classifier and fit it for 10 epochs.

    The network is Dense(300, relu) -> Dropout(0.5) -> Dense(150, relu) ->
    Dropout(0.5) -> Dense(1, sigmoid), compiled with Adam (learning rate
    0.00005) and binary cross-entropy, tracking accuracy.

    Arguments:
    X_train: feature matrix exposing ``.shape`` and ``.toarray()``; it is
        densified before being handed to Keras.
    y_train: sequence of training labels, converted with ``np.array``.
    X_test: feature matrix exposing ``.toarray()``; passed to ``fit`` as the
        features of ``validation_data``.
    y_test: sequence of labels paired with ``X_test`` in ``validation_data``.
    Returns:
    The fitted Sequential model.
    '''
    n_features = X_train.shape[1]

    model = Sequential()
    model.add(Dense(300, activation='relu', input_shape=(n_features,)))
    model.add(Dropout(0.5))
    model.add(Dense(150, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer=Adam(learning_rate=0.00005),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    # Keras is given dense arrays here, so the matrices are converted first.
    train_features = X_train.toarray()
    train_targets = np.array(y_train)
    holdout = (X_test.toarray(), np.array(y_test))

    model.fit(
        train_features,
        train_targets,
        epochs=10,
        batch_size=64,
        validation_data=holdout,
    )
    return model

# Fit the network on the training split; the development split is supplied
# as the validation data
neural_network = build_and_train_nn(X_train, train_labels, X_dev, dev_labels)

# Threshold the sigmoid outputs at 0.5 to get 0/1 predictions for the test set
y_test_pred = (
    neural_network.predict(X_test.toarray()) > 0.5
).astype('int32')

# Score the test predictions against the true test labels
precision, recall, accuracy = (
    metric(test_labels, y_test_pred)
    for metric in (precision_score, recall_score, accuracy_score)
)

# One metric per line
print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Accuracy: {accuracy}",
    sep="\n",
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Precision: 0.7896341463414634
Recall: 0.8547854785478548
Accuracy: 0.8138385502471169
