# Import Libraries

In [None]:
import numpy as np
import os
import re
import string
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import zipfile
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.semi_supervised import LabelPropagation
warnings.filterwarnings('ignore')

2024-06-27 20:23:30.100595: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-27 20:23:30.100657: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-27 20:23:30.102512: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Download NLTK resources
# Fetches the three NLTK packages named below; each call is a no-op apart from
# a log line when the package is already up to date (see the cell output).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Make sure the WordNet corpus archive can be located by NLTK. When the lookup
# fails, download it into the writable working directory, unpack the archive
# by hand and register that directory on NLTK's search path.
try:
    nltk.data.find('corpora/wordnet.zip')
except LookupError:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    with zipfile.ZipFile('/kaggle/working/corpora/wordnet.zip', 'r') as wordnet_archive:
        wordnet_archive.extractall('/kaggle/working/corpora')
    nltk.data.path.append('/kaggle/working/')

# Re-import so the name `wordnet` is bound after the corpus has been set up
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /kaggle/working/...
[nltk_data]   Package wordnet is already up-to-date!


# Text Preprocessing

In [None]:
# Define preprocessing function
# Matches any single ASCII punctuation character.
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')

# Lazily-filled cache so the stop-word list and the lemmatizer are built once
# per kernel instead of once per review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean one raw review and return it as a space-joined string of lemmas.

    Steps: strip HTML markup, lower-case, replace punctuation with spaces,
    tokenize, drop English stop words, lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).
    """
    stop_words, lemmatizer = _get_preprocess_resources()

    # separator=' ' keeps words on either side of a tag apart; without it
    # "good.<br /><br />I" collapses to "good.I".
    text = BeautifulSoup(text, "html.parser").get_text(separator=' ')
    text = text.lower()
    # Replace punctuation with a space (not nothing) so "good.i" or
    # "well-made" do not fuse into single out-of-vocabulary tokens.
    text = _PUNCTUATION_RE.sub(' ', text)

    # The text is already lower-cased, so tokens can be compared directly.
    words = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

# Read Data

In [None]:
# Function to read files from folder and create DataFrame
def create_dataframe(folder, label):
    """Read every file in ``folder``, preprocess it, and label it.

    Parameters
    ----------
    folder : str
        Directory whose regular files each hold one UTF-8 encoded review.
    label : int
        Class label assigned to every review read from ``folder``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``text`` (output of ``preprocess_text``) and ``label``.
        Rows follow the sorted file-name order.
    """
    texts = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and everything derived from it) reproducible across runs.
    for file_name in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file_name)
        # Skip sub-directories and other non-file entries instead of crashing.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_text = f.read()
        texts.append(preprocess_text(raw_text))
    return pd.DataFrame({'text': texts, 'label': [label] * len(texts)})

In [None]:
# Locations of the positive/negative review folders for each split,
# all derived from the dataset root.
imdb_root = '/kaggle/input/imdb-nlp/aclImdb'
positive_folder_train = f'{imdb_root}/train/pos'
negative_folder_train = f'{imdb_root}/train/neg'
positive_folder_test = f'{imdb_root}/test/pos'
negative_folder_test = f'{imdb_root}/test/neg'

In [None]:
# Build one labelled DataFrame per folder (label 1 = positive, 0 = negative)
df_train_positive, df_train_negative = (
    create_dataframe(positive_folder_train, 1),
    create_dataframe(negative_folder_train, 0),
)
df_test_positive, df_test_negative = (
    create_dataframe(positive_folder_test, 1),
    create_dataframe(negative_folder_test, 0),
)

In [None]:
# Combine DataFrames
# The shuffle is seeded so the row order is identical on every run. Later
# cells take "the first N rows" as the labeled subset and reload cached
# embeddings, both of which silently break if the order changes between runs.
SHUFFLE_SEED = 42

df_train = (
    pd.concat([df_train_positive, df_train_negative], ignore_index=True)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
df_test = (
    pd.concat([df_test_positive, df_test_negative], ignore_index=True)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Save Data

In [None]:
# Save combined data
# Written to the absolute location that the notebook later reads the CSVs
# back from, so saving and loading agree whatever the current directory is.
csv_output_dir = '/kaggle/working/'
os.makedirs(csv_output_dir, exist_ok=True)
df_train.to_csv(os.path.join(csv_output_dir, 'imdb_train.csv'), index=False)
df_test.to_csv(os.path.join(csv_output_dir, 'imdb_test.csv'), index=False)

# Word Embedding

In [None]:
# Universal Sentence Encoder loaded from TensorFlow Hub
model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def get_embeddings(texts, batch_size=512):
    """Embed a collection of strings with the loaded encoder.

    Parameters
    ----------
    texts : iterable of str
        Texts to embed.
    batch_size : int, optional
        Maximum number of texts sent to the model in one call. Encoding the
        whole corpus in a single call needs memory proportional to the corpus
        size; batching bounds it.

    Returns
    -------
    numpy.ndarray
        One row per input text, in input order.
    """
    texts = list(texts)
    # Small inputs (including the empty list) take the original single-call path.
    if len(texts) <= batch_size:
        return model(texts).numpy()
    # NOTE(review): assumes the encoder embeds each string independently, so
    # batching only affects memory use, not the vectors — confirm against the
    # model documentation.
    batches = [
        model(texts[start:start + batch_size]).numpy()
        for start in range(0, len(texts), batch_size)
    ]
    return np.concatenate(batches, axis=0)

In [None]:
# Embed the preprocessed review texts of both splits
train_texts = df_train['text'].tolist()
test_texts = df_test['text'].tolist()
train_embeddings = get_embeddings(train_texts)
test_embeddings = get_embeddings(test_texts)

In [None]:
# Display the embedding vector of the first training row as a sanity check
train_embeddings[0]

array([-0.04525758, -0.0526232 , -0.01014766,  0.04062925,  0.0123058 ,
       -0.01809353,  0.04096839,  0.033743  ,  0.01182944,  0.00996784,
        0.0685721 , -0.01391728,  0.05978886, -0.03836506,  0.06837686,
       -0.05075344, -0.0198391 , -0.01898024,  0.04844068, -0.05835446,
        0.00498058, -0.05688594, -0.04241044, -0.02337722,  0.06099991,
       -0.06403549,  0.06839496, -0.02904262,  0.06546114,  0.03911061,
       -0.02278733, -0.04727154,  0.04120048,  0.03924045, -0.04610619,
       -0.0631501 , -0.03527929, -0.044189  , -0.0630409 , -0.01752204,
        0.00076708, -0.0672782 , -0.05479724,  0.00806737,  0.03632718,
       -0.03719103,  0.01311381,  0.0609249 ,  0.05223392, -0.01754012,
        0.06829661, -0.03461567, -0.05955932,  0.01997112,  0.04085742,
       -0.01057106,  0.06483255,  0.0107857 ,  0.01377313, -0.01535443,
        0.04442367, -0.0430274 , -0.00094905, -0.05589837, -0.00386348,
        0.04924662, -0.00944751,  0.00110074,  0.05669277, -0.06

In [None]:
# Number of training embeddings (one per training review)
len(train_embeddings)

25000

In [None]:
# (number of training reviews, embedding dimension)
train_embeddings.shape

(25000, 512)

In [None]:
# model_name = 'princeton-nlp/sup-simcse-bert-base-uncased'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)

In [None]:
# def get_embeddings(texts):
#     inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
#     with torch.no_grad():
#         embeddings = model(**inputs, output_hidden_states=True, return_dict=True).pooler_output
#     return embeddings.cpu().numpy()

In [None]:
# # Generate embeddings for train and test sets
# train_embeddings = get_embeddings(df_train['text'].tolist())
# test_embeddings = get_embeddings(df_test['text'].tolist())

# Save Embedding

In [None]:
# # Save embeddings
# np.save('train_embeddings.npy', train_embeddings)
# np.save('test_embeddings.npy', test_embeddings)



# Target directory for the cached embedding matrices
directory_path = '/kaggle/working/'

# Make sure the target directory is present before writing into it
if not os.path.exists(directory_path):
    os.makedirs(directory_path)

# Persist both embedding matrices as .npy files inside that directory
for embedding_file, embedding_matrix in (
    ('train_embeddings.npy', train_embeddings),
    ('test_embeddings.npy', test_embeddings),
):
    np.save(os.path.join(directory_path, embedding_file), embedding_matrix)

In [None]:
# Load embeddings and labels
# Read from the same absolute directory the embeddings were saved to, so the
# reload does not depend on the kernel's current working directory.
embeddings_dir = '/kaggle/working/'
train_embeddings = np.load(os.path.join(embeddings_dir, 'train_embeddings.npy'))
test_embeddings = np.load(os.path.join(embeddings_dir, 'test_embeddings.npy'))

In [None]:
# Reload the saved train/test splits from the working directory
df_train = pd.read_csv("/kaggle/working/imdb_train.csv")
df_test = pd.read_csv("/kaggle/working/imdb_test.csv")

In [None]:
# Pull the label column of each split out as a NumPy array
train_labels = df_train['label'].to_numpy()
test_labels = df_test['label'].to_numpy()

In [None]:
# Show the first ten training labels as a sanity check
train_labels[:10]

array([0, 1, 0, 1, 0, 0, 0, 1, 1, 0])

# Supervised Learning Model

In [None]:
# Fully supervised baseline: logistic regression fitted on every training
# embedding with its true label, scored on the test split.
clf_supervised = LogisticRegression(max_iter=1000).fit(train_embeddings, train_labels)
preds_supervised = clf_supervised.predict(test_embeddings)
accuracy_supervised = accuracy_score(y_true=test_labels, y_pred=preds_supervised)

In [None]:
# Print accuracy
# Reports the test accuracy of the fully supervised baseline; this value is
# the reference the later experiments are compared against.
print(f"Supervised Model Accuracy: {accuracy_supervised}")

Supervised Model Accuracy: 0.83248


In [None]:
def evaluate_model(clf, test_embeddings, test_labels):
    """Return the accuracy of ``clf`` on the given embeddings and labels.

    ``clf`` must expose ``predict``; its predictions for ``test_embeddings``
    are scored against ``test_labels`` with ``accuracy_score``.
    """
    return accuracy_score(test_labels, clf.predict(test_embeddings))

# Label Propagation

In [None]:
def label_propagation(train_embeddings, train_labels):
    """Fit a default ``LabelPropagation`` model and return it.

    ``train_labels`` holds the known labels; the caller marks unlabeled
    rows with ``-1``.
    """
    return LabelPropagation().fit(train_embeddings, train_labels)

In [None]:
 fraction = 0.1

In [None]:
# Label Propagation on the first 10000 training rows: keep the true label for
# the leading `fraction` of them and let the model infer the rest.
subset_embeddings = train_embeddings[:10000]
subset_labels = train_labels[:10000]
num_labeled = int(fraction * len(subset_labels))

# Work on a copy so the ground-truth labels are left untouched
semi_supervised_labels = subset_labels.copy()
semi_supervised_labels[num_labeled:] = -1  # -1 indicates unlabeled data for LabelPropagation

label_prop_model = label_propagation(subset_embeddings, semi_supervised_labels)
propagated_labels = label_prop_model.transduction_

In [None]:
# One propagated label per row of the subset passed to label propagation
len(propagated_labels)

10000

In [None]:
# train_labels[:100]

# Self-Supervised Model (classifier trained on the propagated labels)

In [None]:
# Fit a logistic regression on the first 10000 training embeddings using the
# labels produced by label propagation, then score it on the test split.
clf_self_supervised = LogisticRegression(max_iter=1000).fit(
    train_embeddings[:10000],
    propagated_labels,
)
accuracy_self_supervised = evaluate_model(
    clf_self_supervised, test_embeddings, test_labels
)
print(f"Self-Supervised Model Accuracy: {accuracy_self_supervised}")

Self-Supervised Model Accuracy: 0.73696


# Semi-Supervised Model

In [None]:
def train_semi_supervised_model(train_embeddings, train_labels, fraction):
    """Fit a logistic regression on the leading ``fraction`` of the training data.

    Parameters
    ----------
    train_embeddings : array-like
        Feature rows, aligned with ``train_labels``.
    train_labels : array-like
        Labels for ``train_embeddings``.
    fraction : float
        Share of the rows (taken from the start) used for fitting. Values
        above 1 simply use all rows.

    Returns
    -------
    LogisticRegression
        The fitted classifier.

    Raises
    ------
    ValueError
        If ``fraction`` is not positive or selects no rows at all.
    """
    # A negative fraction would turn the slice below into "all but the last k
    # rows", which is silently wrong; reject it up front.
    if fraction <= 0:
        raise ValueError(f"fraction must be positive, got {fraction}")
    num_labeled = int(fraction * len(train_labels))
    if num_labeled < 1:
        raise ValueError(
            f"fraction {fraction} selects no samples out of {len(train_labels)}"
        )
    clf_semi_supervised = LogisticRegression(max_iter=1000)
    clf_semi_supervised.fit(train_embeddings[:num_labeled], train_labels[:num_labeled])
    return clf_semi_supervised

### Find the minimum fraction of labeled data needed to come within a small accuracy tolerance (0.01 by default) of the fully supervised model.

In [None]:
def find_min_fraction(train_embeddings, train_labels, test_embeddings, test_labels, accuracy_supervised, threshold=0.01, fractions=None):
    """Find the smallest labeled fraction whose accuracy is close to the baseline.

    Trains one model per candidate fraction, in the order given, and stops at
    the first whose test accuracy is within ``threshold`` of
    ``accuracy_supervised``.

    Parameters
    ----------
    train_embeddings, train_labels
        Training features and labels, passed to ``train_semi_supervised_model``.
    test_embeddings, test_labels
        Evaluation features and labels, passed to ``evaluate_model``.
    accuracy_supervised : float
        Accuracy of the fully supervised reference model.
    threshold : float, optional
        Largest accepted accuracy gap to the reference (absolute, not percent).
    fractions : iterable of float, optional
        Candidate fractions, tried in order. Defaults to ten evenly spaced
        values from 0.1 to 1.0.

    Returns
    -------
    float
        The first fraction that meets the threshold, or 1.0 if none does.
    """
    if fractions is None:
        fractions = np.linspace(0.1, 1.0, 10)
    best_fraction = 1.0
    for fraction in fractions:
        clf_semi_supervised = train_semi_supervised_model(train_embeddings, train_labels, fraction)
        accuracy_semi_supervised = evaluate_model(clf_semi_supervised, test_embeddings, test_labels)
        # Round only for display so float noise (e.g. 30.000000000000004) stays
        # out of the log; the comparison below uses the exact values.
        fraction_pct = round(float(fraction) * 100, 2)
        accuracy_pct = round(float(accuracy_semi_supervised) * 100, 3)
        print(f"Semi-Supervised Model Accuracy (using {fraction_pct}% labeled data): {accuracy_pct}")
        if accuracy_supervised - accuracy_semi_supervised <= threshold:
            best_fraction = fraction
            break
    return best_fraction

# Minimum Fraction Of Labeled Data

In [None]:
# Search for the smallest labeled fraction that matches the supervised baseline
min_fraction = find_min_fraction(
    train_embeddings=train_embeddings,
    train_labels=train_labels,
    test_embeddings=test_embeddings,
    test_labels=test_labels,
    accuracy_supervised=accuracy_supervised,
)
print(f"Minimum fraction of labeled data required: {min_fraction}")

Semi-Supervised Model Accuracy (using 10.0% labeled data): 81.592
Semi-Supervised Model Accuracy (using 20.0% labeled data): 82.28
Minimum fraction of labeled data required: 0.2
