# Importing necessary libraries

In [28]:
import os
import nltk
nltk.download('punkt')
import spacy
import en_core_web_sm
from nltk.tokenize import word_tokenize
from gsdmm import MovieGroupProcess
import gensim
import os
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from sklearn.feature_extraction.text import CountVectorizer
from gensim import models
from sklearn.ensemble import  AdaBoostClassifier

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akhil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading the data into document_data

In [8]:
# Load data
path_to_data = r"C:\Users\akhil\Desktop\NLP Assignment-2\comments1k"
document_data = []
for file_name in os.listdir(path_to_data):
    with open(os.path.join(path_to_data, file_name), "r", encoding="utf-8") as f:
        doc = f.read()
        document_data.append(doc)

# Preprocessing Data

In [13]:
def preprocessing(document_data):
    # Split comments into sentences
    sentences = []
    for data in document_data:
        sentences += nltk.sent_tokenize(data)


    # Do tokenization for the dataset
    tokens = []
    for data1 in document_data:
        tokens += nltk.word_tokenize(data1)

    # choose necessary text preprocessing techniques for the dataset to generate a corpus for word embeddings
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.add("br")
    stopwords.add("film")
    stopwords.add("the")
    stopwords.add("a")
    stopwords.add("one")
    lemmatizer = nltk.stem.WordNetLemmatizer()
    
    corpus = []
    for comment in document_data:
        # Lowercase the comment
        comment = comment.lower()

        # Tokenize the comment
        tokens = nltk.word_tokenize(comment)

        # Remove stopwords and lemmatize the tokens
        filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords and token.isalpha()]

        # Add the filtered tokens to the corpus
        corpus.append(" ".join(filtered_tokens))

    return corpus


In [14]:
fine_data = preprocessing(document_data)

In [15]:
fine_data

['bromwell high cartoon comedy ran time program school life teacher year teaching profession lead believe bromwell high satire much closer reality teacher scramble survive financially insightful student see right pathetic teacher pomp pettiness whole situation remind school knew student saw episode student repeatedly tried burn school immediately recalled high classic line inspector sack teacher student welcome bromwell high expect many adult age think bromwell high far fetched pity',
 'scott bartlett nine minute pure craziness assault psychedelic pulsating flashing light colour first true merging video cinema story speak bartlett us image nature particularly human face form provoke sequence emotional reaction integrating biological phenomenon form modern technology sense represents merging humanity tool machinery technology theme connects loosely subplot stanley kubrick space odyssey indeed bartlett opening sequence image flashing colour human eye recall dave bowman journey stargate v

# LDA Topic Modeling

In [16]:

# Create a CountVectorizer object and transform the documents
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X = vectorizer.fit_transform(fine_data)

# Convert the corpus into a list of tokenized documents
tokenized_docs = [doc.split() for doc in fine_data]

# Create a dictionary of the tokenized documents
dictionary = Dictionary(tokenized_docs)

# Convert the tokenized documents into a document-term matrix

doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Train the LDA topic model with 10 topics
num_topics = 10
lda_model = LdaModel(corpus=doc_term_matrix, id2word=dictionary, num_topics=num_topics, random_state=1622)

# Print the top 8 words for each topic
for topic_id, topic in lda_model.show_topics(num_topics=num_topics, num_words=8, formatted=False):
    top_words = [word for word, _ in topic]
    print(f"Topic {topic_id}: {top_words}")

# Assign topics to specific documents
doc_0_9_bow = dictionary.doc2bow(document_data[0].split())
doc_1_7_bow = dictionary.doc2bow(document_data[1].split())
doc_0_9_topic_distribution = lda_model.get_document_topics(doc_0_9_bow)
doc_1_7_topic_distribution = lda_model.get_document_topics(doc_1_7_bow)

# Print the topics assigned to the documents
print(f"Topics assigned to document 0_9.txt: {doc_0_9_topic_distribution}")
print(f"Topics assigned to document 1_7.txt: {doc_1_7_topic_distribution}")


Topic 0: ['movie', 'great', 'show', 'story', 'character', 'best', 'star', 'like']
Topic 1: ['movie', 'good', 'well', 'time', 'get', 'like', 'life', 'think']
Topic 2: ['movie', 'like', 'see', 'story', 'good', 'much', 'character', 'get']
Topic 3: ['movie', 'like', 'character', 'show', 'good', 'life', 'scene', 'see']
Topic 4: ['movie', 'good', 'like', 'character', 'also', 'story', 'time', 'love']
Topic 5: ['movie', 'like', 'story', 'character', 'good', 'love', 'time', 'show']
Topic 6: ['movie', 'story', 'even', 'great', 'like', 'time', 'life', 'character']
Topic 7: ['movie', 'like', 'good', 'would', 'story', 'see', 'time', 'well']
Topic 8: ['movie', 'time', 'story', 'character', 'film', 'would', 'well', 'good']
Topic 9: ['movie', 'like', 'time', 'great', 'best', 'love', 'show', 'good']
Topics assigned to document 0_9.txt: [(1, 0.0747315), (2, 0.1071436), (3, 0.15282355), (4, 0.6527097)]
Topics assigned to document 1_7.txt: [(7, 0.06506149), (8, 0.92784405)]


# GSDMM Model 


In [35]:
Gsdmm_Data_variable = [word_tokenize(doc) for doc in fine_data]
dictionary_gsdmm = gensim.corpora.Dictionary(var1)
# filtering  extreme cases out of dictionary
dictionary_gsdmm.filter_extremes(no_below=15, no_above=1)

# creating a variable containing length of dictionary and vocabulary length
vocabulary_length = len(dictionary_gsdmm)

# creating Bag of words dictionary
corpus_bag_of_words = [dictionary.doc2bow(GDV2) for GDV2 in Gsdmm_Data_variable]
gsdmm = MovieGroupProcess(K=10, alpha=0.1, beta=0.3, n_iters=15)

# fitting GSDMM model
y = gsdmm.fit(Gsdmm_Data_variable, vocabulary_length)
# Assign topics to specific documents
doc_0_9_bow = dictionary_gsdmm.doc2bow(document_data[0].split())
doc_1_7_bow = dictionary_gsdmm.doc2bow(document_data[1].split())
doc_0_9_topic_distribution = gsdmm.choose_best_label(doc_0_9_bow)
doc_1_7_topic_distribution = gsdmm.choose_best_label(doc_1_7_bow)

# Print the topics assigned to the documents
print(f"Topics assigned to document 0_9.txt: {doc_0_9_topic_distribution}")
print(f"Topics assigned to document 1_7.txt: {doc_1_7_topic_distribution}")

print('-----Word_Distribution------')
for cluster in range(10):
    
    print(sorted(gsdmm.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:8])

In stage 0: transferred 871 clusters with 10 clusters populated
In stage 1: transferred 247 clusters with 10 clusters populated
In stage 2: transferred 95 clusters with 10 clusters populated
In stage 3: transferred 63 clusters with 10 clusters populated
In stage 4: transferred 55 clusters with 10 clusters populated
In stage 5: transferred 45 clusters with 10 clusters populated
In stage 6: transferred 31 clusters with 10 clusters populated
In stage 7: transferred 27 clusters with 10 clusters populated
In stage 8: transferred 32 clusters with 10 clusters populated
In stage 9: transferred 38 clusters with 10 clusters populated
In stage 10: transferred 35 clusters with 10 clusters populated
In stage 11: transferred 34 clusters with 10 clusters populated
In stage 12: transferred 35 clusters with 10 clusters populated
In stage 13: transferred 49 clusters with 10 clusters populated
In stage 14: transferred 48 clusters with 10 clusters populated
Topics assigned to document 0_9.txt: (6, 0.81281

# Active Learning Strategy

In [38]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Generate a synthetic dataset
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)
# Split the dataset into initial training set and pool set
X_train, X_pool, y_train, y_pool = train_test_split(X, y, test_size=0.9, random_state=42)
# Initialize the active learning loop
iterations = 10
batch_size = 10
model = LogisticRegression(random_state=42)
for i in range(iterations):
    print("Iteration {}:".format(i+1))
    # Train the model on the current training set
    model.fit(X_train, y_train)
    # Predict the labels of the unlabeled instances in the pool set
    y_pool_pred = model.predict(X_pool)
    ### below
    y_pool_prob = model.predict_proba(X_pool)
    entropy = -np.sum(y_pool_prob * np.log(y_pool_prob), axis=1)
    query_idx = np.argsort(entropy)[-batch_size:]
    ### above
    X_query = X_pool[query_idx]
    y_query = y_pool[query_idx]
    # Add the labeled instances to the training set and remove them from the pool set
    X_train = np.concatenate([X_train, X_query])
    y_train = np.concatenate([y_train, y_query])
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)
    # Compute and print the accuracy of the model on the test set
    y_test_pred = model.predict(X_pool)
    accuracy = accuracy_score(y_pool, y_test_pred)
    print("Accuracy: {:.3f}\n".format(accuracy))

Iteration 1:
Accuracy: 0.828

Iteration 2:
Accuracy: 0.834

Iteration 3:
Accuracy: 0.851

Iteration 4:
Accuracy: 0.864

Iteration 5:
Accuracy: 0.874

Iteration 6:
Accuracy: 0.879

Iteration 7:
Accuracy: 0.881

Iteration 8:
Accuracy: 0.883

Iteration 9:
Accuracy: 0.886

Iteration 10:
Accuracy: 0.894



# Query By Commitee Strategy

In [37]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

# Generate a synthetic dataset
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)

# Split the dataset into initial training set and pool set
X_train, X_pool, y_train, y_pool = train_test_split(X, y, test_size=0.9, random_state=42)

# Initialize the active learning loop
iterations = 10
batch_size = 10
models = []
for i in range(3):
    models.append(LogisticRegression(random_state=42))

for i in range(iterations):
    print("Iteration {}:".format(i+1))

    # Train the committee of models on the current training set
    for j in range(len(models)):
        model = models[j]
        model.fit(X_train, y_train)

    # Compute the disagreement among the committee members for each unlabeled instance in the pool set
    y_pool_preds = []
    for j in range(len(models)):
        model = models[j]
        y_pool_pred = model.predict(X_pool)
        y_pool_preds.append(y_pool_pred)
    y_pool_preds = np.array(y_pool_preds)
    disagreement = np.sum(y_pool_preds != np.expand_dims(y_pool_preds.mean(axis=0), axis=0), axis=0)

    # Select the instances with the highest disagreement for labeling
    query_idx = np.argsort(disagreement)[-batch_size:]
    X_query = X_pool[query_idx]
    y_query = y_pool[query_idx]

    # Add the labeled instances to the training set and remove them from the pool set
    X_train = np.concatenate([X_train, X_query])
    y_train = np.concatenate([y_train, y_query])
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)

    # Shuffle the pool set to avoid selecting the same instances in the next iteration
    X_pool, y_pool = shuffle(X_pool, y_pool, random_state=42)

    # Compute and print the accuracy of the committee on the test set
    y_test_preds = []
    for j in range(len(models)):
        model = models[j]
        y_test_pred = model.predict(X_pool)
        y_test_preds.append(y_test_pred)
    y_test_preds = np.array(y_test_preds)
    y_test_pred_ensemble = np.round(y_test_preds.mean(axis=0))
    accuracy = accuracy_score(y_pool, y_test_pred_ensemble)
    print("Accuracy: {:.3f}\n".format(accuracy))


Iteration 1:
Accuracy: 0.825

Iteration 2:
Accuracy: 0.830

Iteration 3:
Accuracy: 0.840

Iteration 4:
Accuracy: 0.848

Iteration 5:
Accuracy: 0.846

Iteration 6:
Accuracy: 0.855

Iteration 7:
Accuracy: 0.854

Iteration 8:
Accuracy: 0.855

Iteration 9:
Accuracy: 0.856

Iteration 10:
Accuracy: 0.854

