1. Using Gensim, train a doc2vec model on the Brown Corpus. Try to classify documents from each category.

In [2]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import brown
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

#Loading the Brown Corpus
brown_corpus = brown.sents()

#Preprocessing the corpus and create TaggedDocuments
tagged_data = [TaggedDocument(words=word_tokenize(' '.join(sent).lower()), tags=[str(i)]) for i, sent in enumerate(brown_corpus)]

#Training the Doc2Vec model
model = gensim.models.Doc2Vec(vector_size=100, min_count=2, epochs=40)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

#Creating Infer document vectors for each document
document_vectors = [model.infer_vector(doc.words) for doc in tagged_data]

#Creating training data and labels
X_train = document_vectors[:500] + document_vectors[700:1200]  # Select documents from 'news' and 'editorial' categories
y_train = ['news'] * 500 + ['editorial'] * 500

#Creating test data and labels
X_test = document_vectors[500:700] + document_vectors[1200:1400]  # Select documents from 'news' and 'editorial' categories
y_test = ['news'] * 200 + ['editorial'] * 200

#Training a classifier (e.g., Logistic Regression) on the document vectors
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

#Evaluating the classifier
accuracy = classifier.score(X_test, y_test)
print("Accuracy:", accuracy)


[nltk_data] Downloading package punkt to /Users/db/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.7375


   2. Use the stop word removal code from earlier on the 20 user groups:
        
        How does that effect the word model distance of documents?
        
        How does it effect the logistic regression classifier?

In [None]:
import nltk
##nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#Preprocessing the corpus and create TaggedDocuments with stop word removal
stop_words = set(stopwords.words('english'))
tagged_data = []
for i, sent in enumerate(brown_corpus):
    filtered_sent = [word.lower() for word in sent if word.lower() not in stop_words]
    tagged_data.append(TaggedDocument(words=filtered_sent, tags=[str(i)]))

#Training the Doc2Vec model
model = gensim.models.Doc2Vec(vector_size=100, min_count=2, epochs=40)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

#Infering document vectors for each document
document_vectors = [model.infer_vector(doc.words) for doc in tagged_data]

#Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(document_vectors, [str(i) for i in range(len(brown_corpus))], test_size=0.2, random_state=42)

#Training logistic regression classifier without stop word removal
classifier_no_stopwords = LogisticRegression()
classifier_no_stopwords.fit(X_train, y_train)
y_pred_no_stopwords = classifier_no_stopwords.predict(X_test)
accuracy_no_stopwords = accuracy_score(y_test, y_pred_no_stopwords)
print("Accuracy without stop word removal:", accuracy_no_stopwords)

#Training logistic regression classifier with stop word removal
classifier_with_stopwords = LogisticRegression()
classifier_with_stopwords.fit(X_train, y_train)
y_pred_with_stopwords = classifier_with_stopwords.predict(X_test)
accuracy_with_stopwords = accuracy_score(y_test, y_pred_with_stopwords)
print("Accuracy with stop word removal:", accuracy_with_stopwords)
