# Code for LDA Model Work
### Mon April 17th



### 0. Imports

In [None]:

import os
import pandas as pd
import numpy as np
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


### 1. LMR Models

In [None]:

# Read the LMR training and test splits from CSV
train_data = pd.read_csv("./lmr_train_mixed_labels.csv")
test_data = pd.read_csv("lmr_test.csv")

# Discard, in place, every row that has at least one missing value
for frame in (train_data, test_data):
    frame.dropna(inplace=True)

# English stop words, kept as a set for fast membership tests while tokenizing
stop_words = set(stopwords.words('english'))

def tokenize_review(review):
    """Tokenize a review and return its lowercase tokens without stop words.

    The text is split with NLTK's ``word_tokenize``. A token is kept only
    when its lowercase form is not in the module-level ``stop_words`` set;
    kept tokens are returned in their lowercase form, in original order.
    """
    kept = []
    for raw_token in word_tokenize(review):
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return kept

# Tokenize every review once; the resulting token lists feed the vectorizer
train_data['tokens'] = train_data['review'].apply(tokenize_review)
test_data['tokens'] = test_data['review'].apply(tokenize_review)

# Convert the tokenized reviews to a bag of words representation.
# The inputs are already token lists, so both the tokenizer and the
# preprocessor are identity functions. The vocabulary is learned from the
# training split only and then reused for the test split.
vectorizer = CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x)
train_bow = vectorizer.fit_transform(train_data['tokens'])
test_bow = vectorizer.transform(test_data['tokens'])

# Train the LDA model
n_topics = 50
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(train_bow)

# Represent each review by its LDA document-topic distribution
# (one row per review, one column per topic) for the train and test splits
train_embeddings = lda.transform(train_bow)
test_embeddings = lda.transform(test_bow)

print(train_embeddings[0:5,:])
# Train the logistic regression model on the topic distributions
lr = LogisticRegression(random_state=42)
lr.fit(train_embeddings, train_data['label'])

# Predict once on the held-out test split; the same predictions are used
# for the metrics below and for the exported CSV.
test_preds = lr.predict(test_embeddings)
val_preds = test_preds

# Evaluate against the ground-truth test labels (there is no separate
# validation split; the "val_" names are kept for compatibility).
val_acc = accuracy_score(test_data['label'], val_preds)
val_f1 = f1_score(test_data['label'], val_preds, average='weighted')
val_precision = precision_score(test_data['label'], val_preds, average='weighted')
val_recall = recall_score(test_data['label'], val_preds, average='weighted')

print("Validation set results:")
print("Accuracy: {:.4f}".format(val_acc))
print("F1 Score: {:.4f}".format(val_f1))
print("Precision: {:.4f}".format(val_precision))
print("Recall: {:.4f}".format(val_recall))

# Save the predictions to a CSV file.
# Write them through a temporary frame instead of assigning to
# test_data['label']: overwriting that column would destroy the ground-truth
# labels that any later evaluation on test_data still needs.
test_data[['review']].assign(label=test_preds).to_csv("lmr_test_preds.csv", index=False)


# Changing the number of topics (50 -> 10) and repeating the experiment

# Train the LDA model
n_topics = 10
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(train_bow)

# Represent each review by its LDA document-topic distribution
# (one row per review, one column per topic) for the train and test splits
train_embeddings = lda.transform(train_bow)
test_embeddings = lda.transform(test_bow)

print(train_embeddings[0:5,:])
# Train the logistic regression model on the topic distributions
lr = LogisticRegression(random_state=42)
lr.fit(train_embeddings, train_data['label'])

# Predict once on the held-out test split and reuse the result for the
# metrics (there is no separate validation split; the "val_" names are
# kept for compatibility).
test_preds = lr.predict(test_embeddings)
val_preds = test_preds

val_acc = accuracy_score(test_data['label'], val_preds)
val_f1 = f1_score(test_data['label'], val_preds, average='weighted')
val_precision = precision_score(test_data['label'], val_preds, average='weighted')
val_recall = recall_score(test_data['label'], val_preds, average='weighted')

print("Validation set results:")
print("Accuracy: {:.4f}".format(val_acc))
print("F1 Score: {:.4f}".format(val_f1))
print("Precision: {:.4f}".format(val_precision))
print("Recall: {:.4f}".format(val_recall))


### 2. ARP Models

In [None]:
# Read the ARP training and test splits from CSV
train_data = pd.read_csv("arp_train.csv")
test_data = pd.read_csv("arp_test.csv")

# Discard, in place, every row that has at least one missing value
for frame in (train_data, test_data):
    frame.dropna(inplace=True)

In [None]:
def clean_text(lines, review_lines=None):
    """Normalize raw review texts into lists of lemmatized word tokens.

    Each text is tokenized, lowercased, stripped of punctuation characters,
    reduced to purely alphabetic tokens, filtered against the English
    stop-word list and finally lemmatized with WordNet.

    Args:
        lines: Iterable of review strings.
        review_lines: Optional list to append the results to. When omitted,
            a new list is created.

    Returns:
        The list (``review_lines`` itself when it was supplied) with one
        token list appended per input line, in input order.
    """
    if review_lines is None:
        review_lines = []

    # Build the per-call invariants once instead of once per line.
    punctuation_table = str.maketrans('', '', string.punctuation)
    english_stop_words = set(stopwords.words('english'))
    # A local lemmatizer avoids depending on a module-level `wn` that may
    # not have been created yet when this function is first called.
    lemmatizer = WordNetLemmatizer()

    for line in lines:
        # Lowercase each token, then delete punctuation characters from it
        stripped = [
            token.lower().translate(punctuation_table)
            for token in word_tokenize(line)
        ]
        # Keep alphabetic, non-stop-word tokens and lemmatize the survivors
        words = [
            lemmatizer.lemmatize(word)
            for word in stripped
            if word.isalpha() and word not in english_stop_words
        ]
        review_lines.append(words)
    return review_lines

In [None]:
# Clean both splits (test first, then train); clean_text returns the very
# list it was given after filling it with one token list per review.
clean_test = clean_text(test_data["review"], [])
clean_train = clean_text(train_data["review"], [])

# Re-join each token list into a single space-separated document string
train_docs = list(map(' '.join, clean_train))
test_docs = list(map(' '.join, clean_test))

In [None]:
def _encode_sentiment(label):
    """Return 1 for the 'pos' label and 0 for any other value."""
    if label == 'pos':
        return 1
    return 0


# Binary targets for the classifier: 'pos' -> 1, everything else -> 0
train_labels = train_data['label'].map(_encode_sentiment)
test_labels = test_data['label'].map(_encode_sentiment)


In [None]:

# Bag-of-words counts for the cleaned documents.
# The vocabulary is learned from the training documents only; the test
# documents are encoded with that same vocabulary (transform, not
# fit_transform) so that both matrices share identical feature columns.
vectorizer = CountVectorizer()
train_bow = vectorizer.fit_transform(train_docs)
test_bow = vectorizer.transform(test_docs)

In [None]:
# Fit the LDA topic model on the training bag-of-words matrix
n_topics = 10
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(train_bow)

# Represent each document by its topic distribution for both splits
train_embeddings = lda.transform(train_bow)
test_embeddings = lda.transform(test_bow)

In [None]:
# Fit a logistic-regression classifier on the training topic vectors
lr = LogisticRegression(random_state=42)
lr.fit(train_embeddings, train_labels)

# Score the classifier on the held-out documents
val_preds = lr.predict(test_embeddings)
val_acc = accuracy_score(test_labels, val_preds)
# The remaining metrics all use the same weighted averaging
val_f1, val_precision, val_recall = (
    metric(test_labels, val_preds, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)

# Report every metric with four decimal places
print("Validation set results:")
print(f"Accuracy: {val_acc:.4f}")
print(f"F1 Score: {val_f1:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")

In [1]:
# old imports


import os
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
nltk.download('punkt')
nltk.download('stopwords')


import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wn = WordNetLemmatizer()

import os
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
