## Assignment 2

## Part A

#### Prompt: Generate code for sentiment analysis

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# example text to analyze
text = "I love this product! It's amazing."

# analyze the sentiment of the text
scores = sia.polarity_scores(text)

# print the sentiment scores
print(scores)

#### Prompt: add topic modeling into the function above

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

def analyze_sentiment_and_topics(text):
    # analyze the sentiment of the text
    sentiment_scores = sia.polarity_scores(text)
    
    # tokenize the text
    tokens = word_tokenize(text.lower())
    
    # create a bag-of-words representation of the tokens
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(tokens)
    
    # perform topic modeling using Latent Dirichlet Allocation (LDA)
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(X)
    topics = lda.components_
    
    # return the sentiment scores and topics
    return sentiment_scores, topics


tp = analyze_sentiment_and_topics(text)
tp

#### Prompt: add tokenize feature into the code above and increase the complexity of the code

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# initialize the lemmatizer and stopwords list
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')

def analyze_sentiment_and_topics(text):
    # tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # analyze the sentiment of each sentence
    sentiment_scores = []
    for sentence in sentences:
        # tokenize the sentence and remove stopwords and punctuation
        tokens = [lemmatizer.lemmatize(token.lower()) for token in word_tokenize(sentence) 
                  if token.lower() not in stopwords and token.isalnum()]
        
        # analyze the sentiment of the sentence
        sentence_sentiment = sia.polarity_scores(' '.join(tokens))
        sentiment_scores.append(sentence_sentiment)
    
    # create a bag-of-words representation of the tokens
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform([' '.join(tokens) for tokens in sentences])
    
    # perform topic modeling using Latent Dirichlet Allocation (LDA)
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(X)
    topics = lda.components_
    
    # return the sentiment scores and topics
    return sentiment_scores, topics


#### Prompt: transform the code above to do function that only do sentiment analysis on sentences contains snacks  brands:

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# initialize the lemmatizer and stopwords list
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')

# define a list of snack brands
snack_brands = ['lays', 'doritos', 'cheetos', 'pringles', 'fritos']

def analyze_sentiment_snack_sentences(text):
    # tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # analyze the sentiment of sentences that mention snack brands
    sentiment_scores = []
    for sentence in sentences:
        # check if the sentence mentions any snack brands
        snack_mentioned = any([brand in sentence.lower() for brand in snack_brands])
        
        if snack_mentioned:
            # tokenize the sentence and remove stopwords and punctuation
            tokens = [lemmatizer.lemmatize(token.lower()) for token in word_tokenize(sentence) 
                      if token.lower() not in stopwords and token.isalnum()]

            # analyze the sentiment of the sentence
            sentence_sentiment = sia.polarity_scores(' '.join(tokens))
            sentence_sentiment['sentence'] = sentence
            sentiment_scores.append(sentence_sentiment)
    
    # return the sentiment scores
    return sentiment_scores


#### Prompt: transform the code above that find the sentiment score of each snacks in a text file and calcute the lift value for each snack:

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import math

# initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# initialize the lemmatizer and stopwords list
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')

# define a list of snack brands
snack_brands = ['lays', 'doritos', 'cheetos', 'pringles', 'fritos']

def calculate_lift_value(file_path):
    # read the contents of the file
    with open(file_path, 'r') as f:
        text = f.read()

    # tokenize the text into sentences
    sentences = sent_tokenize(text)

    # create a dictionary to store the frequency of each snack brand in the text
    snack_frequencies = {brand: 0 for brand in snack_brands}

    # loop through each sentence in the text
    for sentence in sentences:
        # loop through each snack brand
        for brand in snack_brands:
            # check if the sentence mentions the snack brand
            if brand in sentence.lower():
                snack_frequencies[brand] += 1

    # calculate the total number of sentences in the text
    total_sentences = len(sentences)

    # calculate the lift value of each snack brand
    snack_lift_values = {}
    for brand in snack_brands:
        # calculate the frequency of the snack brand in the text
        snack_frequency = snack_frequencies[brand]

        # calculate the frequency of the snack brand in the absence of the text
        non_snack_frequency = total_sentences - snack_frequency

        # calculate the frequency of all other snack brands in the text
        other_snack_frequencies = sum([snack_frequencies[b] for b in snack_brands if b != brand])

        # calculate the frequency of all other words in the text
        all_words = word_tokenize(text.lower())
        all_word_frequencies = Counter(all_words)
        other_word_frequencies = sum([all_word_frequencies[w] for w in all_word_frequencies.keys() 
                                      if w not in snack_brands and w not in stopwords])

        # calculate the lift value of the snack brand
        lift_value = (total_sentences * snack_frequency) / (snack_frequency + other_snack_frequencies) \
                     * (snack_frequency + non_snack_frequency) / (non_snack_frequency + other_snack_frequencies) \
                     * (snack_frequency + other_snack_frequencies) / (snack_frequency + other_snack_frequencies + other_word_frequencies)

        # round the lift value to 2 decimal places
        lift_value = round(lift_value, 2)

        # add the lift value to the dictionary
        snack_lift_values[brand] = lift_value

    # return the lift values of each snack brand
    return snack_lift_values


#### The code above has an error where it misses the case that the same brand of the snack got mention multiple times in a sentence. So if a review sentence start with "Lays, Lays, Lays", it will count Lays three times instead of 1 which will make the result inaccurate.

## Part B. Code Improvements with ChatGPT

#### To begin with we picked up some sample code from nltk on VADER sentiment 
#### analysis. We made the code very choppy by adding unecessary import 
#### statements all over the place. We also changed variable names from 
#### decriptive to very vague names like var1, var2. The code was very
#### slopy to begin with and we were using depracated ways of doing things
#### which was in the sample code to begin with. The initial code is in the 
#### cell below.

In [None]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
nltk.download('subjectivity')
nltk.download('punkt')
nltk.download('vader_lexicon')
n_instances = 100
var1 = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
var2 = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
len(var1), len(var2)
var1[0]
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity


var3 = var1[:80]
var4 = var1[80:100]
var5 = var2[:80]
var6 = var2[80:100]
var7 = var3 + var5
var8 = var4 + var6
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in var7])
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
len(unigram_feats)
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
training_set = sentim_analyzer.apply_features(var7)
test_set = sentim_analyzer.apply_features(var8)
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
     print('{0}: {1}'.format(key, value))
from nltk.classify import NaiveBayesClassifier



from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentences = ["VADER is smart, handsome, and funny.", # positive sentence example
                  "VADER is smart, handsome, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted)
                 "VADER is very smart, handsome, and funny."
                  ]
paragraph = "It was one of the worst movies I've seen, despite good reviews. \
 Unbelievably bad acting!! Poor direction. VERY poor production. \
 The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *



lines_list = tokenize.sent_tokenize(paragraph)
sentences.extend(lines_list)


tricky_sentences = [
         "Most automated sentiment analysis tools are shit.",
         "VADER sentiment analysis is the shit.",
     "",
         "Sentiment analysis has never been good."

      ]
sentences.extend(tricky_sentences)
for sentence in sentences:
     sid = SentimentIntensityAnalyzer()
     print(sentence)
     ss = sid.polarity_scores(sentence)
     for k in sorted(ss):
              print('{0}: {1}, '.format(k, ss[k]), end='')
     print()

#### ChatGPT improved this code a lot and removed all the unnecessary import
#### statements. It also removed some instances of using old depracted 
#### string format methods with newer modern ones. It also used a more 
#### efficient method to tokenize strings. It also did some cleanup and added
#### indentation where needed. it did not however, change the variable names

In [None]:
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import mark_negation
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

nltk.download('subjectivity')
nltk.download('punkt')
nltk.download('vader_lexicon')

n_instances = 100

var1 = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
var2 = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

var3 = var1[:80]
var4 = var1[80:100]
var5 = var2[:80]
var6 = var2[80:100]
var7 = var3 + var5
var8 = var4 + var6

sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in var7])
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)

sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

training_set = sentim_analyzer.apply_features(var7)
test_set = sentim_analyzer.apply_features(var8)

trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)

for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
     print('{0}: {1}'.format(key, value))

sentences = ["VADER is smart, handsome, and funny.", # positive sentence example
                  "VADER is smart, handsome, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted)
                 "VADER is very smart, handsome, and funny."
                  ]
paragraph = "It was one of the worst movies I've seen, despite good reviews. \
Unbelievably bad acting!! Poor direction. VERY poor production. \
The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"

lines_list = sent_tokenize(paragraph)
sentences.extend(lines_list)

tricky_sentences = [
         "Most automated sentiment analysis tools are shit.",
         "VADER sentiment analysis is the shit.",
         "",
         "Sentiment analysis has never been good."
]

sentences.extend(tricky_sentences)

for sentence in sentences:
     sid = SentimentIntensityAnalyzer()
     print(sentence)
     ss = sid.polarity_scores(sentence)
     for k, v in sorted(ss.items()):
         print(f"{k}: {v}, ", end='')
     print()


#### Upon asking it to improve further, ChatGPT did change the ambiguous 
#### variable names to more descriptive ones. It also changed the string
#### formatting method again to a supposedly more easier to read one. We think 
#### this was not necessary and the .format() it changed previously was more
#### than sufficient. It again did some cleanup and moved some imports into 
#### one line and added some new lines for readability. There was overall no 
#### change in logic.

In [None]:
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import mark_negation
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

nltk.download('subjectivity')
nltk.download('punkt')
nltk.download('vader_lexicon')

n_instances = 100

subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

train_docs = subj_docs[:80] + obj_docs[:80]
test_docs = subj_docs[80:100] + obj_docs[80:100]

sentim_analyzer = SentimentAnalyzer()

all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in train_docs])
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)

sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

training_set = sentim_analyzer.apply_features(train_docs)
test_set = sentim_analyzer.apply_features(test_docs)

trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)

for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
     print(f'{key}: {value}')

sentences = ["VADER is smart, handsome, and funny.", # positive sentence example
             "VADER is smart, handsome, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted)
             "VADER is very smart, handsome, and funny."
            ]

paragraph = "It was one of the worst movies I've seen, despite good reviews. \
Unbelievably bad acting!! Poor direction. VERY poor production. \
The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"

lines_list = sent_tokenize(paragraph)
sentences.extend(lines_list)

tricky_sentences = [
         "Most automated sentiment analysis tools are shit.",
         "VADER sentiment analysis is the shit.",
         "",
         "Sentiment analysis has never been good."
]

sentences.extend(tricky_sentences)

for sentence in sentences:
     sid = SentimentIntensityAnalyzer()
     ss = sid.polarity_scores(sentence)
     for k, v in sorted(ss.items()):
         print(f"{k}: {v}, ", end='')
     print()


#### We asked it again to improve the code and this time it yet again said it 
#### changed the string formatting method from using the depracated % to 
#### .format() but the code we gave it to improve was the same code we got from 
#### the previous step which didnt have the % string formatter. It also 
#### expanded the previously written variable names to more descriptive ones.
#### Which was not needed but still does improve readability. Finally, it 
#### did add comments for each relevant line of code. There was no logic change

In [None]:
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import mark_negation
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

# Download necessary NLTK resources
nltk.download('subjectivity')
nltk.download('punkt')
nltk.download('vader_lexicon')

# Set number of instances
n_instances = 100

# Load subjectivity and objectivity sentences
subjectivity_sentences = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
objectivity_sentences = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

# Split data into training and testing sets
train_docs = subjectivity_sentences[:80] + objectivity_sentences[:80]
test_docs = subjectivity_sentences[80:100] + objectivity_sentences[80:100]

# Initialize sentiment analyzer
sentim_analyzer = SentimentAnalyzer()

# Extract unigram features
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in train_docs])
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)

sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

# Apply feature extractor to training and testing sets
training_set = sentim_analyzer.apply_features(train_docs)
test_set = sentim_analyzer.apply_features(test_docs)

# Train Naive Bayes classifier on training set
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)

# Evaluate classifier on testing set
for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
     print(f'{key}: {value}')

# Analyze sentiment of sample sentences
sentences = [
    "VADER is smart, handsome, and funny.", # positive sentence example
    "VADER is smart, handsome, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted)
    "VADER is very smart, handsome, and funny."
]

paragraph = "It was one of the worst movies I've seen, despite good reviews. \
Unbelievably bad acting!! Poor direction. VERY poor production. \
The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"

lines_list = sent_tokenize(paragraph)
sentences.extend(lines_list)

tricky_sentences = [
    "Most automated sentiment analysis tools are shit.",
    "VADER sentiment analysis is the shit.",
    "",
    "Sentiment analysis has never been good."
]

sentences.extend(tricky_sentences)

for sentence in sentences:
     sid = SentimentIntensityAnalyzer()
     ss = sid.polarity_scores(sentence)
     for k, v in sorted(ss.items()):
         print(f"{k}: {v}, ", end='')
     print()


**bold text**## Part B. Adding complexity 

###Then we added more conplexity to our code. Following the following procedure


1. Added more data: Changed the amount of data used from 100 instances to 5000 instances.

2. The pre-processing steps used: the nltk library is used to perform tokenization and lemmatization on the text data, which can help improve the quality of the features used in the model. 

3. Added a new classifier: In addition to the Naive Bayes classifier, we added a Support Vector Machine (SVM) classifier to the pipeline. The SVM classifier is known to work well on text classification tasks and can handle high-dimensional feature spaces. Also, We added TF-IDF to assign weights to each word in the text and capture the importance of rare words. Finally, *mark_negation* is applied to the preprocessed text data before the TF-IDF feature extraction, which an improvement compared to the previous version, as it can help the model better capture the nuances of negated sentiments.


###After adding the complexity, the code looks as follows:


In [None]:
import nltk
from nltk.classify import SklearnClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.sentiment.util import mark_negation
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Download necessary NLTK resources
nltk.download('subjectivity')
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Set number of instances
n_instances = 5000

# Load subjectivity and objectivity sentences
subjectivity_sentences = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
objectivity_sentences = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

# Split data into training and testing sets
train_docs = subjectivity_sentences[:4000] + objectivity_sentences[:4000]
test_docs = subjectivity_sentences[4000:] + objectivity_sentences[4000:]

# Initialize sentiment analyzer
lemmatizer = WordNetLemmatizer()
vectorizer = TfidfVectorizer(tokenizer=word_tokenize, preprocessor=lemmatizer.lemmatize)
classifier = SklearnClassifier(Pipeline([('tfidf', vectorizer), ('svm', SVC(kernel='linear'))]))

# Apply feature extractor to training and testing sets
train_data = [mark_negation(doc) for doc in train_docs]
train_X = vectorizer.fit_transform([doc for doc, _ in train_data])
train_y = [label for _, label in train_data]

test_data = [mark_negation(doc) for doc in test_docs]
test_X = vectorizer.transform([doc for doc, _ in test_data])
test_y = [label for _, label in test_data]

# Train classifier on training set
classifier.train(zip(train_X, train_y))

# Evaluate classifier on testing set
accuracy = nltk.classify.accuracy(classifier, zip(test_X, test_y))
print(f'Accuracy: {accuracy:.4f}')

# Analyze sentiment of sample sentences
sentences = [
    "VADER is smart, handsome, and funny.", # positive sentence example
    "VADER is smart, handsome, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted)
    "VADER is very smart, handsome, and funny."
]

paragraph = "It was one of the worst movies I've seen, despite good reviews. \
Unbelievably bad acting!! Poor direction. VERY poor production. \
The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"

lines_list = nltk.sent_tokenize(paragraph)
sentences.extend(lines_list)

tricky_sentences = [
    "Most automated sentiment analysis tools are shit.",
    "VADER sentiment analysis is the shit.",
    "",
    "Sentiment analysis has never been good."
]

sentences.extend(tricky_sentences)

for sentence in sentences:
     sid = SentimentIntensityAnalyzer()
     ss = sid.polarity_scores(sentence)
     print(f'{sentence}\nPositive: {ss["pos"]:.4f}\nNegative: {ss["neg"]:.4f}\nNeutral: {ss["neu"]:.4f}\n')


## Does ChatGPT handle complexity

#### After asking ChatGPT to improve this code, ChatGPT did not give us a code
#### block. It simply gave us tips on how to improve it further. For example it
#### asked us to use more data which is not exactly improving the code. We then 
#### specifically asked ChatGPT to improve the code and not explain what it did.
#### This was an interesting occurance because it gave us the same code back 
#### unchanged. When we asked it to explain what it did, it said that it did 
#### not change anything because we asked it not explain anything. It was 
#### almost as if it was being sassy with us. After that we changed what we 
#### asked in the prompt to "improve this piece of code and keep explanations 
#### of improvements concise" it gave us some general improvements again like
#### removing unused imports. But it also asked us to use Grid Search to 
#### improve our svc classifier. Which is a step in the right direction but
#### it is a very general advice which programmers/data scientists would 
#### do in their field to explore the best paramaters (Hyper parameter tuning).
#### In conclusion, we can say that for general low level improvements like 
#### removing unused imports and changing variable names ChatGPT can be helpful.
#### However, these improvements are better suited for IDE suggestions and code
#### scan tools like SonarQube. For more complex code ChatGPT cannot add much 
#### besides suggesting general best practices.


##**PART C**

#### For this part of the assignment, we gave ChatGPT a NLP code snippets and checked if it was able to explain what the code is about.

Using ChatGpt to understand Text Classification with BERT in PyTorch

Referred the following github repo - https://github.com/nlptown/nlp-notebooks/blob/master/Text%20classification%20with%20BERT%20in%20PyTorch.ipynb 

In [None]:
#Prompt1
import ndjson
from sklearn.model_selection import train_test_split

with open(CORPUS_PATH) as i:
    data = ndjson.load(i)
    
texts = [" ".join([doc["title"], doc["body"]]) for doc in data]
labels = [doc["rating"] for doc in data]
    
rest_texts, test_texts, rest_labels, test_labels = train_test_split(texts, labels, test_size=0.1, random_state=1)
train_texts, dev_texts, train_labels, dev_labels = train_test_split(rest_texts, rest_labels, test_size=0.1, random_state=1)

print("Train size:", len(train_texts))
print("Dev size:", len(dev_texts))
print("Test size:", len(test_texts))


ChatGpt Response: <br>
This code loads the dataset stored in the file specified by CORPUS_PATH, splits it into three subsets (training, development, and testing) using the train_test_split function from Scikit-learn, and prints the sizes of the resulting subsets.

The first block of code reads in the data from the file using the ndjson.load function, which is able to parse each line of the file as a separate JSON object. The resulting data object is a list of dictionaries, where each dictionary represents a review and contains two keys: "title", which stores the review's title, and "body", which stores the main body of the review.

The second block of code creates two lists: texts and labels. texts is a list of strings that concatenates the "title" and "body" fields of each review into a single string. labels is a list of integers representing the rating for each review (on a scale of 1 to 5).

The third block of code splits the texts and labels lists into three separate sets: train_texts, dev_texts, and test_texts, and train_labels, dev_labels, and test_labels. The train_test_split function randomly partitions the data, with 90% used for training, and 10% used for testing. The test_size parameter specifies the proportion of the data to be used for testing, while random_state ensures reproducibility of the random partitioning.

Finally, the code prints the number of examples in each of the resulting subsets.

In [None]:
#Prompt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression(multi_class="ovr", solver="lbfgs"))
])

parameters = {'lr__C': [0.1, 0.5, 1, 2, 5, 10, 100, 1000]}

best_classifier = GridSearchCV(pipeline, parameters, cv=5, verbose=1)
best_classifier.fit(train_texts, train_labels)
best_predictions = best_classifier.predict(test_texts)

baseline_accuracy = np.mean(best_predictions == test_labels)
print("Baseline accuracy:", baseline_accuracy)

ChatGpt Response: <br>
This code defines a logistic regression model for sentiment analysis, and uses grid search to find the best hyperparameter C for the model. The resulting model is evaluated on the test set to obtain a baseline accuracy.

The logistic regression model is defined as a Pipeline object that chains three different components:

A CountVectorizer, which converts each review text into a sparse matrix of word counts.
A TfidfTransformer, which applies term frequency-inverse document frequency (TF-IDF) weighting to the word counts.
A LogisticRegression classifier, which uses a one-vs-rest approach (multi_class="ovr") and the limited-memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) solver (solver="lbfgs") to perform multi-class classification.
The Pipeline object allows the three components to be treated as a single model that can be trained and used for prediction.

The parameters dictionary defines a single hyperparameter C for the logistic regression classifier, and specifies a set of eight possible values to try.

The GridSearchCV function is used to perform a grid search over the hyperparameter space defined by parameters. The cv parameter specifies 5-fold cross-validation to estimate the generalization performance of each hyperparameter setting. The verbose parameter is set to 1 to print progress messages during the search.

The resulting best_classifier object is the grid search estimator that has been trained on the training set with the best hyperparameters found during the search.

The best_predictions array contains the predicted ratings for the test set, which are compared against the true ratings test_labels to compute the baseline accuracy using NumPy's np.mean function.

The resulting baseline accuracy provides a measure of the performance of the logistic regression model before any further improvements are made.

In [None]:
#prompt
!pip install transformers
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BERT_MODEL = "bert-base-uncased"
from transformers.tokenization_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

ChatGpt Response: <br>
This code installs the transformers library using the !pip install command, and then imports the torch library for working with PyTorch tensors. The device variable is set to "cuda" if a GPU is available, and "cpu" otherwise. This will be used to move tensors to the appropriate device for computation.

The BERT_MODEL variable is set to "bert-base-uncased", which is a pre-trained BERT model from the Hugging Face Transformers library that has been trained on lower-cased English text.

The BertTokenizer class from the transformers.tokenization_bert module is then imported, and an instance of the tokenizer is created using the from_pretrained method with the BERT_MODEL variable as the argument. This tokenizer will be used to tokenize and encode the review texts for input to the BERT model.

In [None]:
#Prompt
import logging
import numpy as np

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

MAX_SEQ_LENGTH=100

class BertInputItem(object):
    """An item with all the necessary attributes for finetuning BERT."""

    def __init__(self, text, input_ids, input_mask, segment_ids, label_id):
        self.text = text
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        

def convert_examples_to_inputs(example_texts, example_labels, label2idx, max_seq_length, tokenizer, verbose=0):
    """Loads a data file into a list of `InputBatch`s."""
    
    input_items = []
    examples = zip(example_texts, example_labels)
    for (ex_index, (text, label)) in enumerate(examples):

        # Create a list of token ids
        input_ids = tokenizer.encode(f"[CLS] {text} [SEP]")
        if len(input_ids) > max_seq_length:
            input_ids = input_ids[:max_seq_length]

        # All our tokens are in the first input segment (id 0).
        segment_ids = [0] * len(input_ids)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label2idx[label]

        input_items.append(
            BertInputItem(text=text,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_id))

        
    return input_items

train_features = convert_examples_to_inputs(train_texts, train_labels, label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=0)
dev_features = convert_examples_to_inputs(dev_texts, dev_labels, label2idx, MAX_SEQ_LENGTH, tokenizer)
test_features = convert_examples_to_inputs(test_texts, test_labels, label2idx, MAX_SEQ_LENGTH, tokenizer)



ChatGpt Response: <br>
Great job on implementing the convert_examples_to_inputs function! It takes in a list of example texts and their corresponding labels, as well as a dictionary mapping the possible label names to their corresponding index, and converts them into a list of BertInputItems that can be used to train a BERT model.

It's worth noting that the function truncates examples that are longer than the maximum sequence length specified by MAX_SEQ_LENGTH and pads shorter examples with zeros. This is a common approach when working with fixed-length inputs, and BERT requires fixed-length inputs to work properly.

By calling this function on the training, development, and test datasets, you've created three lists of BertInputItems that can be used to train, tune, and evaluate a BERT model.

In [None]:
#prompt
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

def get_data_loader(features, max_seq_length, batch_size, shuffle=True): 

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    dataloader = DataLoader(data, shuffle=shuffle, batch_size=batch_size)
    return dataloader

BATCH_SIZE = 16

train_dataloader = get_data_loader(train_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=True)
dev_dataloader = get_data_loader(dev_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False)
test_dataloader = get_data_loader(test_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False)

ChatGpt Response: <br>
The get_data_loader() function takes in a list of BertInputItem objects, as well as other parameters, and converts them into PyTorch DataLoader objects that can be fed into the neural network for training or evaluation. Specifically, it creates a TensorDataset from the input features, and then creates a DataLoader that iterates over batches of the TensorDataset.

In the function, the all_input_ids, all_input_mask, all_segment_ids, and all_label_ids tensors are created from the BertInputItem objects. These tensors correspond to the input token IDs, input mask (indicating which tokens are padding tokens), segment IDs (indicating which tokens belong to which sentence in a pair of sentences), and label IDs (corresponding to the integer indices of the labels in the label2idx mapping), respectively. These tensors are then used to create a TensorDataset, which is a PyTorch dataset that stores these tensors in a way that is amenable to batching.

Finally, the DataLoader is created from the TensorDataset, with the batch_size and shuffle parameters specified. The batch_size parameter determines how many examples are processed in each batch, and the shuffle parameter determines whether the examples in each batch are randomly shuffled or not. The resulting DataLoader can then be iterated over to obtain batches of data for feeding into the neural network. 

In [None]:
#THis is the complex code given by ChatGpt 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader
import transformers

class BERTTextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(BERTTextClassificationModel, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids, attention_mask=attention_mask, return_dict=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoded_text = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return encoded_text['input_ids'], encoded_text['attention_mask'], label

def train(model, train_loader, optimizer, criterion, device):
    model.train()
    for input_ids, attention_mask, labels in train_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

def evaluate(model, val_loader, criterion, device):
    model.eval()
    correct_preds = 0
    total_preds = 0
    running_loss = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            running_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            correct_preds += torch.sum(preds == labels)
            total_preds += len(labels)

    val_loss = running_loss / len(val_loader)
    val_acc = correct_preds / total_preds
    return val_loss, val_acc

if __name__ == '__main__':
    train_texts = ['This is the first text', 'This is the second text', 'This is the third text']
    train_labels = [0, 1, 1] # binary labels
    val_texts = ['This is the fourth text', 'This is the fifth text']
    val_labels = [0, 1] # binary labels
    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

    train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer)
    val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BERTTextClassificationModel(num_classes


ChatGpt Response: <br>
It seems like the code is incomplete as the line model = BERTTextClassificationModel(num_classes is not complete. But based on the code provided, it looks like a text classification model using BERT is being implemented.

The BERTTextClassificationModel class defines the architecture of the model. It uses the BERT model as the base, with a dropout layer and a linear classifier layer on top to classify text into num_classes categories.

The TextClassificationDataset class defines a custom dataset to be used with the DataLoader. It takes in texts, labels, and a tokenizer, and returns the input_ids, attention_mask, and label for each instance in the dataset.

The train function trains the model on the given train_loader, using the specified optimizer and criterion. It sets the model to training mode and performs backpropagation to update the weights.

The evaluate function evaluates the model on the given val_loader, using the specified criterion. It sets the model to evaluation mode, calculates the validation loss and accuracy, and returns them.

In the main function, the code defines train and validation datasets, initializes the data loaders, selects the device for training, initializes the model with the given number of classes, and trains and evaluates the model using the defined functions.

Overall, this code implements a simple binary text classification model using BERT and PyTorch.

So i asked ChatGpt to remove error and improve the code

In [None]:
#Prompt with improved code
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader
import transformers

class BERTTextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(BERTTextClassificationModel, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids, attention_mask=attention_mask, return_dict=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoded_text = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return encoded_text['input_ids'], encoded_text['attention_mask'], label

def train(model, train_loader, optimizer, criterion, device):
    model.train()
    for input_ids, attention_mask, labels in train_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

def evaluate(model, val_loader, criterion, device):
    model.eval()
    correct_preds = 0
    total_preds = 0
    running_loss = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            running_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            correct_preds += torch.sum(preds == labels)
            total_preds += len(labels)

    val_loss = running_loss / len(val_loader)
    val_acc = correct_preds / total_preds
    return val_loss, val_acc

if __name__ == '__main__':
    train_texts = ['This is the first text', 'This is the second text', 'This is the third text']
    train_labels = [0, 1, 1] # binary labels
    val_texts = ['This is the fourth text', 'This is the fifth text']
    val_labels = [0, 1] # binary labels
    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

    train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer)
    val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BERTTextClassificationModel


It looks like there is a typo in the code.

In the last line of the code, the model is not instantiated. Instead, it is only being assigned the class BERTTextClassificationModel. To instantiate the model, parentheses need to be added after the class name. The corrected last line should be:

scss
Copy code
model = BERTTextClassificationModel(num_classes=2)
This creates an instance of the BERTTextClassificationModel with 2 output classes (binary classification).






##HOW DID CHATGPT HANDLED COMPLEXITY
#### After testing ChatGpt for part C. ChatGpt performed pretty good in 
#### explaining almost all kinds of code. But when the complexity is increased
#### the accuracy of CHatGpt starts reducing and responses becomes redundant. 

# PART D

#### For this part of the assignment, we gave ChatGPT a few broken code snippets and checked if it was able to debug or find errors in the codes.

## Task 1: Debugging Example

In [None]:
# Broke Code 1
import spacy

nlp = spacy.load('en_core_web_sm')
text = "I like to eat apples and oranges."

doc = nlp(text)

for token in doc:
    if token.text == "apples":
        token.text = "bananas"

print(doc.text)

#### This code attempts to replace the word "apples" with "bananas" in the input text, using the Spacy library for NLP processing. However, this code will produce an error, because the token.text attribute is read-only and cannot be modified directly.
####To fix this code, you would need to create a new string with the modified tokens, rather than trying to modify the tokens themselves.

####ChatGPT impressively found the mistake at once and gave the corrected code along with the explanation:

## ChatGPT Response

In [None]:
# Correct Code 1
import spacy

nlp = spacy.load('en_core_web_sm')
text = "I like to eat apples and oranges."

doc = nlp(text)

new_tokens = []
for token in doc:
    if token.text == "apples":
        new_tokens.append("bananas")
    else:
        new_tokens.append(token.text)

new_text = " ".join(new_tokens)
print(new_text)


####Next, we removed an import statement from the broken code and tried to find out if it could detect whether the import statement was removed. 
####However, this was not the case and ChatGPT gave the same corrected code that it gave in the above case (Corrected Code 1). 
####This proved that ChatGPT did not detect the missing import statement and only gave the output that it was initially trained to give. It could not detect the newly introduced bug.

## Task 2:Debugging
Problematic Code:


In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import gensim
from gensim.models import Word2Vec, LdaModel
from gensim.corpora import Dictionary

# Load the spaCy English language model and define the stop words
nlp = spacy.load('en_core_web_sm')
stop_words = STOP_WORDS

# Define a function to preprocess the text
def preprocess(text):
    # Tokenize the text
    doc = nlp(text)
    # Remove stop words and punctuation, and lemmatize the tokens
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    # Return the preprocessed tokens as a list
    return tokens

# Define the hotel review

# Preprocess the hotel review
processed_review = preprocess(review)

# Train a Word2Vec model on the hotel review
model = Word2Vec([processed_review], min_count=1)

# Define a function to perform topic modeling using LDA
def topic_modeling(corpus, num_topics):
    # Create a dictionary from the corpus
    dictionary = Dictionary(corpus)
    # Create a bag-of-words representation of the corpus
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]
    # Train an LDA model on the bag-of-words representation
    lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary)
    # Return the trained LDA model
    return lda_model

# Perform topic modeling on the hotel review using LDA
corpus = [processed_review]
num_topics = 1
lda_model = topic_modeling(corpus, num_topics)

# Print the topics and their associated keywords
topics = lda_model.show_topics(num_topics=num_topics, num_words=5, formatted=False)
for topic in topics:
    print("Topic {}: {}".format(topic[0], [word[0] for word in topic[1]]))

## ChatGPT Response:

The code appears to be missing the variable review, which is used in the line processed_review = preprocess(review). You need to define the review variable and assign it a string value before running the code.

Assuming that review has been defined, the code may also throw an error if the processed_review variable is an empty list after being processed by the preprocess() function. This can happen if the review variable contains only stop words or punctuation.

To avoid this, you can add a check to ensure that the processed_review variable is not empty before passing it to the Word2Vec function. For example:


```
# This is formatted as code
processed_review = preprocess(review)
if processed_review:
    model = Word2Vec([processed_review], min_count=1)
else:
    print("Error: processed_review is empty")
```
Other than that, the code looks fine. It loads the spaCy English language model, defines a function to preprocess text by removing stop words and lemmatizing the remaining words, trains a Word2Vec model on the preprocessed hotel review, and performs topic modeling on the review using LDA. Finally, it prints the topics and their associated keywords.


## Task 3: debugging snippet of code

Problematic code:

In [None]:
# Create a list of all the aspirational phrases or words list, to replace the comments containing these texts with "aspiration"
aspirational_words = ['hope that', 'hope to', 'wish to', 'wish that', 'dream to', 'dream that', 'would like', 'would love', 'aim', 'plan to', 'need to', 'needs to']
aspiration = pd.DataFrame(aspirational_words)
aspiration.columns=['phrases']
aspiration_list = aspiration['phrases'].to_list()

for i in aspirational_words:
    df['comments_aspiration'] = df['comments'].str.replace(i,"aspiration")

df['comments_tokens'] = df['comments_aspiration'].apply(lambda each_post: word_tokenize(re.sub(r'[^\w\s]',' ',each_post.lower())))
df['comments_tokens'] = df['comments_tokens'].apply(lambda list_of_words: [x if x not in model_brand_dict else model_brand_dict[x] for x in list_of_words])
df['comments_tokens'] = df['comments_tokens'].apply(lambda list_of_words: [x for x in list_of_words if x not in stop])
df['comments_unique'] = df['comments_tokens'].apply(lambda x: sorted(set(x)))
# Lift: features and brands
lift_aspiration_brand = pd.DataFrame(columns = ['brand','feature','lift'])
i=0
for brand in brands:
    temp_lift=calculate_lift(str(brand), df.comments_unique.contains('aspiratio'), df.comments_unique)
    lift_aspiration_brand.loc[i]=[str(brand),str(df.comments_unique).contains('aspiratio'),temp_lift]
    i=i+1

lift_aspiration_brand.sort_values('lift',ascending = False).head(10)

## ChatGPT Response:

There are several issues with the code:

pd module is not imported. Add import pandas as pd at the beginning of the code.

df dataframe is not defined in the code. You need to define and populate df with the comments data before running the code.

The line df['comments_aspiration'] = df['comments'].str.replace(i,"aspiration") is inside the for loop but should be outside. Otherwise, it will only replace the last aspirational word in the list.

df.comments_unique.contains('aspiratio') and str(df.comments_unique).contains('aspiratio') should be replaced with df.comments_tokens.apply(lambda x: 'aspiration' in x) because contains is not a method of a Python list object.

calculate_lift function is not defined in the code. You need to define it before using it.

Here's the corrected code:


```
# This is formatted as code
import pandas as pd
from nltk.tokenize import word_tokenize
import re

# Define df with comments data
df = pd.DataFrame({'comments': ['I hope that I can buy this product', 'I wish to have this product', 'This is my dream to own', 'I would like to purchase this', 'I plan to buy this', 'I need to have this']})

# Create a list of all the aspirational phrases or words list, to replace the comments containing these texts with "aspiration"
aspirational_words = ['hope that', 'hope to', 'wish to', 'wish that', 'dream to', 'dream that', 'would like', 'would love', 'aim', 'plan to', 'need to', 'needs to']
aspiration_list = ['aspiration']

for i in aspirational_words:
    df['comments'] = df['comments'].str.replace(i, "aspiration")

df['comments_tokens'] = df['comments'].apply(lambda each_post: word_tokenize(re.sub(r'[^\w\s]',' ',each_post.lower())))
df['comments_tokens'] = df['comments_tokens'].apply(lambda list_of_words: [x if x not in model_brand_dict else model_brand_dict[x] for x in list_of_words])
df['comments_tokens'] = df['comments_tokens'].apply(lambda list_of_words: [x for x in list_of_words if x not in stop])
df['comments_unique'] = df['comments_tokens'].apply(lambda x: sorted(set(x)))

# Lift: features and brands
lift_aspiration_brand = pd.DataFrame(columns=['brand', 'feature', 'lift'])
i=0
for brand in brands:
    temp_lift = calculate_lift(str(brand), df.comments_tokens.apply(lambda x: 'aspiration' in x), df.comments_unique)
    lift_aspiration_brand.loc[i] = [str(brand), 'aspiration', temp_lift]
    i += 1

lift_aspiration_brand.sort_values('lift', ascending=False).head(10)
```
Note: You will need to define brands, stop, and model_brand_dict variables and the calculate_lift function before running the corrected code.


## Task 4: Sentiment Analysis

Problematic Code:

In [None]:
## Imports
import pandas as pd
import nltk
nltk.download('stopwords')
#Created dataframe from scraped tweets
df = pd.read_csv("tweetsv3.csv", names=["username", "location", "text"])
#Remove all na's
df = df.fillna('NA')
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
#Function to categorize the compound value as positive, negative, or neutral
def senti(df):
    if df['compound'] >= 0.05:
        val = "Positive"
    elif df['compound'] <= -0.05:
        val = "Negative"
    else:
        val = "Neutral"
    return val

#Calling function  
df['Comp_score'] = df.apply(senti, axis=1)
#Function to implement sentiment analysis by running loop through text token column and looking for issue and candidate using the window=10 function
def sentimentIssue(df,issue, candidate, window):
    scores=[]
    for i in df["text_tokens"]:
        if ((issue in d) and (candidate in i)):
            a = ' '.join(e for e in i[i.index(issue)-window:i.index(issue)+window])
            if a != "":
                sentiment_score = analyser.polarity_scores(a)['compound']
                scores.append(sentiment_score)
    return np.mean(scores)
#Printing sentiment values
print("Sentiment between abortion and Oz is negative and the sentiment score is %.4f"% sentimentIssue(df,"abortion", "oz", window=10))
print("Sentiment between crime and Oz is negative and the sentiment score is %.4f"% sentimentIssue(df,"crime", "oz", window=10))
print("Sentiment between tax and Oz is neutral and the sentiment score is %.4f"% sentimentIssue(df,"tax", "oz", window=10))
print("Sentiment between foreign and Oz is positive and the sentiment score is %.4f"% sentimentIssue(df,"foreign", "oz", window=10))
print("Sentiment between abortion and Fetterman is negative and the sentiment score is %.4f"% sentimentIssue(df,"abortion", "fetterman", window=10))
print("Sentiment between crime and Fetterman is negative and the sentiment score is %.4f"% sentimentIssue(df,"crime", "fetterman", window=10))
print("Sentiment between tax and Fetterman is neutral and the sentiment score is %.4f"% sentimentIssue(df,"tax", "fetterman", window=10))
print("Sentiment between foreign and Fetterman is positive and the sentiment score is %.4f"% sentimentIssue(df,"foreign", "fetterman", window=10))



## ChatGPT Response:

There are a few issues with the provided code:

The import statement for the NumPy library is missing, but it is required for the code to run. You can add it at the top of the code with import numpy as np.

In the sentimentIssue function, there is a typo where d should be i. The corrected line should be if ((issue in i) and (candidate in i)):.

In the sentimentIssue function, the scores list is not initialized, which can cause an error if no sentiment scores are found. You can initialize it as an empty list at the beginning of the function with scores = [].

Here's the corrected code:



```
# This is formatted as code
## Imports
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
#Created dataframe from scraped tweets
df = pd.read_csv("tweetsv3.csv", names=["username", "location", "text"])
#Remove all na's
df = df.fillna('NA')
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
#Function to categorize the compound value as positive, negative, or neutral
def senti(df):
    if df['compound'] >= 0.05:
        val = "Positive"
    elif df['compound'] <= -0.05:
        val = "Negative"
    else:
        val = "Neutral"
    return val

#Calling function  
df['Comp_score'] = df.apply(senti, axis=1)

#Function to implement sentiment analysis by running loop through text token column and looking for issue and candidate using the window=10 function
def sentimentIssue(df, issue, candidate, window):
    scores = []
    for i in df["text_tokens"]:
        if ((issue in i) and (candidate in i)):
            a = ' '.join(e for e in i[i.index(issue)-window:i.index(issue)+window])
            if a != "":
                sentiment_score = analyser.polarity_scores(a)['compound']
                scores.append(sentiment_score)
    return np.mean(scores)
```




## Analysis: 

ChatGPT is a large language model trained by OpenAI. It provides guidance on the process of debugging code and suggests potential sources of errors based on its knowledge of programming concepts and syntax. ChatGPT can suggest potential syntax errors, logical errors, or algorithmic errors that may be causing the problem, and can provide information on debugging techniques and tools that can be used to find and fix the error. However, the extent to which ChatGPT can detect a bug in the code depends on the complexity of the code, the nature of the error, and the accuracy and completeness of the information provided to it. Ultimately, the effectiveness of the debugging process relies on the skill and experience of the programmer, as well as the tools and resources available to them. 

In task 3, ChatGPT Failed to fill in missing lines of code that is needed to sort the colums of the dataframe needed for the sentiment analysis. The code missed is as follow:

```
df['scores'] = df['text'].apply(lambda text: analyser.polarity_scores(text))
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
```


## Part E


#### As an AI language model, ChatGPT is designed to generate responses to user queries, and provide information and suggestions based on a vast amount of data and training. In terms of its competence level, ChatGPT is significantly more advanced than a novice in the tasks it is designed to perform. However, it may not yet reach the level of expertise in certain areas that require specific domain knowledge.
####In comparison to a human expert, ChatGPT may lack the ability to understand and interpret complex social cues, nuances, and contextual information that require high-level cognitive skills, emotional intelligence, and experience. However, it can process and analyze vast amounts of data and provide insights and suggestions based on its training and previous interactions.
####To improve ChatGPT's performance in a real-world enterprise setting, there are several areas that need improvement. One of the main areas is to enhance its ability to understand and respond appropriately to the context of the conversation. This could be achieved by training the model with more contextual data and providing it with better natural language processing capabilities. Additionally, ChatGPT's responses could be fine-tuned by using real-time feedback from users to improve accuracy and relevance.
####Another area of improvement for ChatGPT is to expand its knowledge and expertise in specific domains. This could be achieved by training the model with more domain-specific data and incorporating subject matter experts' input to ensure accuracy and relevancy.
####Finally, to perform better in a real-world enterprise setting, ChatGPT needs to be designed to handle various data formats, such as images, audio, and video. This would enable the model to provide more comprehensive and relevant responses to users' queries.
####In conclusion, ChatGPT's level of competence is advanced, and it outperforms a novice in the tasks it is designed to perform. However, there is still room for improvement in certain areas to enhance its accuracy, relevancy, and ability to handle complex social cues and contextual information. Achieving these improvements will require ongoing training, fine-tuning, and integration of real-time feedback from users to continuously improve its performance in a real-world enterprise setting.