# Setting up the environment
Don't forget to set the runtime to GPU.
Mount your Google Drive. It'll be used to install the requirements and load the saved models.



In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; the cells below read the
# requirements file, the saved model and the data from there.
# force_remount=True asks Colab for a fresh mount.
drive.mount('/content/drive', force_remount=True)

Install requirements.

In [None]:
# Install the project's dependencies from the requirements file kept on the mounted Drive.
!pip install -r /content/drive/My\ Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/requirements.txt

In [None]:
%%writefile setup.sh
# install apex
# (the %%writefile cell magic has to be the very first line of the cell,
#  so this comment lives inside the generated script instead of above it)

export CUDA_HOME=/usr/local/cuda-10.1
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex



In [None]:
# Run the setup.sh script written by the %%writefile cell above.
!sh setup.sh

# Loading our model and making predictions

## Load your model


In [None]:
from simpletransformers.classification import ClassificationModel


# Directory of the saved best model on Drive. It is used both as the
# checkpoint to load and as the model's output directory.
best_model_dir = "/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/outputs2/best_model/"

model_args = {
    "output_dir": best_model_dir,
    "reprocess_input_data": True,
    "sliding_window": True,
    "max_seq_length": 512,
}

# DistilBERT classifier with three labels, run on the GPU. The cells below
# treat label 0 as negative, 1 as neutral and 2 as positive.
model2 = ClassificationModel(
    model_type="distilbert",
    model_name=best_model_dir,
    use_cuda=True,
    num_labels=3,
    args=model_args,
)

Read in the reviews and classify them. WARNING! Classification takes some time; you can have a coffee or two before continuing with this section.

In [None]:
# import random

# Read one review per line. Trailing newlines are stripped before splitting,
# otherwise a file that ends with "\n" would produce a final empty "review"
# that gets classified and counted like a real one.
with open("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/data/raw/reviews_without_ratings.txt", "r") as f:
    reviews = f.read().rstrip("\n").split("\n")

# Optional: uncomment (together with the import above) to work on a random
# subset of 100 reviews while experimenting.
# reviews = random.sample(reviews, 100)

In [None]:
# Run the classifier over every review (this is the slow step).
predictions = model2.predict(reviews)

# The first element of the result is kept as the predicted classes,
# the second as `predicted_probas`.
predicted_class = predictions[0]
predicted_probas = predictions[1]

In [None]:
# Save the predictions; otherwise every snippet above has to be re-run
# whenever you want to work with them.
import pickle

for target_path, payload in (
    ("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/predicted_class.pkl", predicted_class),
    ("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/predicted_probs.pkl", predicted_probas),
):
    with open(target_path, "wb") as outfile:
        pickle.dump(payload, outfile)


# Extracting keywords/key phrases

In [None]:
import nltk
# Fetch the NLTK resources used by the tokenizer and stop-word filter below.
for resource_name in ('stopwords', 'punkt'):
    nltk.download(resource_name)

In [None]:
# only if needed (e.g. after a reconnect): restore the saved class predictions
import pickle

# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: unpickling can execute arbitrary code; only load files you wrote yourself.
with open("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/predicted_class.pkl", "rb") as infile:
    predicted_class = pickle.load(infile)


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Stop words to drop during tokenization.
# NOTE(review): stopwords.words() is called without a language argument;
# presumably that pulls in the lists for every language NLTK ships, not just
# English. Confirm this is intended.
blacklist = set(stopwords.words())


def tokenize_review(review):
    """Return the lower-cased content words of a single review.

    The review is split into sentences and each sentence into word tokens.
    A token is kept only if it is purely alphabetic and its lower-cased form
    is not in `blacklist`.
    """
    return [
        token.lower()
        for sentence in sent_tokenize(review)
        for token in word_tokenize(sentence)
        if token.isalpha() and token.lower() not in blacklist
    ]


# Tokenize into a NEW name and leave `reviews` holding the raw strings.
# The concordance section further down joins `reviews` entries with "\n",
# which only works on strings, and not overwriting the input also makes this
# cell safe to re-run.
tokenized_reviews = [tokenize_review(review) for review in reviews]

# One list of "first_second" bigram strings per review, in review order.
bigrams = [
    [first + "_" + second for first, second in nltk.bigrams(tokens)]
    for tokens in tokenized_reviews
]

# Split the per-review bigram lists by predicted label:
# 2 = positive, 0 = negative, 1 = neutral.
positive_reviews = [grams for i, grams in enumerate(bigrams) if predicted_class[i] == 2]

negative_reviews = [grams for i, grams in enumerate(bigrams) if predicted_class[i] == 0]

neutral_reviews = [grams for i, grams in enumerate(bigrams) if predicted_class[i] == 1]




In [None]:
import altair as alt
import pandas as pd

# Number of reviews predicted for each sentiment class.
sentiment_counts = {
    "sentiments": ["negative", "neutral", "positive"],
    "counts": [
        len(group)
        for group in (negative_reviews, neutral_reviews, positive_reviews)
    ],
}
df = pd.DataFrame(sentiment_counts)

# Bar chart of the class sizes (left as the last expression so it renders).
alt.Chart(df).mark_bar().encode(x="sentiments", y="counts")


In [None]:
# Persist the per-class review bigrams so the cells above need not be re-run.
import pickle

for target_path, review_bigrams in (
    ("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/positive_reviews.pkl", positive_reviews),
    ("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/negative_reviews.pkl", negative_reviews),
    ("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/neutral_reviews.pkl", neutral_reviews),
):
    with open(target_path, "wb") as outfile:
        pickle.dump(review_bigrams, outfile)

Now, we can extract the key bigrams.

In [None]:
from keyness import log_likelihood

# For each sentiment class, call log_likelihood with all bigrams and that
# class's bigrams, and keep the first 150 entries of the result.
positive_keys, negative_keys, neutral_keys = (
    log_likelihood(bigrams, subcorpus)[:150]
    for subcorpus in (positive_reviews, negative_reviews, neutral_reviews)
)

# Serialize the keywords, so you don't have to recompute them later.
for target_path, keys in (
    ("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/positive_keys.pkl", positive_keys),
    ("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/negative_keys.pkl", negative_keys),
    ("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/neutral_keys.pkl", neutral_keys),
):
    with open(target_path, "wb") as outfile:
        pickle.dump(keys, outfile)



The three lists contain tuples of bigram, log likelihood, overall frequency, and frequency in the subcorpus.

# Interpret your results
## Keyness vs frequency

In [None]:
# only needed if we have to reconnect
import pickle

# Each file is opened in a `with` block so its handle is closed after loading.
# NOTE: unpickling can execute arbitrary code; only load files you wrote yourself.
with open("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/positive_keys.pkl", "rb") as infile:
    positive_keys = pickle.load(infile)

with open("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/negative_keys.pkl", "rb") as infile:
    negative_keys = pickle.load(infile)

with open("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/neutral_keys.pkl", "rb") as infile:
    neutral_keys = pickle.load(infile)


In [None]:
def _write_keys_tsv(path, keys):
    """Write key bigrams to `path` as a tab-separated file with a header row.

    Each entry of `keys` is read positionally: entry[0] goes into the
    `bigram` column as-is, and entry[1], entry[2], entry[3] are converted
    with str() into `loglikelihood`, `corpus_freq` and `reference_freq`.
    """
    # Explicit encoding so the output does not depend on the platform default.
    with open(path, "w", encoding="utf-8") as outfile:
        outfile.write("bigram\tloglikelihood\tcorpus_freq\treference_freq\n")
        for entry in keys:
            row = [entry[0], str(entry[1]), str(entry[2]), str(entry[3])]
            outfile.write("\t".join(row) + "\n")


# One TSV per sentiment class, all with the same layout.
for tsv_path, keys in (
    ("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/data/processed/positive_keys.tsv", positive_keys),
    ("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/data/processed/negative_keys.tsv", negative_keys),
    ("/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/data/processed/neutral_keys.tsv", neutral_keys),
):
    _write_keys_tsv(tsv_path, keys)


In [None]:
import pandas as pd

# Spread the positive key tuples over named columns: position 0 -> bigrams,
# 1 -> keyness, 2 -> corpus_freq, 3 -> reference_freq.
positive_df = pd.DataFrame({
    column: [entry[position] for entry in positive_keys]
    for position, column in enumerate(
        ("bigrams", "keyness", "corpus_freq", "reference_freq")
    )
})


In [None]:
import altair as alt

# Scatter plot of keyness (x) against reference frequency (y) for the positive keys.
# NOTE(review): this chart colours points by reference_freq, while the
# negative and neutral charts colour by keyness. Confirm which is intended.
positive_points = alt.Chart(positive_df).mark_point()

positive_points.encode(
    x='keyness:Q',
    y='reference_freq:Q',
    color='reference_freq:Q',
    tooltip=["bigrams:N", "reference_freq:Q"],
).interactive()

In [None]:
# Spread the negative key tuples over named columns: position 0 -> bigrams,
# 1 -> keyness, 2 -> corpus_freq, 3 -> reference_freq.
negative_df = pd.DataFrame({"bigrams": [e[0] for e in negative_keys],
                     "keyness": [e[1] for e in negative_keys],
                     "corpus_freq": [e[2] for e in negative_keys],
                     "reference_freq": [e[3] for e in negative_keys]})

# Scatter plot of keyness (x) against reference frequency (y).
# The tooltip must name the "bigrams" column: the frame has no "words"
# column, so the previous "words:N" tooltip could not show the bigram.
alt.Chart(negative_df).mark_point().encode(
    x='keyness:Q',
    y='reference_freq:Q',
    color='keyness:Q',
    tooltip=["bigrams:N", "reference_freq:Q"],
).interactive()

In [None]:
# Spread the neutral key tuples over named columns: position 0 -> bigrams,
# 1 -> keyness, 2 -> corpus_freq, 3 -> reference_freq.
neutral_df = pd.DataFrame({"bigrams": [e[0] for e in neutral_keys],
                     "keyness": [e[1] for e in neutral_keys],
                     "corpus_freq": [e[2] for e in neutral_keys],
                     "reference_freq": [e[3] for e in neutral_keys]})

# Scatter plot of keyness (x) against reference frequency (y).
# The tooltip must name the "bigrams" column: the frame has no "words"
# column, so the previous "words:N" tooltip could not show the bigram.
alt.Chart(neutral_df).mark_point().encode(
    x='keyness:Q',
    y='reference_freq:Q',
    color='keyness:Q',
    tooltip=["bigrams:N", "reference_freq:Q"],
).interactive()

## See the context
First, we have to make nltk corpora from the subcorpora.

In [None]:
import nltk

# Gather the reviews of each predicted class (2 = positive, 0 = negative,
# 1 = neutral) and join them into one newline-separated string per class.
positive_texts = "\n".join(
    [review for idx, review in enumerate(reviews) if predicted_class[idx] == 2]
)
negative_texts = "\n".join(
    [review for idx, review in enumerate(reviews) if predicted_class[idx] == 0]
)
neutral_texts = "\n".join(
    [review for idx, review in enumerate(reviews) if predicted_class[idx] == 1]
)

def make_text_corpus(text):
    """Tokenize `text` with nltk.word_tokenize and wrap the tokens in an nltk.Text."""
    return nltk.Text(nltk.word_tokenize(text))


# One nltk.Text per sentiment class, built from the joined review strings.
positive_text, negative_text, neutral_text = (
    make_text_corpus(joined_reviews)
    for joined_reviews in (positive_texts, negative_texts, neutral_texts)
)

Now, we can check the context of words.

In [None]:
# Words whose contexts we want to inspect in the positive reviews.
# Each word is listed once: the earlier list repeated "great" and "best",
# which printed their concordances twice.
words_to_check = [
    "word",
    "great",
    "game",
    "love",
    "works",
    "highly",
    "recommend",
    "best",
    "product",
    "awesome",
    "ever",
    "excellent",
]

# Print the concordance lines of every word within the positive sub-corpus.
for wd in words_to_check:
    positive_text.concordance(wd)
