In [1]:
# Sets GOOGLE_APPLICATION_CREDENTIALS for this kernel session.
# NOTE: the absolute path below is machine-specific. Create a service account in GCP,
# download its JSON key, and replace the path with the location of your own key file
# before running the rest of the notebook.
%env GOOGLE_APPLICATION_CREDENTIALS=/Users/lucas/Documents/GCP-service-account/cmu-piazza-nlp.json

env: GOOGLE_APPLICATION_CREDENTIALS=/Users/lucas/Documents/GCP-service-account/cmu-piazza-nlp.json


In [2]:
import os
import functools
from html.parser import HTMLParser
from collections import Counter

# map: ta_id => filenames
def get_ta_to_filename_dict():
    """Group the files found in the ``TA_posts`` directory by TA id.

    The TA id is the second ``_``-separated field of the file name, e.g. ``'10'``
    in ``USERID_10_Username_week_0_to_3_posts.txt``.

    Returns:
        dict: maps each TA id (str) to the list of file names carrying that id,
        in the order ``os.listdir`` produced them.

    Raises:
        OSError: if the ``TA_posts`` directory cannot be listed.
    """
    ta_to_filename = {}
    for filename in os.listdir('TA_posts'):
        parts_of_filename = filename.split('_')
        if len(parts_of_filename) < 2:
            # The name has no id field (a stray file without any '_'):
            # skip it instead of failing with IndexError.
            continue
        # setdefault creates the list on first sight of an id, then appends either way.
        ta_to_filename.setdefault(parts_of_filename[1], []).append(filename)
    return ta_to_filename

# comparator for filenames
def filename_compare(a, b):
    """Three-way comparator that orders post files by their starting week.

    The starting week is the fifth ``_``-separated field of each file name,
    parsed as an integer.

    Returns:
        int: negative when ``a`` starts earlier than ``b``, zero when both start
        in the same week, positive otherwise.
    """
    week_a, week_b = (int(name.split('_')[4]) for name in (a, b))
    return week_a - week_b

# get ta_ids
def get_ta_ids(ta_to_filename):
    """Return the keys (TA ids) of ``ta_to_filename`` as a new list."""
    return [ta_id for ta_id in ta_to_filename.keys()]

# sort the filenames
def get_sorted_filenames(ta_id, filenames_by_ta=None):
    """Return the file names of one TA ordered by starting week.

    Args:
        ta_id: key of the TA in the mapping.
        filenames_by_ta: optional mapping of TA id -> list of file names. When
            omitted, the notebook-level ``ta_to_filename`` variable is used, which
            keeps existing one-argument calls working.

    Returns:
        list: a new list sorted numerically by the fifth ``_``-separated field of
        each name (the starting week) - the same ordering ``filename_compare``
        defines. The input list is not modified.

    Raises:
        KeyError: if ``ta_id`` is not in the mapping.
    """
    if filenames_by_ta is None:
        filenames_by_ta = ta_to_filename
    # A numeric key gives the same stable order as the a - b comparator, and avoids
    # the lexical trap where week "11" would sort before week "3".
    return sorted(filenames_by_ta[ta_id], key=lambda name: int(name.split('_')[4]))

# post data sanitizer
class MLStripper(HTMLParser):
    """HTML parser that ignores markup and collects only the text content."""

    def __init__(self):
        # Let the base class set up its own parser state (its __init__ calls reset())
        # rather than relying on which attributes reset() happens to initialise.
        # convert_charrefs=True delivers character references such as &amp; to
        # handle_data already decoded. The old `strict` flag is not set any more:
        # current HTMLParser versions do not take it.
        super().__init__(convert_charrefs=True)
        self.fed = []  # text fragments, in document order

    def handle_data(self, d):
        """Record one run of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far, joined into a single string."""
        return ''.join(self.fed)

# remove html tags
def strip_tags(html):
    """Return the text content of ``html`` with all tags removed.

    Args:
        html: string that may contain HTML markup.

    Returns:
        str: the concatenated text pieces collected by ``MLStripper``.
    """
    s = MLStripper()
    s.feed(html)
    # close() forces the parser to process whatever it is still buffering. Without
    # it, trailing text that contains an unterminated '&' (e.g. input ending in
    # "AT&T") is held back waiting for more data and never reaches get_data().
    s.close()
    return s.get_data()

# read text, clean the separators and sanitize text
def read_ta_posts(filename):
    """Read one file from ``TA_posts`` and return its posts as sanitized lines.

    Lines consisting only of the ``----`` separator are dropped, HTML tags are
    removed from every remaining line with ``strip_tags``, and lines that are
    empty after sanitizing are dropped as well.

    Args:
        filename: name of a file inside the ``TA_posts`` directory.

    Returns:
        list: the sanitized, non-empty lines (each keeps its trailing newline
        when the file had one).

    Raises:
        OSError: if the file cannot be opened.
    """
    sanitized_non_empty_lines = []
    # Explicit encoding so the result does not depend on the platform default.
    with open(os.path.join('TA_posts', filename), encoding='utf-8') as f:
        for raw_line in f:
            # Compare without the newline so a separator on the last line of a file
            # lacking a final newline is recognised too.
            if raw_line.rstrip('\n') == '----':
                continue
            text = strip_tags(raw_line)
            # Covers both '\n' and '' (markup-only last line without a newline).
            if text.rstrip('\n') == '':
                continue
            sanitized_non_empty_lines.append(text)
    return sanitized_non_empty_lines

In [3]:
# Build the mapping TA id -> file names sorted by starting week.
ta_to_filename = get_ta_to_filename_dict()
ta_ids = get_ta_ids(ta_to_filename)
# The loop variable is called ta_id (not id) so the builtin id() is not shadowed
# for the rest of the kernel session.
id_to_filenames = {ta_id: get_sorted_filenames(ta_id) for ta_id in ta_ids}
id_to_filenames

{'10': ['USERID_10_Username_week_0_to_3_posts.txt',
  'USERID_10_Username_week_3_to_11_posts.txt',
  'USERID_10_Username_week_11_to_15_posts.txt',
  'USERID_10_Username_week_15_to_18_posts.txt'],
 '12': ['USERID_12_Username_week_0_to_3_posts.txt',
  'USERID_12_Username_week_3_to_18_posts.txt',
  'USERID_12_Username_week_12_to_18_posts.txt'],
 '13': ['USERID_13_Username_week_0_to_5_posts.txt',
  'USERID_13_Username_week_5_to_8_posts.txt',
  'USERID_13_Username_week_8_to_10_posts.txt',
  'USERID_13_Username_week_10_to_18_posts.txt'],
 '16': ['USERID_16_Username_week_0_to_4_posts.txt',
  'USERID_16_Username_week_4_to_9_posts.txt',
  'USERID_16_Username_week_9_to_12_posts.txt',
  'USERID_16_Username_week_12_to_18_posts.txt'],
 '2': ['USERID_2_Username_week_0_to_4_posts.txt',
  'USERID_2_Username_week_4_to_8_posts.txt',
  'USERID_2_Username_week_8_to_18_posts.txt'],
 '4': ['USERID_4_Username_week_0_to_3_posts.txt',
  'USERID_4_Username_week_3_to_6_posts.txt',
  'USERID_4_Username_week_6_to_

In [4]:
# Read every file of every TA: id_to_posts[ta_id][k] is the list of sanitized lines
# of that TA's k-th file (files are in the order stored in id_to_filenames).
# The previous loop only created the empty list when it first met a TA and did not
# read that file, so the first file of every TA was silently missing.
id_to_posts = {
    ta_id: [read_ta_posts(filename) for filename in filenames]
    for ta_id, filenames in id_to_filenames.items()
}
            
def get_classification_result(user_posts, min_confidence=0.6, classifier=None):
    """Classify every post of one user and keep the confident results.

    Args:
        user_posts: iterable of periods, each an iterable of post strings.
        min_confidence: a classification is kept only when its confidence is
            strictly greater than this value (default 0.6).
        classifier: callable taking a post and returning ``[name, confidence]``.
            Defaults to the notebook's ``classify_text``.

    Returns:
        tuple: ``(user_classifications, invalid_post_count, count)`` where
        ``user_classifications`` is a list of ``[name, confidence, post]`` rows,
        ``invalid_post_count`` is the number of posts whose classification raised
        an error, and ``count`` is the total number of posts seen.
    """
    if classifier is None:
        classifier = classify_text

    count = 0
    invalid_post_count = 0
    user_classifications = []
    for period_posts in user_posts:
        for post in period_posts:
            count += 1
            try:
                result = classifier(post)
                is_confident = result[1] > min_confidence
            except Exception:
                # Best effort: a post the classifier rejects is counted, not fatal.
                # Exception (rather than a bare except) lets KeyboardInterrupt and
                # SystemExit stop a long run.
                invalid_post_count += 1
                continue
            if is_confident:
                # Build a new row instead of mutating the classifier's return value.
                user_classifications.append(list(result) + [post])
    return user_classifications, invalid_post_count, count

In [7]:
import csv

# Write one CSV report per TA: three header rows, then one row per kept classification.
# The loop variable is ta_id (not id) so the builtin id() is not shadowed.
for ta_id in id_to_posts:
    user_classifications, invalid_count, count = get_classification_result(id_to_posts[ta_id])

    # newline='' is what the csv module asks for on files passed to csv.writer
    # (otherwise extra blank rows appear on platforms that translate '\n');
    # the explicit encoding keeps non-ASCII post text independent of the locale.
    with open('ta_' + ta_id + '_post_classification.csv', mode='w',
              newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(['TA id:', ta_id])
        csv_writer.writerow(['Invalid posts count:', invalid_count])
        csv_writer.writerow(['Total posts count:', count])
        csv_writer.writerows(user_classifications)

In [5]:
# 1. sentiment analysis
# get average sentiment score and magnitude
def get_average_sentiment_result(lines):
    """Average the sentiment score and magnitude over a list of text lines.

    Each line is analysed separately with ``sentiment_text``.

    Args:
        lines: sequence of strings.

    Returns:
        tuple: ``(mean_score, mean_magnitude)``. For an empty sequence the result
        is ``(0.0, 0.0)`` instead of a ZeroDivisionError.
    """
    if not lines:
        # Nothing to average.
        return 0.0, 0.0

    score = 0
    magnitude = 0
    for line in lines:
        tmp_score, tmp_magnitude = sentiment_text(line)
        score += tmp_score
        magnitude += tmp_magnitude
    return score / len(lines), magnitude / len(lines)

In [8]:
# Average sentiment over the sample posts.
# NOTE(review): user_10_lines_period_1 is not assigned in any cell visible in this
# notebook - presumably one entry of id_to_posts['10']; confirm and define it
# explicitly so the notebook survives Restart & Run All.
score, magnitude = get_average_sentiment_result(user_10_lines_period_1)
print('Score: {}'.format(score))
print('Magnitude: {}'.format(magnitude))

Score: -0.10769231101641288
Magnitude: 0.569230782871063


In [9]:
# 2. syntax analysis
# Get the part-of-speech tag names and the token texts (two parallel lists)
# for the first post in user_10_lines_period_1.
part_of_speech_tag_names, token_contents = syntax_text(user_10_lines_period_1[0])

In [10]:
# Show one example pair: the first tag name and the token text it belongs to.
print('Part of speech tag: {} \nToken content: {}\n'.format(part_of_speech_tag_names[0], token_contents[0]))

# Count how often each tag name and each token text occurs in the first post.
print(Counter(part_of_speech_tag_names))
print(Counter(token_contents))

Part of speech tag: ADP 
Token content: In

Counter({'VERB': 12, 'NOUN': 11, 'PUNCT': 8, 'PRON': 7, 'ADP': 5, 'DET': 5, 'ADV': 1, 'CONJ': 1, 'ADJ': 1})
Counter({'it': 3, ',': 2, 'that': 2, 'a': 2, 'security': 2, 'group': 2, 'name': 2, '"': 2, '.': 2, 'you': 2, 'the': 2, 'In': 1, 'your': 1, 'screenshot': 1, "'s": 1, 'complaining': 1, 'could': 1, "n't": 1, 'find': 1, 'with': 1, 'SSH_HTTP_MYSQL': 1, 'You': 1, 'should': 1, 'use': 1, 'defined': 1, 'or': 1, 'remove': 1, 'whole': 1, 'parameter': 1, '(': 1, 'so': 1, 'will': 1, 'be': 1, 'using': 1, 'default': 1, 'values': 1, 'like': 1, 'said': 1, ')': 1})


In [11]:
# 3. content classification analysis
# using user 10's first post in period 1
classify_text(user_10_lines_period_1[0])

[name: "/Computers & Electronics"
confidence: 0.8500000238418579
]
name            : /Computers & Electronics
confidence      : 0.8500000238418579


In [12]:
# 4. entity and entity sentiment analysis
# using user 10's first post in period 1; both helpers print their findings
entities_text(user_10_lines_period_1[0])
entity_sentiment_text(user_10_lines_period_1[0])

name            : security group name
type            : OTHER
metadata        : {}
salience        : 0.4291812777519226
wikipedia_url   : -
name            : screenshot
type            : OTHER
metadata        : {}
salience        : 0.26048579812049866
wikipedia_url   : -
name            : SSH_HTTP_MYSQL
type            : OTHER
metadata        : {'mid': '/m/0749d', 'wikipedia_url': 'https://en.wikipedia.org/wiki/Secure_Shell'}
salience        : 0.15657617151737213
wikipedia_url   : https://en.wikipedia.org/wiki/Secure_Shell
name            : security group
type            : ORGANIZATION
metadata        : {}
salience        : 0.1252780556678772
wikipedia_url   : -
name            : values
type            : OTHER
metadata        : {}
salience        : 0.017576366662979126
wikipedia_url   : -
name            : parameter
type            : OTHER
metadata        : {}
salience        : 0.01090233400464058
wikipedia_url   : -
Mentions: 
Name: "security group name"
  Begin Offset : 121
  Content

In [6]:
import argparse
import sys
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
import six

# [START language_sentiment_text]
def sentiment_text(text):
    """Analyse the overall sentiment of ``text``.

    ``text`` may be a ``str`` or a byte string; byte strings are decoded as UTF-8.

    Returns:
        tuple: ``(score, magnitude)`` of the document-level sentiment.
    """
    client = language.LanguageServiceClient()

    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text

    # [START language_python_migration_sentiment_text]
    # The text is submitted as a plain-text document
    # (enums.Document.Type.HTML is the alternative for HTML input).
    plain_text_document = types.Document(
        type=enums.Document.Type.PLAIN_TEXT,
        content=text)

    response = client.analyze_sentiment(plain_text_document)
    overall = response.document_sentiment
    return overall.score, overall.magnitude
    # [END language_python_migration_sentiment_text]
# [END language_sentiment_text]


# [START language_entities_text]
def entities_text(text):
    """Detect the entities in ``text`` and print a short report for each.

    ``text`` may be a ``str`` or a byte string; byte strings are decoded as UTF-8.
    For every entity a separator line is printed, followed by its name, type name,
    metadata, salience and Wikipedia URL ('-' when the metadata has none).
    Nothing is returned.
    """
    client = language.LanguageServiceClient()

    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text

    # [START language_python_migration_entities_text]
    # [START language_python_migration_document_text]
    # The text is submitted as a plain-text document
    # (enums.Document.Type.HTML is the alternative for HTML input).
    document = types.Document(
        type=enums.Document.Type.PLAIN_TEXT,
        content=text)
    # [END language_python_migration_document_text]

    response = client.analyze_entities(document)

    for entity in response.entities:
        type_name = enums.Entity.Type(entity.type).name
        report = (
            ('name', entity.name),
            ('type', type_name),
            ('metadata', entity.metadata),
            ('salience', entity.salience),
            ('wikipedia_url', entity.metadata.get('wikipedia_url', '-')),
        )
        print('=' * 20)
        for label, value in report:
            print(u'{:<16}: {}'.format(label, value))
    # [END language_python_migration_entities_text]
# [END language_entities_text]

# [START language_syntax_text]
def syntax_text(text):
    """Run syntax analysis on ``text``.

    ``text`` may be a ``str`` or a byte string; byte strings are decoded as UTF-8.

    Returns:
        tuple: ``(part_of_speech_tag_names, token_contents)`` - two lists of equal
        length holding, for each token in order, the name of its part-of-speech
        tag and its text.
    """
    client = language.LanguageServiceClient()

    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text

    # [START language_python_migration_syntax_text]
    # The text is submitted as a plain-text document
    # (enums.Document.Type.HTML is the alternative for HTML input).
    document = types.Document(
        type=enums.Document.Type.PLAIN_TEXT,
        content=text)

    tokens = client.analyze_syntax(document).tokens

    part_of_speech_tag_names = [
        enums.PartOfSpeech.Tag(token.part_of_speech.tag).name for token in tokens
    ]
    token_contents = [token.text.content for token in tokens]
    return part_of_speech_tag_names, token_contents
    # [END language_python_migration_syntax_text]
# [END language_syntax_text]


# [START language_entity_sentiment_text]
def entity_sentiment_text(text):
    """Detect entity-level sentiment in ``text`` and print it.

    ``text`` may be a ``str`` or a byte string; byte strings are decoded as UTF-8.
    For every entity the name is printed, then offset, content, magnitude,
    sentiment score and type of each mention, then the entity's salience and
    sentiment. Nothing is returned.
    """
    client = language.LanguageServiceClient()

    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text

    document = types.Document(
        type=enums.Document.Type.PLAIN_TEXT,
        content=text.encode('utf-8'))

    # Request offsets in the interpreter's native string encoding so that the
    # reported begin offsets line up with Python string indices.
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16
    else:
        encoding = enums.EncodingType.UTF32

    result = client.analyze_entity_sentiment(document, encoding)

    for entity in result.entities:
        print('Mentions: ')
        print(u'Name: "{}"'.format(entity.name))
        for mention in entity.mentions:
            span = mention.text
            feeling = mention.sentiment
            print(u'  Begin Offset : {}'.format(span.begin_offset))
            print(u'  Content : {}'.format(span.content))
            print(u'  Magnitude : {}'.format(feeling.magnitude))
            print(u'  Sentiment : {}'.format(feeling.score))
            print(u'  Type : {}'.format(mention.type))
        print(u'Salience: {}'.format(entity.salience))
        print(u'Sentiment: {}\n'.format(entity.sentiment))
# [END language_entity_sentiment_text]


# [START language_classify_text]
def classify_text(text):
    """Classify ``text`` and return its highest-confidence content category.

    ``text`` may be a ``str`` or a byte string; byte strings are decoded as UTF-8.

    Returns:
        list: ``[name, confidence]`` of the category with the greatest confidence,
        or ``[None, 0.0]`` when the service returns no category at all.

    Errors raised by the client call are not caught here.
    """
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(
        content=text.encode('utf-8'),
        type=enums.Document.Type.PLAIN_TEXT)

    categories = client.classify_text(document).categories

    # The previous version returned `category.name` although no `category` variable
    # was bound any more (the loop that defined it was commented out), so every
    # call ended in NameError.
    if not categories:
        return [None, 0.0]
    best = max(categories, key=lambda category: category.confidence)
    return [best.name, best.confidence]
# [END language_classify_text]

In [14]:
# Display the list of posts that the analysis cells use as their sample input.
user_10_lines_period_1

['In your screenshot, it\'s complaining that it couldn\'t find a security group with name "SSH_HTTP_MYSQL". You should use a security group name that you defined, or remove the whole parameter (so it will be using the default\xa0values like you said).\n',
 'Are you saying the\xa0"--security-groups" parameter for the run-instance command? If so, you may have to read the manual again since what you provided is\xa0not in a correct format.\xa0https://docs.aws.amazon.com/cli/latest/reference/ec2/run-instances.html\n',
 "No, it doesn't, you can install sysbench by running `apt-get update` then `apt-get install sysbench`. For higher version ones, please find its corresponding parameter for init-rng.\n",
 'Did you pick the basic Ubuntu AMI? It seems that the version of your sysbench is higher than the default one installed by apt-get.\n',
 'If you start\xa0a new instance (ami-41e0b93b) with the 2 lines of command you listed above, it should work.\xa0Are you saying that you are still getting "T

In [15]:
# reference: https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings 

warnings.filterwarnings(action = 'ignore') 

import gensim 
from gensim.models import Word2Vec 


# Fixed seed and a single worker thread so repeated runs train the same vectors.
# NOTE(review): gensim also documents PYTHONHASHSEED as a requirement for fully
# deterministic runs on Python 3 - confirm if exact reproducibility matters.
W2V_SEED = 42

# One list of lower-cased word tokens per sentence, over all sample posts.
data = [
    [word.lower() for word in word_tokenize(sentence)]
    for line in user_10_lines_period_1
    for sentence in sent_tokenize(line)
]

# Create CBOW model
model1 = gensim.models.Word2Vec(data, min_count = 1, size = 100, window = 5,
                                seed = W2V_SEED, workers = 1)
# Query the vectors through model.wv: Word2Vec.similarity itself is deprecated and
# removed in gensim 4, a warning the blanket filterwarnings('ignore') above hides.
print("Model 1, similarity between 'that' and 'it': " + str(model1.wv.similarity('that', 'it')))

# Create Skip Gram model
model2 = gensim.models.Word2Vec(data, min_count = 1, size = 100, window = 5, sg = 1,
                                seed = W2V_SEED, workers = 1)
print("Model 2, similarity between 'that' and 'it': " + str(model2.wv.similarity('that', 'it')))



Model 1, similarity between 'that' and 'it': -0.03197614
Model 2, similarity between 'that' and 'it': -0.0058905706
