## Import packages

In [1]:
import re
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

## Load file

In [2]:
my_file = json.loads(open(r"comments_with_emoji_toarray.json", "r").read())
df = json_normalize(my_file)
print(df.head())
print(df.shape)

                                payload.comment.body
0  ## Step 2: Turn on GitHub Pages\n\nNicely done...
1  # [Codecov](https://codecov.io/gh/gitcoinco/we...
2  # [Codecov](https://codecov.io/gh/gitcoinco/we...
3  ## Nice work\n\n![celebrate](https://octodex.g...
4  # [Codecov](https://codecov.io/gh/poliastro/po...
(1104, 1)


## Extract comments into a list

In [3]:
list_of_comments = []
row_count = df.shape[0]
for i in range(row_count):
    list_of_comments.append(df['payload.comment.body'][i])
print(list_of_comments[0])

## Step 2: Turn on GitHub Pages

Nicely done @sromanowski12! At least one task in your Markdown to-do list is correct. :sparkles:

Now let's turn our focus to the content of your portfolio. Because this portfolio will be displayed as a static web page, you'll need to enable GitHub Pages.

### :keyboard: Activity: Enable GitHub Pages
1. Under your repository name, click [**Settings**](https://github.com/sromanowski12/markdown-portfolio/settings).
1. In the **GitHub Pages** section, use the **Select source** drop-down menu to select `master` as your GitHub Pages publishing source.
1. Click **Save**.

> _I may take up to a minute to respond as I wait for GitHub Pages to create a deployment of your repository_.

> _For more information, see [Configuring a publishing source for GitHub Pages](https://help.github.com/articles/configuring-a-publishing-source-for-github-pages/) in the GitHub Help._

<hr>
<h3 align="center">Return to this issue for next steps</h3>



## Extract emoji from each comment

In [4]:
# Find emoji
list_of_emoji = []
set_of_emoji = set()
for i in range(len(list_of_comments)):
    emoji = re.findall(":100:|:[^:\"|\s*|\[A-Z\]][a-z_]+[0-9]*:|:[+-]1:|:[a-z]+[0-9]*:", list_of_comments[i])
    # print(emoji)
    set_of_emoji.update(emoji)
    list_of_emoji.append(emoji)
    
print("__Set__")
print(set_of_emoji)
print("__List__")
print(list_of_emoji[:20])

__Set__
__List__
[[':sparkles:', ':keyboard:'], [':arrow_up:', ':arrow_down:', ':arrow_up:'], [':arrow_up:', ':arrow_down:', ':arrow_up:'], [':tada:'], [':arrow_up:', ':arrow_up:', ':arrow_down:'], [':keyboard:', ':sparkles:'], [':muscle:', ':keyboard:'], [':+1:', ':+1:'], [':tada:', ':sunglasses:', ':keyboard:'], [':phone:', ':no_entry:'], [':muscle:', ':keyboard:'], [':arrow_up:', ':arrow_up:', ':arrow_up:', ':arrow_up:'], [':tada:', ':sunglasses:', ':keyboard:'], [':muscle:', ':keyboard:'], [':keyboard:'], [':grin:'], [':keyboard:'], [':smiley:'], [':tada:', ':sunglasses:', ':keyboard:'], [':book:', ':keyboard:']]


## Clean up comments

In [5]:
# Remove emoji, URLs, commit SHAs, punctuation, extra whitespaces, convert to lowercase

processed = []
for i in range(len(list_of_comments)):
    comment_i = list_of_comments[i]
    # Emoji
    comment_i = re.sub(":100:|:[^:\"|\s*|\[A-Z\]][a-z_]+[0-9]*:|:[+-]1:|:[a-z]+[0-9]*:", "metaemoji", comment_i)
    # URLs
    comment_i = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "metaurl", comment_i)
    # Commit SHAs
    comment_i = re.sub("[a-zA-Z0-9]{40}", "metacommitsha", comment_i)
    # Punctuation
    comment_i = re.sub("[^\w\d\s]", " ", comment_i)
    # To lowercase
    comment_i = comment_i.lower()
    # Replace whitespace between terms with a single space
    comment_i = re.sub("\s+", " ", comment_i)
    # Remove leading and trailing whitespace
    comment_i = re.sub("^\s+|\s+?$", "", comment_i)
    processed.append(comment_i)

# print(processed)

In [6]:
# Tokenize, remove stop words, lemmatize

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
clean_comments = []

for i in range(len(processed)):
    tokenized_comment = word_tokenize(processed[i])
    clean = []
    for w in tokenized_comment:
        if w not in stop_words:
            lemmatizer.lemmatize(w)
            clean.append(w)
    clean = " ".join(clean)
    clean_comments.append(clean)

# print(clean_comments)

## Define postive, negative, and neutral emoji

In [7]:
# Positive emoji
pos_emoji = [":+1:", ":smile:", ":rocket:", ":confetti_ball:", ":laughing:", ":white_check_mark:",
             ":wink:", ":grinning:", ":wave:", ":exclamation:", ":fireworks:", ":ok_hand:", ":joy:",
             ":heart_eyes:", ":heart:", ":boom:", ":slightly_smiling_face:", ":sunglasses:", ":tada:",
             ":sparkles:", ":clap:", ":trollface:", ":stuck_out_tongue:", ":heavy_check_mark:",
             ":star:", ":grin:", ":blush:", ":see_no_evil:", ":sunny:", ":thumbsup:", ":cocktail:",
             ":yum:", ":relaxed:", ":champagne:", ":moneybag:", ":smile_cat:", ":green_heart:",
             ":sweat_smile:", ":smiling_imp:", ":crossed_fingers:", ":christmas_tree:", ":santa:",
             ":gift:", ":bulb:", ":smiley:", ":innocent:", ":vulkan:", ":trumpet:", ":upside_down_face:",
             ":bowing_man:"]

# Negative emoji
neg_emoji = [":cry:", "-1", ":fearful:", ":cold_sweat:", ":unamused:", ":confused:", ":angry:", ":rage:",
             ":shit:", ":sheepishly:", ":x:", ":no_entry:", ":no_entry_sign:", ":sob:", ":warning:",
             ":rotating_light:", ":bug:", ":man_facepalming:", ":woman_facepalming:", ":disappointed:",
             ":frowning_face:", ":roll_eyes:", ":worried:", ":panicking:", ":panic:", ":imp:"]

# Neutral emoji
neut_emoji = [":book:", ":muscle:", ":memo:", ":keyboard:", ":arrow_up:", ":arrow_down:", ":end:",
              ":robot:", ":wave:", ":abcd:", ":octocat:", ":recyccle:", ":neutral_face:", ":tv:",
              ":start:", ":pushpin:", ":hourglass:", ":tomato:", ":package:", ":phone:", ":pencil2:",
              ":scream_cat:", ":pray:", ":thinking:", ":bee:", ":point_right:", ":eyes:", ":bowtie:",
              ":wrench:", ":point_down:", ":pencil:", ":page_facing_up:", ":point_left:", ":green_apple:",
              ":clock10:", ":scroll:", ":hourglass:", ":umbrella:", ":email:", ":date:", ":bigthink:",
              ":construction:", ":construction_worker:", ":marker:", ":page_facing_up:", ":clock1:",
              ":man_shrugging:", ":new:", ":cat2:"]

## Calculate score of each comment based on emoji

In [8]:
emoji_classified = []
for i in range(len(list_of_emoji)):
    emoji_val = 0
    for j in range(len(list_of_emoji[i])):
        if list_of_emoji[i][j] in pos_emoji:
            emoji_val += 1
        elif list_of_emoji[i][j] in neg_emoji:
            emoji_val -= 1
        elif list_of_emoji[i][j] in neut_emoji:
            emoji_val += 0
        else:
            emoji_val += 0
    emoji_classified.append(emoji_val)

## Create dataframe with cleaned comments and emoji score

In [9]:
emoji_data = {"comment": clean_comments, "emoji_score": emoji_classified}
final_df = pd.DataFrame(data=emoji_data)
final_df[:20]

Unnamed: 0,comment,emoji_score
0,step 2 turn github pages nicely done sromanows...,1
1,codecov metaurl report merging 3362 metaurl ma...,0
2,codecov metaurl report merging 3362 metaurl ma...,0
3,nice work celebrate metaurl congratulations ne...,1
4,codecov metaurl report merging 523 metaurl mas...,0
5,step 9 use emphasis great job lists let try so...,1
6,step 1 assign unassigned issues owners look as...,0
7,githubtesting123 everything lgtm thank helping...,2
8,step 2 turn github pages metaemoji proud manag...,2
9,ok yeah dramatic change poly count however see...,-1


In [10]:
classes = final_df['emoji_score']
print(classes.value_counts())

 0     581
 1     351
 3      79
 2      56
-1      25
-2       3
-3       1
-16      1
 31      1
 21      1
 17      1
 15      1
 12      1
 10      1
 7       1
Name: emoji_score, dtype: int64


## Define features

In [11]:
# Vectorization
tf = TfidfVectorizer()
text_tf = tf.fit_transform(final_df['comment'])
type(text_tf)

scipy.sparse.csr.csr_matrix

In [12]:
# Define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(list_of_comments)

## Apply learning algorithm

In [13]:
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(text_tf,
                                                    final_df['emoji_score'],
                                                    test_size=0.25,
                                                    random_state=seed)

In [14]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(828, 14126)
(276, 14126)
(828,)
(276,)


In [15]:
# Model Generation Using Multinomial Naive Bayes
# Train the model with train x and train y
clf = MultinomialNB().fit(X_train, y_train)
# Predict y using test x
y_predicted = clf.predict(X_test)
print("MultinomialNB Accuracy:", metrics.accuracy_score(y_test, y_predicted))

MultinomialNB Accuracy: 0.8297101449275363
