# TF-IDF Similarity for Emoticons
Find similar emoticons using TF-IDF, where tokens are the individual characters in the emoticons.

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import numpy as np
import random
import json

In [2]:
# load a corpus of emoticons as a list
with open('../data/emoticon_dict.json', 'r', encoding='utf-8') as f:
  emoticon_dict = json.load(f)
  corpus = list( emoticon_dict.keys() )
#print(corpus[:50])
print("CORPUS LENGTH:", len(corpus))

for i in corpus[:50]:
    print(i)

CORPUS LENGTH: 62149
( ˘͈ ᵕ ˘͈♡)
(╥﹏╥)
（＾ω＾）
(◍•ᴗ•◍)♡ ✧*。
⸜( ˙˘˙)⸝
♡´･ᴗ･`♡
(｡•́︿•̀｡)
(⋈◍＞◡＜◍)。✧♡
(づ◡﹏◡)づ
(≧◡≦)
(　-_･) ︻デ═一 ▸
(∩｀-´)⊃━━☆ﾟ.*･｡ﾟ
◡̈
(っ◔◡◔)っ ♥
(づ￣ ³￣)づ
\(◡̈ )/♥︎
(。U⁄ ⁄ω⁄ ⁄ U。)
ʕ •ᴥ•ʔ
(◕ᴗ◕✿)
ʕ•ᴥ•ʔ
웃❤유
˚‧º·(˚ ˃̣̣̥⌓˂̣̣̥ )‧º·˚
ᕦ(ò_óˇ)ᕤ
٩(˘◡˘)۶
o(〃＾▽＾〃)o
(っ◔◡◔)っ 🍔
(๑´• .̫ •ू`๑)💓
♡(.◜ω◝.)♡
♡＾▽＾♡
(>‿♥)
(⌐■_■)--︻╦╤─
(　-_･) ︻デ═一' * (/❛o❛)/
̸̱͂ ̸͆̿͞ ̄̿̄͞ ̿̅͞ ̿̅͞ ̄̚
(• ε •)
(>‿◠)✌
(づ｡◕‿‿◕｡)づ
\(◕ ◡ ◕\)
(✿◠‿◠)
╭∩╮(Ο_Ο)╭∩╮
(ღ˘⌣˘)♥ ℒ♡ⓥℯ ㄚ♡ⓤ
( ˘ ³˘)❤
{ᶫᵒᵛᵉᵧₒᵤ}•..(❤️O❤️)
( ◜◒◝ )♡
𝓴𝓲𝓼𝓼 𝓶𝒆 𝓹𝓵𝒆𝓪𝓼𝒆(ʃƪ˶˘ ﻬ ˘˶)
Ｉ ∟ＯⅤ∈ Υ〇∪…..φ(｡･ω･｡ )
(╯︵╰,)
¯¯̿̿¯̿̿'̿̿̿̿̿̿̿'̿̿'̿̿̿̿̿'̿̿̿)͇̿̿)̿̿̿̿ '̿̿̿̿̿̿\̵͇̿̿\=(•̪̀●́)=o/̵͇̿̿/'̿̿ ̿ ̿̿
*:･ﾟ✧(ꈍᴗꈍ)✧･ﾟ:*
ᕕ( ᐛ )ᕗ
◕w◕


In [4]:
# search by regex aand/or labels (specifically 'new_tags')
import re
def search(regex='', labels=[]):
    """returns emoticons that contain a given regex and have all the given labels. 
    ignores regex or labels if none are given."""
    condition_re = lambda em: re.search(regex, em) if regex else True
    condition_labels = lambda em_labels: all( [l in em_labels for l in labels] ) if labels else True
    return [em for em, tags in emoticon_dict.items() if condition_re(em) and condition_labels(tags['new_tags'])]

search(regex='◍.*‿.*◍', labels=['smiling'])

["(◍'‿'◍)", '꒰✩◍´´•‿•´´◍✩꒱', '(◍＾‿＾◍)', '(◍•‿•◍)', '(◍‿◍)', '(◍◕ω◕)人(◕‿◕◍)']

In [5]:
# Fit the TF-IDF NearestNeighbors model
vectorizer = TfidfVectorizer(analyzer = 'char_wb', ngram_range=(1,1))  #1,3 would look at 1-grams, 2-grams, and 3-grams
X = vectorizer.fit_transform(corpus).todense()
X = np.asarray(X)  # Convert matrix to array
nbrs = NearestNeighbors(n_neighbors=3, metric='cosine', algorithm='brute').fit(X)

feature_names = vectorizer.get_feature_names_out()
print("FEATURES:", len(feature_names), feature_names[:10], "...")

FEATURES: 3723 [' ' '!' '"' '#' '$' '%' '&' "'" '(' ')'] ...


In [6]:
# returns the n closest emoticons to the input emoticon
def get_n_most_similar(input_emoticon, n):
  input_vector = vectorizer.transform([input_emoticon]).todense()
  distances_np, indices_np = nbrs.kneighbors(input_vector, n) # indices is like an array of document indices
  distances = distances_np.tolist()[0] #convert from 1D numpy arrays to list (idx 0 because we're only doing one input emoticon at a time)
  closest_indices = indices_np.tolist()[0]
  return [(corpus[ci], distances[idx]) for idx, ci in enumerate(closest_indices)]

In [7]:
# Try it out with your own inputs!
example_emoticons = ['(ﾉ◕ヮ◕)ﾉ*:・ﾟ', '¯\_(ツ)_/¯', '/╲/( ͡͡° ͜ʖ ͡°)/\\╱\\', '(╯°□°)╯︵ ┻━┻', '┬┴┬┴┤ᵒᴥᵒᶅ├┬┴┬┴', '_(:3 」∠)_', '[̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅ ]']
input_emoticon = example_emoticons[0]
num_similar = 20

for item in get_n_most_similar(input_emoticon, num_similar):
  print( "{: <35} {: <10} {: <50}".format(item[0], round(item[1], 3), ", ".join(emoticon_dict[item[0]]['new_tags']) ) )

TypeError: np.matrix is not supported. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html

## Samples of each tag

In [9]:
all_tags = ['angel', 'anger', 'annoyed', 'archery', 'asleep', 'basketball', 'bats_vampires', 'beach', 'bear', 'bird', 'birthday', 'blush', 'bomb', 'breasts', 'butt', 'butterfly', 'cat', 'cheerleader', 'chess', 'christmas', 'cigarette', 'clown', 'computers', 'crab', 'crying', 'dancing', 'dead', 'devil', 'dog', 'donger', 'drink', 'excited', 'fight', 'fish', 'flex', 'flower', 'food', 'football', 'frog', 'glasses', 'goodbye_message', 'gun', 'hamster', 'heart', 'hello_message', 'hug', 'kiss', 'koala', 'lenny', 'lying_down', 'middle finger', 'money', 'monkey', 'monocle', 'morning_night_evening_message', 'mouse', 'music', 'mustache', 'penis', 'pig', 'ping_pong', 'pointing', 'pokemon', 'proposal', 'rabbit', 'radio', 'rain', 'robot', 'rose', 'running', 'sad', 'salute_wave', 'seal', 'sheep', 'shrug', 'smiling', 'smirk', 'soccer', 'sparkles', 'spider', 'spinning', 'surprised', 'sweat', 'sword', 'syringe', 'table_flip', 'table_upright', 'thanks_message', 'thumbs_up', 'wall', 'wand', 'wink', 'writing', 'yummy', 'zombie']
for tag in all_tags:
  tagged_emoticons = [em for em, v in emoticon_dict.items() if tag in v['new_tags']]
  samples = random.sample(tagged_emoticons, 5)
  print(f'____ {tag} ({len(tagged_emoticons)}) __________________')
  print("\n".join(samples), '\n')

____ angel (674) __________________
ଘ(✿ˊ≧◡≦)
o:)
ଘ(੭´꒳`)°* ੈ‧₊˚
ଘ(˵ᅌᴗᅌ)
ʚ(╹.╹)ɞ(≧ε≦)β 

____ anger (2993) __________________
（｀ー´）
୧༼ಠ益ಠ༽୨
(＃´ー´ )旦
(๑≧.≦)(°ε°○)
Ψ(｀▽´)Ψ 

____ annoyed (893) __________________
ᕙ(¬益¬)ᕗᕦ(⇀ヘ↼,)ᕥ
(✿≖‸≖)(¬‸¬)
ಠ‿ಥ
(づ¬ಎ¬)づ
( ╬ ಠ益 ಠ) 

____ archery (231) __________________
⤜(-ヮ-)⤏
─=≡Σᕕ(-u-)ᕗ⤜(⩹ ε ⩺)⤏
⤜(˵ ͡~﹏ ͡°˵)⤏
⤜(✧‸✧)⤏
⤜( ͡ಥ-ಥ)⤏ 

____ asleep (439) __________________
⁽⁽(ᵕ≀ ̠ᵕ [▓▓▓]
(:3 っ)3≡･◦∴*+◦º.+*.•。[][▓▓]
˓(¦:ɝ[▓▓]
.꒰ϱ﹏-๑꒱‧*Zz｡
(-。-●)💤 

____ basketball (63) __________________
🗑 🏀＼(oUo＼)
Ю ● ＼(*ｰ*＼)
(=っ-ェ-=)っ●Ю
ʕ´⁰̈ᴥ⁰̈`ʔฅ     o     ฅʕ´⁰̈ᴥ⁰̈`ʔ
ﾉ(  •́ ◡ •̀ )ﾉ＼(^_^ )🏀ヾ(´°◡°｀)ﾉ 

____ bats_vampires (67) __________________
⚇/ | \ ^ ._. ^ / | \⚇
°/|\ ^._.^ /|\°
<(:,..,:)>
(m￣ー￣)m                 ◥(ºᵥᵥº)◤ ◥(ºᵥᵥº)◤
:F 

____ beach (51) __________________
🌴((っ˘0˘)👙
❆🏄❆
人人人ヾ( ;×o×)〃人人人
‿︵ヽ(✿^0^)ノ︵‿︵‿▨-▨¬︵‿
☀🌴🌴(💮■-■)🍹 

____ bear (4179) __________________
ʕ๑>ᴥ├┬┴┬┴
ʕᵔᴥᵔʔ▿
❁(Θ(ｴ)Θ*)
ʕっ• ᴥ • ʔっ
(っ•ω• ς)ʕ╹ᴥ╹ʔ♥ 

____ bird (228) __________________
ϵ( ‘Θ’)϶
( ˘⊖˘)
( ˙Θ

### Heuristic-Based Kaomoji Detection Using Rare ASCII Clusters

In [1]:
import re

RARE_ASCII = set("｡･・∩∪∫≠≈≡≪≫⊃⊇⊂⊆⋆☆★✧✩☼♥♡♪♫♬乁ヅツシ彡メミ≧≦〜｀￥λμωδΩ≮≯※ΨΦπξ∂∞¤∴╌╭╮╯╰╱╲▄█▓▒░∀∂∃∅∇∈∉∋∌∍∎∏∑−∕∖∗∘∙√∝∞∟∠∡∢∣∤∥∦∧∨∩∪∫∬∭∮∯∰∱∲∳∴∵∶∷∸∹∺∻∼∽∾∿≀≁≂≃¯¿¡☀☁☂☃☄★☆☇☈☉☊☋☌☍☎☏☐☑☒☓☔☕☖☗☘☙☚☛☜☝☞☟☠☡☢☣☤☥☦☧☨☩☪☫☬☭☮☯☰☱☲☳☴☵☶☷☸☹☺☻☼☽☾☿♀♁♂♃♄♅♆♇♈♉♊♋♌♍♎♏♐♑♒♓♔♕♖♗♘♙♚♛♜♝♞♟♠♡♢♣♤♥♦♧♨♩♪♫♬♭♮♯♰♱♲♳♴♵♶♷♸♹♺♻♼♽♾♿⚀⚁⚂⚃⚄⚅⚆⚇⚈⚉⚊⚋⚌⚍⚎⚏⚐⚑⚒⚓⚔⚕⚖⚗⚘⚙⚚⚛⚜⚝⚞⚟⚠⚡⚢⚣⚤⚥⚦⚧⚨⚩⚪⚫⚬⚭⚮⚯⚰⚱⚲⚳⚴⚵⚶⚷⚸⚹⚺⚻⚼⚽⚾⚿⛀⛁⛂⛃⛄⛅⛆⛇⛈⛉⛊⛋⛌⛍⛎⛏⛐⛑⛒⛓⛔⛕⛖⛗⛘⛙⛚⛛⛜⛝⛞⛟⛠⛡⛢⛣⛤⛥⛦⛧⛨⛩⛪⛫⛬⛭⛮⛯⛰⛱⛲⛳⛴⛵⛶⛷⛸⛹⛺⛻⛼⛽⛾⛿")

def is_rare(c):
    return c in RARE_ASCII or ord(c) > 126  # Include fullwidth/Unicode

def detect_kaomoji(text, window=6, min_rare=2):
    kaomoji_spans = []
    i = 0
    while i < len(text):
        rare_count = 0
        for j in range(i, min(i + window, len(text))):
            if is_rare(text[j]):
                rare_count += 1
        if rare_count >= min_rare:
            # Expand left and right
            start = i
            while start > 0 and (is_rare(text[start-1]) or text[start-1].isalnum() == False):
                start -= 1
            end = i + window
            while end < len(text) and (is_rare(text[end]) or text[end].isalnum() == False):
                end += 1
            kaomoji_spans.append((start, end, text[start:end]))
            i = end
        else:
            i += 1
    return kaomoji_spans

In [2]:
text = "I love cats (｡♥‿♥｡) and dogs (╯°□°）╯︵ ┻━┻ but not bugs"
print(detect_kaomoji(text))

[(9, 20, 'ts (｡♥‿♥｡) '), (26, 42, 'gs (╯°□°）╯︵ ┻━┻ ')]
