#SoMe Topic Modeling Notebook | Release canvas 1 📖


In [1]:
# Installations
import sys
if 'google.colab' in sys.modules:
    !pip install emoji --upgrade
    !pip install pandas-profiling==2.*
    !pip install plotly==4.*
    !python -m spacy download en_core_web_lg
    !pip install pyldavis

Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/40/8d/521be7f0091fe0f2ae690cc044faf43e3445e0ff33c574eae752dd7e39fa/emoji-0.5.4.tar.gz (43kB)
[K     |███████▌                        | 10kB 18.1MB/s eta 0:00:01[K     |███████████████                 | 20kB 1.6MB/s eta 0:00:01[K     |██████████████████████▋         | 30kB 1.9MB/s eta 0:00:01[K     |██████████████████████████████▏ | 40kB 1.7MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 1.7MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.5.4-cp36-none-any.whl size=42176 sha256=636c021ff4e469866a19547f051bddd1ecf259d9cf73905d5665bef35dd1907c
  Stored in directory: /root/.cache/pip/wheels/2a/a9/0a/4f8e8cce8074232aba240caca3fade315bb49fac68808d1a9c
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.5.4
Collecting pandas-profiling==2.*
[

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.1MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp36-none-any.whl size=829180944 sha256=943b4d85ac2f302b99b816911a9f5d06a6908857920e4f505f484d47f5ded5b0
  Stored in directory: /tmp/pip-ephem-wheel-cache-fb3la0po/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')
Collecting pyldavis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f

In [0]:
# Required Libraries

#Base and Cleaning 
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re

#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib as plt 
import pyLDAvis.gensim

#Natural Language Processing (NLP)
import spacy
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

##Data Cleaning 🧹

In [0]:
# Loading the JSON file 
url_elon = 'https://raw.githubusercontent.com/Lambda-School-Labs/social-media-strategy-ds/feature/topic/python_notebooks/elonmusk_followers_english.json'
url_dutchbros = 'https://raw.githubusercontent.com/Lambda-School-Labs/social-media-strategy-ds/feature/topic/python_notebooks/dutchbros_followers.json'
df = requests.get(url_dutchbros).json()
df

In [0]:
# Converting the dataset to pandas DataFrame and renaming the columns 
df = pd.DataFrame(df.values())
df = df.rename(columns={0:'tweets'})
df.head()

Unnamed: 0,tweets
0,This kid will forever be a legend 😂 https://t....
1,"If you truly believe Lebrons mindset, competit..."
2,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!
3,@Bhuvan_Bam ❤️❤️
4,I'm not crying you're crying.\nhttps://t.co/Bc...


In [0]:
#Removing emojies from text and and puttin them in a new column
#Refrence 1 : https://stackoverflow.com/questions/43146528/how-to-extract-all-the-emojis-from-text
#Refrence 2 : https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
#Refrence 3 : https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python

### Question: How to combine all functions into one 
def extract_emoji(text):
    '''
    Extracts emojies from text
    '''
    emoji_list = []
    data = regex.findall(r'\X', text)
    for word in data:
        if any(char in emoji.UNICODE_EMOJI for char in word):
            emoji_list.append(word)
    return emoji_list

def emoji_free_text(text):
    '''
    Cleans text from emojies
    '''
    emoji_list_1 = [c for c in text if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list_1)])
    return clean_text

def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'http\S+', '', text)
    return text

#Create a new column with emojies from each tweet
df['tweet_emojies'] = df['tweets'].apply(lambda x : extract_emoji(x))

#Create a new column with cleaned tweets from emojies
df['emoji_free_tweets'] = df['tweets'].apply(lambda x : emoji_free_text(x))

#Create a new column with url free tweets
df['url_free_tweets'] = df['emoji_free_tweets'].apply(lambda x : url_free_text(x))

df.head()

Unnamed: 0,tweets,tweet_emojies,emoji_free_tweets,url_free_tweets
0,This kid will forever be a legend 😂 https://t....,[😂],This kid will forever be a legend https://t.co...,This kid will forever be a legend
1,"If you truly believe Lebrons mindset, competit...",[],"If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit..."
2,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,[],BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!
3,@Bhuvan_Bam ❤️❤️,"[❤️, ❤️]",@Bhuvan_Bam,@Bhuvan_Bam
4,I'm not crying you're crying.\nhttps://t.co/Bc...,[],I'm not crying you're crying. https://t.co/BcF...,I'm not crying you're crying.


##Topic Modeling ㊙️

###Tokenizing 🕵🏻‍♂️

In [0]:
# Load spacy
# Make sure to restart the runtime after running installations and libraries tab
nlp = spacy.load('en_core_web_lg')

In [0]:
# Get Tokens 
tokens = []

for doc in nlp.pipe(df['url_free_tweets'], batch_size=500):
    
    doc_tokens = []
    
    for token in doc:
        if (token.is_stop == False) & (token.is_punct == False):
            doc_tokens.append(token.lemma_.lower())
            
    tokens.append(doc_tokens)
        
df['tokens'] = tokens

del tokens

In [0]:
df['tokens'].head()

0                               [kid, forever, legend]
1    [truly, believe, lebrons, mindset, competitive...
2                             [buttlicker, price, low]
3                                        [@bhuvan_bam]
4                                           [cry, cry]
Name: tokens, dtype: object

###id2word 📒

In [0]:
# Create a id2word dictionary
id2word = Dictionary(df['tokens'])
print(len(id2word))

22210


In [0]:
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))

8310


###Corpus Object & Getting Topics 📚

In [0]:
# Creating a corpus object 
corpus = [id2word.doc2bow(d) for d in df['tokens']]

In [0]:
# Instantiating a LDA model 
model = LdaMulticore(corpus=corpus, num_topics=10, id2word=id2word, workers=12, passes=5)

In [0]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in model.print_topics()]

In [0]:
# Create Topics
topics = [' '.join(t[0:5]) for t in words]

In [0]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
people like government new mask

------ Topic 1 ------
obamagate time amp need right

------ Topic 2 ------
$ amp know btc bitcoin

------ Topic 3 ------
time like year good think

------ Topic 4 ------
amp like follow   retweet

------ Topic 5 ------
day   amp 2 new

------ Topic 6 ------
work time   go state

------ Topic 7 ------
  think new people good

------ Topic 8 ------
people work go want  

------ Topic 9 ------
trump day say love like



### Topic Distance Visualization 📈

*   List item
*   List item



In [0]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model, corpus, id2word)

In [0]:
# Steps to Score Training Documents
# Already have BOW Represented called 'corpus'

distro = [lda[d] for d in corpus]
num_topics = 20

def update(doc):
        d_dist = {k:0 for k in range(0,num_topics)}
        for topic in doc:
            # Topic is Tuple where the first part is the topic id
            # the second part is the topic distribution in that doc
            d_dist[topic[0]] = topic[1]
        return d_dist
    
new_distro = [update(d) for d in distro]

NameError: ignored