#SoMe Topic Modeling Notebook | Release canvas 1 📖

In [10]:
# Installations
import sys
if 'google.colab' in sys.modules:
    !pip install emoji --upgrade
    !pip install pandas-profiling==2.*
    !pip install plotly==4.*
    !python -m spacy download en_core_web_lg
    !pip install pyldavis
    !pip install gensim
    !pip install squarify==0.4.3

Requirement already up-to-date: emoji in /usr/local/lib/python3.6/dist-packages (0.5.4)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [0]:
# Required Libraries

#Base and Cleaning 
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
import squarify
from collections import Counter

#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib as plt 
import pyLDAvis.gensim

#Natural Language Processing (NLP)
import spacy
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)

##Data Cleaning 🧹

In [12]:
# Loading the JSON file 
url_elon = 'https://raw.githubusercontent.com/Lambda-School-Labs/social-media-strategy-ds/feature/topic/python_notebooks/elonmusk_followers_english.json'
url_dutchbros = 'https://raw.githubusercontent.com/Lambda-School-Labs/social-media-strategy-ds/feature/topic/python_notebooks/dutchbros_followers.json'
df = requests.get(url_dutchbros).json()

# Converting the dataset to pandas DataFrame and renaming the columns 
df = pd.DataFrame(df.values())
df = df.rename(columns={0:'original_tweets'})

#Removing emojies from text and and puttin them in a new column
#Refrence 1 : https://stackoverflow.com/questions/43146528/how-to-extract-all-the-emojis-from-text
#Refrence 2 : https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
#Refrence 3 : https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python

### Question: How to combine all functions into one 
def extract_emoji(text):
    '''
    Extracts emojies from text
    '''
    emoji_list = []
    data = regex.findall(r'\X', text)
    for word in data:
        if any(char in emoji.UNICODE_EMOJI for char in word):
            emoji_list.append(word)
    return emoji_list

def emoji_free_text(text):
    '''
    Cleans text from emojies
    '''
    emoji_list_1 = [c for c in text if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list_1)])
    return clean_text

def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'http\S+', '', text)
    return text

#Create a new column with emojies from each tweet
df['tweet_emojies'] = df['original_tweets'].apply(lambda x : extract_emoji(x))

#Create a new column with cleaned tweets from emojies
df['emoji_free_tweets'] = df['original_tweets'].apply(lambda x : emoji_free_text(x))

#Create a new column with url free tweets
df['url_free_tweets'] = df['emoji_free_tweets'].apply(lambda x : url_free_text(x))

df.head()

Unnamed: 0,original_tweets,tweet_emojies,emoji_free_tweets,url_free_tweets
0,Flynn was fucking railroaded. https://t.co/4aZ...,[],Flynn was fucking railroaded. https://t.co/4aZ...,Flynn was fucking railroaded.
1,President @realDonaldTrump talks about Opening...,[],President @realDonaldTrump talks about Opening...,President @realDonaldTrump talks about Opening...
2,"#Obamagate Comey magic: Flynn calls ""turned up...",[],"#Obamagate Comey magic: Flynn calls ""turned up...","#Obamagate Comey magic: Flynn calls ""turned up..."
3,Matt Gaetz predicts President Trump will pardo...,[],Matt Gaetz predicts President Trump will pardo...,Matt Gaetz predicts President Trump will pardo...
4,We're in the 21st century Joe... https://t.co/...,[],We're in the 21st century Joe... https://t.co/...,We're in the 21st century Joe...


##Topic Modeling ㊙️

###Tokenizing 🕵🏻‍♂

In [0]:
# Load spacy
# Make sure to restart the runtime after running installations and libraries tab
nlp = spacy.load('en_core_web_lg')

In [14]:
"""
Import Gensim and Wordcloud to use their stopwords as well and use the combined stopwords of ALL as the variable:
ALL_STOP_WORDS
"""

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)


# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&amp;', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)


tokens = []

for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

# View df
df

Unnamed: 0,original_tweets,tweet_emojies,emoji_free_tweets,url_free_tweets,tokens
0,Flynn was fucking railroaded. https://t.co/4aZ...,[],Flynn was fucking railroaded. https://t.co/4aZ...,Flynn was fucking railroaded.,"[flynn, fucking, railroaded.]"
1,President @realDonaldTrump talks about Opening...,[],President @realDonaldTrump talks about Opening...,President @realDonaldTrump talks about Opening...,"[president, @realdonaldtrump, talks, opening, ..."
2,"#Obamagate Comey magic: Flynn calls ""turned up...",[],"#Obamagate Comey magic: Flynn calls ""turned up...","#Obamagate Comey magic: Flynn calls ""turned up...","[#obamagate, comey, magic:, flynn, calls, ""tur..."
3,Matt Gaetz predicts President Trump will pardo...,[],Matt Gaetz predicts President Trump will pardo...,Matt Gaetz predicts President Trump will pardo...,"[matt, gaetz, predicts, president, trump, pard..."
4,We're in the 21st century Joe... https://t.co/...,[],We're in the 21st century Joe... https://t.co/...,We're in the 21st century Joe...,"[we're, 21st, century, joe...]"
...,...,...,...,...,...
18518,After a great phone call with the @ASUFootball...,"[🔥, ☀️, 🔥, ☀️, 🔥, ☀️, 🔥, ☀️, 🔥, ☀️, 🔥]",After a great phone call with the @ASUFootball...,After a great phone call with the @ASUFootball...,"[great, phone, @asufootball, coach’s, humbled,..."
18519,After a great zoom meeting with the @ASUFootba...,[],After a great zoom meeting with the @ASUFootba...,After a great zoom meeting with the @ASUFootba...,"[great, zoom, meeting, @asufootball, staff,, e..."
18520,Morning 💗 https://t.co/uu7MhenMOf,[💗],Morning https://t.co/uu7MhenMOf,Morning,[morning]
18521,Always be careful who you open up to. Only a f...,[],Always be careful who you open up to. Only a f...,Always be careful who you open up to. Only a f...,"[careful, open, to., actually, care,, curious.]"


###Lemmatization🇬🇧

In [0]:
# Make tokens a string again
# credit : https://stackoverflow.com/questions/45306988/column-of-lists-convert-list-to-string-as-a-new-column
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    
    doc = nlp(text)
    
    # Something goes here :P
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
# df[['original_tweet', 'lemmas_back_to_text']]

##

In [16]:
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Removing url's
    pattern = r"http\S+"
    
    tokens = re.sub(pattern, "", text) # https://www.youtube.com/watch?v=O2onA4r5UaY
    tokens = re.sub('[^a-zA-Z 0-9]', '', text)
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
    tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
    tokens = re.sub('@', '', text) # Remove @
    tokens = tokens.strip(',') # TESTING THIS LINE
    tokens = tokens.strip('?') # TESTING THIS LINE
    tokens = tokens.strip('!') # TESTING THIS LINE
    tokens = tokens.strip("'") # TESTING THIS LINE
    tokens = tokens.strip(".") # TESTING THIS LINE
    tokens = tokens.strip("$") # TESTING THIS LINE
    tokens = tokens.strip("@") # TESTING THIS LINE

    tokens = tokens.lower().split() # Make text lowercase and split it
    
    return tokens

# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

# View those tokens (the 4th column)
df


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escap

Unnamed: 0,original_tweets,tweet_emojies,emoji_free_tweets,url_free_tweets,tokens,tokens_back_to_text,lemmas,lemmas_back_to_text,lemma_tokens
0,Flynn was fucking railroaded. https://t.co/4aZ...,[],Flynn was fucking railroaded. https://t.co/4aZ...,Flynn was fucking railroaded.,"[flynn, fucking, railroaded.]",flynn fucking railroaded.,"[flynn, fucking, railroaded]",flynn fucking railroaded,"[flynn, fucking, railroaded]"
1,President @realDonaldTrump talks about Opening...,[],President @realDonaldTrump talks about Opening...,President @realDonaldTrump talks about Opening...,"[president, @realdonaldtrump, talks, opening, ...",president @realdonaldtrump talks opening ameri...,"[president, @realdonaldtrump, talks, opening, ...",president @realdonaldtrump talks opening america,"[president, realdonaldtrump, talks, opening, a..."
2,"#Obamagate Comey magic: Flynn calls ""turned up...",[],"#Obamagate Comey magic: Flynn calls ""turned up...","#Obamagate Comey magic: Flynn calls ""turned up...","[#obamagate, comey, magic:, flynn, calls, ""tur...","#obamagate comey magic: flynn calls ""turned up...","[obamagate, comey, magic, flynn, call, turn, f...",obamagate comey magic flynn call turn find ova...,"[obamagate, comey, magic, flynn, call, turn, f..."
3,Matt Gaetz predicts President Trump will pardo...,[],Matt Gaetz predicts President Trump will pardo...,Matt Gaetz predicts President Trump will pardo...,"[matt, gaetz, predicts, president, trump, pard...",matt gaetz predicts president trump pardon rog...,"[matt, gaetz, predict, president, trump, pardo...",matt gaetz predict president trump pardon roge...,"[matt, gaetz, predict, president, trump, pardo..."
4,We're in the 21st century Joe... https://t.co/...,[],We're in the 21st century Joe... https://t.co/...,We're in the 21st century Joe...,"[we're, 21st, century, joe...]",we're 21st century joe...,"[21st, century, joe]",21st century joe,"[21st, century, joe]"
...,...,...,...,...,...,...,...,...,...
18518,After a great phone call with the @ASUFootball...,"[🔥, ☀️, 🔥, ☀️, 🔥, ☀️, 🔥, ☀️, 🔥, ☀️, 🔥]",After a great phone call with the @ASUFootball...,After a great phone call with the @ASUFootball...,"[great, phone, @asufootball, coach’s, humbled,...",great phone @asufootball coach’s humbled bless...,"[great, phone, @asufootball, coach, humble, bl...",great phone @asufootball coach humble blessed ...,"[great, phone, asufootball, coach, humble, ble..."
18519,After a great zoom meeting with the @ASUFootba...,[],After a great zoom meeting with the @ASUFootba...,After a great zoom meeting with the @ASUFootba...,"[great, zoom, meeting, @asufootball, staff,, e...","great zoom meeting @asufootball staff, extreme...","[great, zoom, meeting, @asufootball, staff, ex...",great zoom meeting @asufootball staff extremel...,"[great, zoom, meeting, asufootball, staff, ext..."
18520,Morning 💗 https://t.co/uu7MhenMOf,[💗],Morning https://t.co/uu7MhenMOf,Morning,[morning],morning,[morning],morning,[morning]
18521,Always be careful who you open up to. Only a f...,[],Always be careful who you open up to. Only a f...,Always be careful who you open up to. Only a f...,"[careful, open, to., actually, care,, curious.]","careful open to. actually care, curious.","[careful, open, actually, care, curious]",careful open actually care curious,"[careful, open, actually, care, curious]"


###id2word 📒

In [17]:
# Create a id2word dictionary
id2word = Dictionary(df['lemma_tokens'])
print(len(id2word))

24941


In [18]:
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))

10190


###Corpus Object & Generating Topics 📚

In [0]:
# Creating a corpus object 
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

In [20]:
# Instantiating a LDA model 
model = LdaMulticore(corpus=corpus, num_topics=5, id2word=id2word, workers=12, passes=5)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or th

In [0]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in model.print_topics()]

In [0]:
# Create Topics
topics = [' '.join(t[0:5]) for t in words]

In [23]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
time love feel know think

------ Topic 1 ------
follow come retweet thank tweet

------ Topic 2 ------
go new thing feel people

------ Topic 3 ------
people good love think know

------ Topic 4 ------
fuck find time think shit



###### Topic Distance Visualization 📈

In [24]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model, corpus, id2word)

##Make a pipeline 

In [0]:
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
