In [3]:
import spacy

#!python -m spacy download en_core_web_sm

In [13]:
# Q2
from pymongo import MongoClient
import pandas as pd

__author__ = 'Ming Ho Wu'

"""
The clustering algorithm we are going to use is the K-mean clustering.
That is because we learn this method in the Text as Data course,
and a working implementation is provided in the Lab2 exercise.
"""

# Aim: extract the username, hashtag and full text of all the tweets
#      store them into a panda dataframe
# The reference for the solution is in the Lab2 Exercise in the Text as Data course.

client = MongoClient()
db = client["Q1b_DB"]
collection = db["Q1b_Collection"]

all_data = collection.find({})
posts_tmp = list()
collectAllHashtags = list()

for data in all_data:
    hashtagsList = list()
    for hashtags in data["entities"]["hashtags"]:
            hashtagsList.append(hashtags["text"])
            collectAllHashtags.append(hashtags["text"])
    if "retweeted_status" in data:
        posts_tmp.append((data["_id"], data["user"]["name"], data["retweeted_status"]["full_text"], hashtagsList))
    else:
        posts_tmp.append((data["_id"],data["user"]["name"],data["full_text"], hashtagsList))
    
    
       
labels = ['id','username','full_text','hashtag']
refined_data = pd.DataFrame(posts_tmp,columns=labels)
refined_data.head(4000)

Unnamed: 0,id,username,full_text,hashtag
0,5e8061e159de928bc9f9bbe8,tassawar rehman,This is wonderful news. \n\nBut we need a mech...,[]
1,5e8061e159de928bc9f9bbe9,Aleks Kins,UK broadband providers lift data caps during c...,[technology]
2,5e8061e159de928bc9f9bbea,Jimmy Hunter 🏳️‍🌈🏴󠁧󠁢󠁳󠁣󠁴󠁿🇪🇺 #StayAtHome,This should come as no surprise but Trump supp...,[]
3,5e8061e159de928bc9f9bbeb,Alvaro Lagresa,"Microsoft says it has ""divested its shareholdi...",[]
4,5e8061e159de928bc9f9bbec,Ron Felice,A dashboard from the land before time https:/...,[]
...,...,...,...,...
3995,5e80b2de59de928bc9f9cb83,Liberal Momma,".@GovMikeDeWine: ""This is a matter of life and...",[]
3996,5e80b2de59de928bc9f9cb84,🌊💐Aceysmommy🌺🌊,".@GovMikeDeWine: ""This is a matter of life and...",[]
3997,5e80b2de59de928bc9f9cb85,Bruce Edwards,Some can’t handle the truth.\nMedical Expert W...,[]
3998,5e80b2de59de928bc9f9cb86,Margaret Applin Designs,@IngrahamAngle This is called DUAL USE TECHNOL...,[]


In [5]:
# Aim: Tokenize and normalize the text then vectorize it
# Reference for solution is in the Lab 2 in Text as Data Course

from sklearn.feature_extraction.text import TfidfVectorizer


nlp = spacy.load('en_core_web_sm', disable=['ner'])

def spacy_tokenize(string):
  tokens = list()
  doc = nlp(string)
  for token in doc:
    tokens.append(token)
  return tokens


def normalize(tokens):
  normalized = list()
  for token in tokens:
    if (token.is_alpha or token.is_digit):
      lemma = token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
      normalized.append(lemma)
  return normalized


def tokenize_normalize(string):
  return normalize(spacy_tokenize(string))


ngram_vectorizer = TfidfVectorizer(tokenizer=tokenize_normalize, stop_words='english',sublinear_tf=True, max_features=50000, ngram_range=(1,2))
ngram_document_term_matrix = ngram_vectorizer.fit_transform(refined_data["full_text"])

  'stop_words.' % sorted(inconsistent))


In [6]:
# Aim: Perform K mean
print("HI")
from sklearn.cluster import KMeans
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, init='random', n_init=5, verbose=10)
kmeans.fit(ngram_document_term_matrix)


HI
Initialization complete
Iteration  0, inertia 18475.633
Iteration  1, inertia 9883.202
Iteration  2, inertia 9467.125
Iteration  3, inertia 9259.111
Iteration  4, inertia 9213.149
Iteration  5, inertia 9158.065
Iteration  6, inertia 9072.000
Iteration  7, inertia 9022.205
Iteration  8, inertia 8999.756
Iteration  9, inertia 8988.123
Iteration 10, inertia 8988.086
Converged at iteration 10: center shift 0.000000e+00 within tolerance 1.950940e-09
Initialization complete
Iteration  0, inertia 20213.622
Iteration  1, inertia 9776.674
Iteration  2, inertia 9354.613
Iteration  3, inertia 9203.818
Iteration  4, inertia 9155.208
Iteration  5, inertia 9134.656
Iteration  6, inertia 9130.158
Iteration  7, inertia 9125.604
Converged at iteration 7: center shift 0.000000e+00 within tolerance 1.950940e-09
Initialization complete
Iteration  0, inertia 20328.724
Iteration  1, inertia 9963.340
Iteration  2, inertia 9625.422
Iteration  3, inertia 9530.665
Iteration  4, inertia 9507.474
Iteration  5,

KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300,
       n_clusters=10, n_init=5, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=10)

In [7]:
# Aim: Extract the top 20 important entities/ concepts for each group
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = ngram_vectorizer.get_feature_names()


for i in range(num_clusters):
  print("Cluster %d:" % i)
  for ind in order_centroids[i, :20]:
    print(' %s' % terms[ind])  
  print("---------------------")


Cluster 0:
 correct trump
 trump target
 correct
 expert correct
 target far
 medical expert
 target
 expert
 medical
 far right
 far
 trump
 right
 right new
 new
 york times
 times
 new york
 york
 shameful attack
---------------------
Cluster 1:
 strongly word
 word press
 strongly
 remember strongly
 deployment mask
 limit deployment
 year remember
 morning blast
 cover 7
 press release
 7 year
 release morning
 blast fda
 deployment
 blast
 cover
 7
 release
 word
 press
---------------------
Cluster 2:
 sterilize
 mask
 day
 ohio
 base battelle
 despite ability
 ability sterilize
 day despite
 battelle sterilize
 authorize columbus
 columbus base
 ohio day
 day ohio
 fda authorize
 mask ohio
 authorize
 mask day
 sterilize surgical
 announce fda
 ability
---------------------
Cluster 3:
 ohio pioneers
 pioneers technology
 ppe radio
 pioneers
 coronavirus ohio
 reuse ppe
 radio
 clean reuse
 technology clean
 reuse
 clean
 ppe
 ohio
 coronavirus
 technology
 godfrey zoom
 gogravi

In [24]:
# find the top 10 usernames



top_10_frequency = 10
top10_usernames = refined_data['username'].value_counts()[:top_10_frequency].index.tolist()

print(top10_usernames)



    

['Security Testing', 'Cyber Security News', 'Cyber Security Feed', 'HubofMachineLearning', 'CyberSecurityBot 🤖', 'Virtual Consultnts', 'Chidambara .ML.', 'Ochieng Scott', 'iC0dE', 'Milo Camacho']


In [25]:
# find the top 10 hashtags

top_10_frequency = 10


# Reference for solution:
# https://www.geeksforgeeks.org/python-find-most-frequent-element-in-a-list/
  
from collections import Counter 
  
def most_frequent(List): 
    occurence_count = Counter(List) 
    return occurence_count.most_common(10)
    
print(most_frequent(collectAllHashtags)) 


[('technology', 384), ('COVID19', 182), ('China', 172), ('CCPVirus', 166), ('CCP', 165), ('Ghilli', 142), ('Technology', 118), ('tech', 106), ('AI', 94), ('coronavirus', 69)]
