In [1]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import numpy as np

In [2]:
pageinfo = pd.read_csv('/home3/usfb/build/output/page/1000-page-info.csv')
# politifact data with fake domain
postdata = pd.read_csv('/home3/usfb/analysis/analysis-fake-news/temp/post-match-domain/1000_page_politifact_domain.csv')
# Join two tables to get page info of each post data
postdata_WithPageInfo = postdata.join(pageinfo[['page_id','category','type']].set_index('page_id'), 
                                                               on='page_id')

In [3]:
postdata_WithPageInfo["post_created_date_CT"] = pd.to_datetime(postdata_WithPageInfo["post_created_date_CT"])

In [4]:
postdata_WithPageInfo.columns

Index(['page_id', 'page_name', 'post_id', 'post_type', 'post_name',
       'post_message', 'post_caption', 'post_picture', 'post_link',
       'post_description', 'post_reactions', 'post_likes', 'post_comments',
       'post_shares', 'post_created_time_CT', 'post_updated_time_CT',
       'post_created_date_CT', 'post_updated_date_CT', 'post_created_time',
       'post_updated_time', 'page_talking_about_count', 'domain', 'category',
       'type'],
      dtype='object')

In [5]:
def filter_post_data( start_date, end_date, date_col ="post_created_date_CT"):
#Select post data between input string varaibles start_date, end_date from gobal pandas dataframe variable postdata_WithPageInfo
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    
    return postdata_WithPageInfo[(postdata_WithPageInfo[date_col] >= start_date) & (postdata_WithPageInfo[date_col] <= end_date)]

In [6]:
from nltk.corpus import wordnet
import nltk

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [7]:
import string
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

def tokenize_with_lemmentize(document ,lemmentize = True):
#Tokenizer and Lemmantizer fo TfidfVectorizer
#First remove url, then remove stopwords and non-alphabet, and lemmantize the lower cased tokens. 

    tokenized_post = []
    lemmatizer = WordNetLemmatizer()
    removed = stopwords.words('english') + list(string.punctuation)
    
    document =  re.sub(r'http\S+', '', str(document))
    tokens = word_tokenize(document)
    
    words = [word.lower() for word in tokens if word.isalpha() and word not in removed]
    if lemmentize:
        words_lemmantized = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words]
        tokenized_post += words_lemmantized
    else:
        tokenized_post += words
    return tokenized_post

In [8]:
time_vec = ["2016/03", "2016/04", "2016/05", "2016/06", "2016/07", "2016/08", "2016/09"]

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim

removed = stopwords.words('english') + list(string.punctuation)

#TfidfVectorizer using the above tokenizer, 
#which won't include numbers or words that has documnet frequency smaller than 5

vectorizer = TfidfVectorizer(
    tokenizer= tokenize_with_lemmentize,
    min_df = 5,
    use_idf = True,
    token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b')

In [10]:
from sklearn.cluster import KMeans
from collections import Counter

In [11]:
#Create a dictionary of dataframes that records the sorted centriod of each month's clustering result.
sorted_top10_cluster_dic = {}

#Run K-means clustering on each TF-IDF vectorized dataset.
#Then sort the clusters by numbers of observations in the cluster
#Record the top 15 words in the top 10 cluster using sorted_top10_cluster_dict

for i in range(len(time_vec)-1):
    sorted_top10_cluster_dic[time_vec[i]] = pd.DataFrame()
    
    monthly_data = filter_post_data(time_vec[i], time_vec[i+1])
    post_vec_sparse = vectorizer.fit_transform(monthly_data["post_name"][~monthly_data["post_name"].isnull()])

    km_30 = KMeans(n_clusters=30, 
                init='k-means++',
                random_state=0) 
    km_30.fit(post_vec_sparse)
    
    sorted_lables = sorted(Counter(km_30.labels_).items(), key=lambda kv: kv[1], reverse = True )
    for j in sorted_lables[:10]:
        centriod_vec = pd.Series(km_30.cluster_centers_[j[0]] , index = vectorizer.get_feature_names() )
        s = centriod_vec.sort_values(ascending = False)[:15]
        sorted_top10_cluster_dic[time_vec[i]][j[0]] = list(zip(s.index, np.round(s,3)))
    print("done", time_vec[i])

done 2016/03
done 2016/04
done 2016/05
done 2016/06
done 2016/07
done 2016/08


In [18]:
# Print n documents in a cluster of K-means clustering result. Used to examine what the original document looks like
# Variable which_cluster is the index of cluster before sorted, 
#   which is recorded in the column name of DataFrames in sorted_top10_cluster_dic
def find_n_cluster_post(monthly_data, which_cluster, n):

    post_vec_sparse = vectorizer.fit_transform(monthly_data["post_name"][~monthly_data["post_name"].isnull()])
    
    km_30 = KMeans(n_clusters=30, 
                init='k-means++',
                random_state=0) 
    km_30.fit(post_vec_sparse)
    
    print(monthly_data["post_name"][km_30.labels_ == which_cluster][:n])

In [19]:
find_n_cluster_post(filter_post_data("2016/03", "2016/04"), 1, 10)

103    Muslim Infiltrators Angry that Small Town Colo...
232    MUSLIM TEARS: University of Minnesota Says The...
244    Teenager that Beat up Katt Williams Files $100...
250    Kevin Gates Files for Divorce After Wife Refus...
257    Employee Admits To “Pooping” In Wendy’s Chili ...
258    Kevin Gates Files for Divorce After Wife Refus...
266    Million Pounds Of Rat Meat Being Sold As Bonel...
304    'The View' Is On The Chopping Block To Be Canc...
344    Employee Admits To “Pooping” In Wendy’s Chili ...
460    Teenager that Beat up Katt Williams Files $100...
Name: post_name, dtype: object


In [12]:
sorted_top10_cluster_dic["2016/03"][:15]

Unnamed: 0,1,13,4,8,6,15,11,16,20,10
0,"(muslim, 0.026)","(obama, 0.219)","(trump, 0.141)","(hillary, 0.222)","(u, 0.221)","(girl, 0.127)","(see, 0.264)","(one, 0.189)","(man, 0.253)","(dear, 0.262)"
1,"(video, 0.018)","(politics, 0.094)","(question, 0.099)","(video, 0.053)","(herald, 0.182)","(mom, 0.112)","(woman, 0.053)","(kill, 0.103)","(shock, 0.051)","(america, 0.241)"
2,"(isi, 0.016)","(plan, 0.031)","(mitt, 0.084)","(never, 0.039)","(obama, 0.064)","(get, 0.085)","(something, 0.038)","(illegal, 0.059)","(whoa, 0.033)","(everyone, 0.075)"
3,"(trump, 0.016)","(treason, 0.026)","(romney, 0.08)","(trump, 0.038)","(move, 0.026)","(see, 0.047)","(man, 0.037)","(reason, 0.042)","(get, 0.032)","(say, 0.059)"
4,"(attack, 0.013)","(video, 0.026)","(like, 0.061)","(benghazi, 0.033)","(military, 0.022)","(muslim, 0.04)","(behind, 0.027)","(alien, 0.041)","(something, 0.029)","(trump, 0.058)"
5,"(watch, 0.012)","(go, 0.025)","(gop, 0.043)","(say, 0.031)","(isi, 0.021)","(room, 0.035)","(cop, 0.026)","(isi, 0.031)","(homeless, 0.026)","(hillary, 0.047)"
6,"(say, 0.012)","(general, 0.024)","(would, 0.033)","(watch, 0.03)","(muslim, 0.021)","(little, 0.032)","(people, 0.023)","(list, 0.03)","(wife, 0.025)","(obama, 0.043)"
7,"(politics, 0.011)","(michelle, 0.023)","(blast, 0.032)","(islamic, 0.029)","(president, 0.021)","(sick, 0.031)","(car, 0.022)","(thing, 0.03)","(muslim, 0.024)","(deputize, 0.042)"
8,"(brussels, 0.011)","(call, 0.021)","(convention, 0.029)","(mocked, 0.025)","(tell, 0.019)","(big, 0.028)","(girl, 0.022)","(city, 0.028)","(black, 0.023)","(sheriff, 0.041)"
9,"(student, 0.01)","(brussels, 0.019)","(election, 0.025)","(want, 0.023)","(prison, 0.018)","(pic, 0.025)","(muslim, 0.022)","(people, 0.027)","(trump, 0.021)","(agree, 0.041)"


In [13]:
sorted_top10_cluster_dic["2016/04"][:10]

Unnamed: 0,0,13,17,28,8,2,22,4,10,5
0,"(muslim, 0.023)","(video, 0.138)","(trump, 0.234)","(freedom, 0.249)","(politics, 0.19)","(dear, 0.267)","(obama, 0.233)","(see, 0.243)","(u, 0.209)","(get, 0.271)"
1,"(woman, 0.014)","(hillary, 0.112)","(donald, 0.051)","(daily, 0.249)","(video, 0.092)","(america, 0.235)","(politics, 0.081)","(mom, 0.06)","(herald, 0.179)","(go, 0.036)"
2,"(find, 0.013)","(judge, 0.044)","(cruz, 0.035)","(video, 0.04)","(willie, 0.085)","(trump, 0.058)","(video, 0.029)","(girl, 0.047)","(muslim, 0.049)","(penalty, 0.026)"
3,"(isi, 0.012)","(time, 0.035)","(video, 0.034)","(thug, 0.04)","(jim, 0.073)","(mosque, 0.047)","(plan, 0.027)","(woman, 0.035)","(city, 0.049)","(ready, 0.025)"
4,"(new, 0.011)","(gun, 0.027)","(amendment, 0.028)","(watch, 0.039)","(economic, 0.064)","(hillary, 0.044)","(michelle, 0.027)","(dad, 0.033)","(video, 0.049)","(give, 0.025)"
5,"(girl, 0.011)","(politics, 0.023)","(watch, 0.025)","(black, 0.038)","(collapse, 0.054)","(bathroom, 0.034)","(disrespect, 0.021)","(hand, 0.031)","(obama, 0.045)","(death, 0.025)"
6,"(white, 0.01)","(caught, 0.018)","(protest, 0.023)","(muslim, 0.022)","(global, 0.045)","(obama, 0.029)","(treason, 0.019)","(photo, 0.029)","(law, 0.041)","(trump, 0.024)"
7,"(black, 0.01)","(change, 0.017)","(rally, 0.023)","(obama, 0.022)","(dollar, 0.042)","(president, 0.027)","(economy, 0.018)","(boyfriend, 0.024)","(medium, 0.037)","(response, 0.024)"
8,"(make, 0.01)","(make, 0.016)","(million, 0.022)","(life, 0.021)","(market, 0.035)","(angry, 0.026)","(america, 0.017)","(toddler, 0.023)","(america, 0.035)","(guilty, 0.023)"
9,"(one, 0.01)","(dive, 0.016)","(support, 0.02)","(woman, 0.02)","(u, 0.035)","(allow, 0.025)","(military, 0.016)","(muslim, 0.021)","(silent, 0.035)","(welfare, 0.022)"


In [14]:
sorted_top10_cluster_dic["2016/05"][:8]

Unnamed: 0,3,19,5,14,22,11,12,17,4,2
0,"(muslim, 0.031)","(trump, 0.163)","(freedom, 0.247)","(politics, 0.17)","(man, 0.126)","(obama, 0.224)","(see, 0.271)","(u, 0.21)","(watch, 0.231)","(dear, 0.315)"
1,"(girl, 0.014)","(hillary, 0.148)","(daily, 0.247)","(video, 0.128)","(mom, 0.098)","(politics, 0.081)","(pic, 0.031)","(herald, 0.206)","(black, 0.047)","(america, 0.16)"
2,"(isi, 0.013)","(donald, 0.031)","(video, 0.039)","(law, 0.028)","(find, 0.094)","(refugee, 0.03)","(woman, 0.03)","(obama, 0.045)","(chick, 0.043)","(hillary, 0.078)"
3,"(guess, 0.012)","(bill, 0.029)","(thug, 0.033)","(american, 0.025)","(sick, 0.065)","(one, 0.022)","(muslim, 0.03)","(get, 0.037)","(liberal, 0.032)","(patriot, 0.052)"
4,"(get, 0.011)","(supporter, 0.025)","(black, 0.03)","(every, 0.024)","(son, 0.048)","(criticize, 0.022)","(people, 0.03)","(move, 0.033)","(get, 0.031)","(bill, 0.039)"
5,"(video, 0.011)","(email, 0.021)","(obama, 0.028)","(obama, 0.023)","(use, 0.034)","(bush, 0.02)","(girl, 0.029)","(response, 0.028)","(sniper, 0.03)","(trump, 0.035)"
6,"(word, 0.01)","(lie, 0.018)","(muslim, 0.027)","(government, 0.022)","(muslim, 0.028)","(admiral, 0.019)","(video, 0.028)","(break, 0.025)","(try, 0.029)","(muslim, 0.034)"
7,"(liberal, 0.01)","(rally, 0.018)","(white, 0.024)","(mysterious, 0.021)","(cop, 0.028)","(plan, 0.019)","(mom, 0.027)","(terrorize, 0.024)","(muslim, 0.028)","(offend, 0.033)"


In [15]:
sorted_top10_cluster_dic["2016/06"][:8]

Unnamed: 0,9,16,13,1,4,17,14,26,7,3
0,"(obama, 0.02)","(politics, 0.21)","(man, 0.103)","(freedom, 0.253)","(orlando, 0.228)","(see, 0.26)","(watch, 0.262)","(hillary, 0.264)","(herald, 0.234)","(question, 0.299)"
1,"(trump, 0.019)","(video, 0.121)","(find, 0.064)","(daily, 0.251)","(terrorist, 0.099)","(mom, 0.048)","(muslim, 0.074)","(trump, 0.086)","(u, 0.229)","(trump, 0.1)"
2,"(muslim, 0.016)","(obama, 0.078)","(cop, 0.059)","(muslim, 0.049)","(attack, 0.097)","(woman, 0.037)","(brawl, 0.037)","(benghazi, 0.035)","(obama, 0.047)","(agree, 0.053)"
3,"(gun, 0.016)","(military, 0.027)","(inside, 0.052)","(video, 0.041)","(gun, 0.041)","(sick, 0.034)","(shout, 0.032)","(bill, 0.034)","(video, 0.042)","(michelle, 0.047)"
4,"(video, 0.015)","(usa, 0.023)","(girl, 0.045)","(obama, 0.04)","(obama, 0.035)","(man, 0.034)","(patron, 0.031)","(love, 0.032)","(come, 0.037)","(punish, 0.044)"
5,"(get, 0.013)","(today, 0.023)","(call, 0.042)","(thug, 0.026)","(massacre, 0.03)","(girl, 0.033)","(restaurant, 0.031)","(video, 0.029)","(woman, 0.032)","(obama, 0.042)"
6,"(say, 0.013)","(mass, 0.018)","(something, 0.042)","(trump, 0.025)","(shooter, 0.028)","(cop, 0.03)","(decide, 0.03)","(burn, 0.028)","(isi, 0.03)","(allow, 0.041)"
7,"(love, 0.013)","(time, 0.018)","(sick, 0.041)","(say, 0.02)","(fbi, 0.027)","(horrify, 0.026)","(russian, 0.03)","(want, 0.027)","(man, 0.029)","(say, 0.04)"


In [16]:
sorted_top10_cluster_dic["2016/07"][:8]

Unnamed: 0,17,29,6,13,4,27,24,5,8,21
0,"(muslim, 0.034)","(politics, 0.176)","(herald, 0.212)","(freedom, 0.239)","(see, 0.255)","(obama, 0.232)","(black, 0.137)","(hillary, 0.213)","(cop, 0.14)","(trump, 0.253)"
1,"(get, 0.019)","(video, 0.168)","(u, 0.208)","(daily, 0.236)","(cop, 0.052)","(politics, 0.103)","(police, 0.115)","(email, 0.061)","(refuse, 0.109)","(dear, 0.065)"
2,"(woman, 0.016)","(jim, 0.038)","(break, 0.036)","(video, 0.056)","(act, 0.046)","(video, 0.038)","(make, 0.05)","(charge, 0.047)","(something, 0.082)","(america, 0.064)"
3,"(cop, 0.015)","(willie, 0.036)","(stand, 0.034)","(black, 0.043)","(girl, 0.043)","(america, 0.031)","(man, 0.046)","(fbi, 0.041)","(get, 0.048)","(donald, 0.047)"
4,"(sick, 0.013)","(schiff, 0.025)","(obama, 0.034)","(police, 0.034)","(woman, 0.041)","(isi, 0.028)","(watch, 0.045)","(bernie, 0.035)","(muslim, 0.048)","(awesome, 0.046)"
5,"(girl, 0.012)","(peter, 0.025)","(blm, 0.032)","(thug, 0.032)","(look, 0.04)","(barack, 0.028)","(white, 0.044)","(state, 0.034)","(thug, 0.042)","(kid, 0.043)"
6,"(medium, 0.012)","(economic, 0.024)","(huge, 0.029)","(cop, 0.03)","(fast, 0.036)","(gowdy, 0.023)","(cop, 0.042)","(comey, 0.03)","(arm, 0.037)","(voter, 0.038)"
7,"(find, 0.012)","(discover, 0.022)","(enrage, 0.028)","(hillary, 0.027)","(behind, 0.033)","(blast, 0.019)","(support, 0.041)","(patriot, 0.027)","(citizen, 0.036)","(hillary, 0.037)"


In [17]:
sorted_top10_cluster_dic["2016/08"][:8]

Unnamed: 0,8,4,12,16,21,1,25,10,9,3
0,"(watch, 0.018)","(freedom, 0.202)","(u, 0.159)","(trump, 0.171)","(hillary, 0.202)","(obama, 0.243)","(muslim, 0.238)","(politics, 0.218)","(cop, 0.261)","(email, 0.145)"
1,"(man, 0.017)","(daily, 0.202)","(herald, 0.153)","(say, 0.123)","(hilarious, 0.036)","(politics, 0.116)","(girl, 0.037)","(video, 0.157)","(thug, 0.057)","(wikileaks, 0.11)"
2,"(video, 0.017)","(black, 0.048)","(black, 0.072)","(watch, 0.051)","(campaign, 0.031)","(video, 0.03)","(sharia, 0.033)","(willie, 0.044)","(see, 0.044)","(hillary, 0.09)"
3,"(woman, 0.016)","(video, 0.041)","(life, 0.072)","(hillary, 0.044)","(health, 0.03)","(michelle, 0.029)","(hand, 0.025)","(jim, 0.044)","(call, 0.042)","(release, 0.079)"
4,"(make, 0.016)","(thug, 0.032)","(matter, 0.065)","(donald, 0.04)","(song, 0.026)","(treason, 0.028)","(law, 0.024)","(soros, 0.034)","(sick, 0.034)","(fbi, 0.07)"
5,"(find, 0.012)","(white, 0.031)","(obama, 0.033)","(ad, 0.027)","(voter, 0.023)","(presidency, 0.026)","(christian, 0.023)","(medium, 0.029)","(black, 0.03)","(break, 0.063)"
6,"(get, 0.012)","(muslim, 0.028)","(video, 0.028)","(obama, 0.026)","(wicked, 0.02)","(prisoner, 0.025)","(woman, 0.023)","(mainstream, 0.028)","(woman, 0.029)","(watch, 0.059)"
7,"(police, 0.012)","(obama, 0.028)","(make, 0.027)","(trend, 0.025)","(watch, 0.019)","(isi, 0.024)","(medium, 0.021)","(martial, 0.028)","(man, 0.029)","(pervert, 0.04)"
