# Bag of Words
## In Class
### English Corpus

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('./text_analysis_twitter.csv', index_col=0)

In [3]:
df

Unnamed: 0,id,screen_name,time,link,text,source
0,881004944714866688,KamalaHarris,2017-07-01T00:22:01-04:00,https://www.twitter.com/KamalaHarris/statuses/...,Republicans are trying to steal health care fr...,Sprout Social
1,881002538341986304,MarcVeasey,2017-07-01T00:12:28-04:00,https://www.twitter.com/kylegriffin1/statuses/...,RT @kylegriffin1 Rep. Mark Veasay has introduc...,Twitter for Android
2,881001393284419584,RepPaulTonko,2017-07-01T00:07:55-04:00,https://www.twitter.com/RepPaulTonko/statuses/...,A top U.S. science office went dark tonight. N...,Twitter for iPhone
3,881000301007958016,clairecmc,2017-07-01T00:03:34-04:00,https://www.twitter.com/clairecmc/statuses/881...,So the fundraising quarter ends in an hour in ...,Twitter for iPad
4,881015727540637696,tedcruz,2017-07-01T01:04:52-04:00,https://www.twitter.com/larry_kudlow/statuses/...,RT @larry_kudlow Important story @SenTedCruz @...,Twitter for iPhone
...,...,...,...,...,...,...
299995,939743413909549056,RepTedLieu,2017-12-10T01:27:44-05:00,https://www.twitter.com/ReadyLA/statuses/93956...,RT @ReadyLA REPORT FIRE DAMAGE from #CreekFire...,Twitter for iPhone
299996,939743003203301376,RepTedLieu,2017-12-10T01:26:06-05:00,https://www.twitter.com/LAFDtalk/statuses/9397...,RT @LAFDtalk Once a wildfire has burned throug...,Twitter for iPhone
299997,939741567706730497,JuliaBrownley26,2017-12-10T01:20:23-05:00,https://www.twitter.com/SCE/statuses/939739240...,"RT @SCE Update: As of 8:30pm, Saturday, about ...",Twitter Web Client
299998,939739240501596160,SteveKnight25,2017-12-10T01:11:09-05:00,https://www.twitter.com/santaclarita/statuses/...,RT @santaclarita #RyeFire Evening Update (12/9...,Twitter for iPhone


In [4]:
df.text[:100].sum()

'Republicans are trying to steal health care from 22 million people in order to help the wealthiest 0.4%.RT @kylegriffin1 Rep. Mark Veasay has introduced a bill to ensure no taxpayer funds will be used to support Trump\'s Election Commission. http://pbs.twimg.com/media/DDm3PgaXcAAFBIo.jpgA top U.S. science office went dark tonight. Not good. #sciencematters https://twitter.com/elleabella1112/status/880870684485984256 QT @elleabella1112 science division out. mic drop. http://pbs.twimg.com/media/DDl7NT9XoAEeJxb.jpgSo the fundraising quarter ends in an hour in MO. I would really appreciate a $5 contribution. Thank you so much. http://clairemccaskill.com/donateRT @larry_kudlow Important story @SenTedCruz @NRO. http://www.nationalreview.com/article/449134/ted-cruz-conservative-health-care-proposal?utm_campaign=trueanthem&utm_content=595717ee04d3010ccd0c715e&utm_medium=social&utm_source=twitterThreats already higher than all of 2016!  Resist! http://thehill.com/homenews/house/340309-threats-

In [5]:
tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()

nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/fanhongrui/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fanhongrui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fanhongrui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+','',text)
    tokens = tokenizer.tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token != 'rt' and token not in stop_words]
    return tokens

In [7]:
df['preprocessed_text'] = df['text'].apply(preprocess_text)

In [8]:
df.head()

Unnamed: 0,id,screen_name,time,link,text,source,preprocessed_text
0,881004944714866688,KamalaHarris,2017-07-01T00:22:01-04:00,https://www.twitter.com/KamalaHarris/statuses/...,Republicans are trying to steal health care fr...,Sprout Social,"[republican, trying, steal, health, care, mill..."
1,881002538341986304,MarcVeasey,2017-07-01T00:12:28-04:00,https://www.twitter.com/kylegriffin1/statuses/...,RT @kylegriffin1 Rep. Mark Veasay has introduc...,Twitter for Android,"[rep, mark, veasay, introduced, bill, ensure, ..."
2,881001393284419584,RepPaulTonko,2017-07-01T00:07:55-04:00,https://www.twitter.com/RepPaulTonko/statuses/...,A top U.S. science office went dark tonight. N...,Twitter for iPhone,"[top, u, science, office, went, dark, tonight,..."
3,881000301007958016,clairecmc,2017-07-01T00:03:34-04:00,https://www.twitter.com/clairecmc/statuses/881...,So the fundraising quarter ends in an hour in ...,Twitter for iPad,"[fundraising, quarter, end, hour, mo, would, r..."
4,881015727540637696,tedcruz,2017-07-01T01:04:52-04:00,https://www.twitter.com/larry_kudlow/statuses/...,RT @larry_kudlow Important story @SenTedCruz @...,Twitter for iPhone,"[important, story]"


In [9]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['preprocessed_text'].apply(lambda x:str(x)))

In [10]:
X.shape

(300000, 50343)

In [11]:
vectorizer.get_feature_names_out()

array(['aa', 'aaa', 'aaaa', ..., '투표하십시오', '한국어', 'ﬂees'], dtype=object)

### Chinese Corpus

In [3]:
import pandas as pd
import numpy as np
import jieba
import nltk
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
df = pd.read_csv('./text_analysis_weibo.csv', index_col=0)

In [5]:
df.head()

Unnamed: 0,标题/微博内容,点赞,转发,评论,账号昵称UID加密,粉丝数,关注数,地域
0,#高校通报教师图书馆打电话声音过大出言不逊#公道自在人心，谣言自在人心 ​​,0,0,0,a2331b38901d62d2d9a20529177ef3b3,0,22,湖北
1,转发C,0,0,0,d6dc4470f51fce93cc0cbad8abf55a75,0,33,广西
2,【#刘雨昕运动者联濛#河山覆冰雪，健儿迎冬奥[金牌]全能唱跳不设限，运动联濛开新年🇨🇳 期待...,0,0,0,372bc4782eb442b88035f920a7c1a68e,6,85,广东
3,丁程鑫//@丁程鑫后援会官博:#丁程鑫[超话]# ✨#丁程鑫 二十成金筑梦鑫世界# 大年初一...,0,0,0,6fe0d482bd3e78a3483e2a1d57f14ef2,75,1012,广东
4,诶，你们真不要脸诶。。。没资格宣传奥运。。。抵制抵制！,0,0,0,872380d71d6ee9130e8b49d331f2baa9,0,10,广东


In [7]:
def remove_nums(text):
    nonums = re.sub('[^\u4e00-\u9fa5]+', '', text)
    return nonums
test = df['标题/微博内容'][0]
remove_nums(test)

'高校通报教师图书馆打电话声音过大出言不逊公道自在人心谣言自在人心'

In [10]:
stopwords = open('stopwords.txt', encoding = 'utf-8').read()
def clean_text(text):
    words = jieba.lcut(text)
    words = [w for w in words if w not in stopwords and w!='\u200b' and w!='转发']
    return ' '.join(words)
test = df['标题/微博内容'][0]
clean_text(test)

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/7d/507_52mn2sv4475h6l1k244c0000gn/T/jieba.cache
Loading model cost 0.333 seconds.
Prefix dict has been built successfully.


'高校 通报 教师 图书馆 打电话 声音 过大 出言不逊 公道 人心 谣言 人心'

In [17]:
words = pd.DataFrame(df['标题/微博内容'].astype(str))
words.head()

Unnamed: 0,标题/微博内容
0,#高校通报教师图书馆打电话声音过大出言不逊#公道自在人心，谣言自在人心 ​​
1,转发C
2,【#刘雨昕运动者联濛#河山覆冰雪，健儿迎冬奥[金牌]全能唱跳不设限，运动联濛开新年🇨🇳 期待...
3,丁程鑫//@丁程鑫后援会官博:#丁程鑫[超话]# ✨#丁程鑫 二十成金筑梦鑫世界# 大年初一...
4,诶，你们真不要脸诶。。。没资格宣传奥运。。。抵制抵制！


In [18]:
def countwords(text):
    nonums = re.sub('[^\u4e00-\u9fa5]+', '', text)
    char_count = len(nonums)
    return char_count
test = df['标题/微博内容'][30]
countwords(test)

5

In [19]:
words

Unnamed: 0,标题/微博内容
0,#高校通报教师图书馆打电话声音过大出言不逊#公道自在人心，谣言自在人心 ​​
1,转发C
2,【#刘雨昕运动者联濛#河山覆冰雪，健儿迎冬奥[金牌]全能唱跳不设限，运动联濛开新年🇨🇳 期待...
3,丁程鑫//@丁程鑫后援会官博:#丁程鑫[超话]# ✨#丁程鑫 二十成金筑梦鑫世界# 大年初一...
4,诶，你们真不要脸诶。。。没资格宣传奥运。。。抵制抵制！
...,...
49995,935天气路况早知道 2022年3月2日 星期三 今日限行尾号3和8 8：00分发布 ...
49996,4月1日起，交通违法记分规则重大调整！ http://t.cn/A66zxfR0 ​​
49997,转发C
49998,全体 甘肃车主注意 4月1日起，全面实施！ ...


In [20]:
words['character_count'] = words['标题/微博内容'].apply(countwords)
words.head()

Unnamed: 0,标题/微博内容,character_count
0,#高校通报教师图书馆打电话声音过大出言不逊#公道自在人心，谣言自在人心 ​​,32
1,转发C,2
2,【#刘雨昕运动者联濛#河山覆冰雪，健儿迎冬奥[金牌]全能唱跳不设限，运动联濛开新年🇨🇳 期待...,61
3,丁程鑫//@丁程鑫后援会官博:#丁程鑫[超话]# ✨#丁程鑫 二十成金筑梦鑫世界# 大年初一...,64
4,诶，你们真不要脸诶。。。没资格宣传奥运。。。抵制抵制！,19


In [23]:
def tag_extraction(text):
    tags = re.findall(r'#.*?#', text)
    for t in tags:
        if '[超话]' in t:
            tags.remove(t)
    return tags
test = tag_extraction(df['标题/微博内容'][0])
print(test)

['#高校通报教师图书馆打电话声音过大出言不逊#']


In [24]:
words['tags'] = words['标题/微博内容'].apply(tag_extraction)
words.head()

Unnamed: 0,标题/微博内容,character_count,tags
0,#高校通报教师图书馆打电话声音过大出言不逊#公道自在人心，谣言自在人心 ​​,32,[#高校通报教师图书馆打电话声音过大出言不逊#]
1,转发C,2,[]
2,【#刘雨昕运动者联濛#河山覆冰雪，健儿迎冬奥[金牌]全能唱跳不设限，运动联濛开新年🇨🇳 期待...,61,"[#刘雨昕运动者联濛#, #运动者联濛#]"
3,丁程鑫//@丁程鑫后援会官博:#丁程鑫[超话]# ✨#丁程鑫 二十成金筑梦鑫世界# 大年初一...,64,[#丁程鑫 二十成金筑梦鑫世界#]
4,诶，你们真不要脸诶。。。没资格宣传奥运。。。抵制抵制！,19,[]


In [25]:
tags_count = []
for tag in words['tags']:
    tc = len(tag)
    tags_count.append(tc)
words['tags_count'] = tags_count
words.head()

Unnamed: 0,标题/微博内容,character_count,tags,tags_count
0,#高校通报教师图书馆打电话声音过大出言不逊#公道自在人心，谣言自在人心 ​​,32,[#高校通报教师图书馆打电话声音过大出言不逊#],1
1,转发C,2,[],0
2,【#刘雨昕运动者联濛#河山覆冰雪，健儿迎冬奥[金牌]全能唱跳不设限，运动联濛开新年🇨🇳 期待...,61,"[#刘雨昕运动者联濛#, #运动者联濛#]",2
3,丁程鑫//@丁程鑫后援会官博:#丁程鑫[超话]# ✨#丁程鑫 二十成金筑梦鑫世界# 大年初一...,64,[#丁程鑫 二十成金筑梦鑫世界#],1
4,诶，你们真不要脸诶。。。没资格宣传奥运。。。抵制抵制！,19,[],0


In [26]:
def wbemoji_extraction(text):
    temtext = text.replace('[', '<')
    temtext = temtext.replace(']', '>')
    weibo_emoji = re.findall(r'<.*?>', temtext)
    for we in weibo_emoji:
        if we == '<超话>':
            weibo_emoji.remove(we)
    return weibo_emoji
test = wbemoji_extraction(words['标题/微博内容'][333])
test

['<doge>', '<doge>']

In [27]:
words['weibo_emoji'] = words['标题/微博内容'].apply(wbemoji_extraction)
words.head()


Unnamed: 0,标题/微博内容,character_count,tags,tags_count,weibo_emoji
0,#高校通报教师图书馆打电话声音过大出言不逊#公道自在人心，谣言自在人心 ​​,32,[#高校通报教师图书馆打电话声音过大出言不逊#],1,[]
1,转发C,2,[],0,[]
2,【#刘雨昕运动者联濛#河山覆冰雪，健儿迎冬奥[金牌]全能唱跳不设限，运动联濛开新年🇨🇳 期待...,61,"[#刘雨昕运动者联濛#, #运动者联濛#]",2,[<金牌>]
3,丁程鑫//@丁程鑫后援会官博:#丁程鑫[超话]# ✨#丁程鑫 二十成金筑梦鑫世界# 大年初一...,64,[#丁程鑫 二十成金筑梦鑫世界#],1,[]
4,诶，你们真不要脸诶。。。没资格宣传奥运。。。抵制抵制！,19,[],0,[]


In [28]:
weibo_emoji_count = []
for we in words['weibo_emoji']:
    wec = len(we)
    weibo_emoji_count.append(wec)
words['weibo_emoji_count'] = weibo_emoji_count
words.head()

Unnamed: 0,标题/微博内容,character_count,tags,tags_count,weibo_emoji,weibo_emoji_count
0,#高校通报教师图书馆打电话声音过大出言不逊#公道自在人心，谣言自在人心 ​​,32,[#高校通报教师图书馆打电话声音过大出言不逊#],1,[],0
1,转发C,2,[],0,[],0
2,【#刘雨昕运动者联濛#河山覆冰雪，健儿迎冬奥[金牌]全能唱跳不设限，运动联濛开新年🇨🇳 期待...,61,"[#刘雨昕运动者联濛#, #运动者联濛#]",2,[<金牌>],1
3,丁程鑫//@丁程鑫后援会官博:#丁程鑫[超话]# ✨#丁程鑫 二十成金筑梦鑫世界# 大年初一...,64,[#丁程鑫 二十成金筑梦鑫世界#],1,[],0
4,诶，你们真不要脸诶。。。没资格宣传奥运。。。抵制抵制！,19,[],0,[],0


In [31]:
def supertalk_extraction(text):
    supertalk = []
    tempst = re.findall(r'#.*?#', text)
    for tst in tempst:
        if '[超话]' in tst:
            supertalk.append(tst)
    return supertalk
test = supertalk_extraction(words['标题/微博内容'][3])
test

['#丁程鑫[超话]#']

In [32]:
words['supertalks'] = words['标题/微博内容'].apply(supertalk_extraction)
words.head()

Unnamed: 0,标题/微博内容,character_count,tags,tags_count,weibo_emoji,weibo_emoji_count,supertalks
0,#高校通报教师图书馆打电话声音过大出言不逊#公道自在人心，谣言自在人心 ​​,32,[#高校通报教师图书馆打电话声音过大出言不逊#],1,[],0,[]
1,转发C,2,[],0,[],0,[]
2,【#刘雨昕运动者联濛#河山覆冰雪，健儿迎冬奥[金牌]全能唱跳不设限，运动联濛开新年🇨🇳 期待...,61,"[#刘雨昕运动者联濛#, #运动者联濛#]",2,[<金牌>],1,[]
3,丁程鑫//@丁程鑫后援会官博:#丁程鑫[超话]# ✨#丁程鑫 二十成金筑梦鑫世界# 大年初一...,64,[#丁程鑫 二十成金筑梦鑫世界#],1,[],0,[#丁程鑫[超话]#]
4,诶，你们真不要脸诶。。。没资格宣传奥运。。。抵制抵制！,19,[],0,[],0,[]


In [33]:
supertalk_count = []
for st in words['supertalks']:
    stc = len(st)
    supertalk_count.append(stc)
words['supertalk_count'] = supertalk_count
words.head()

Unnamed: 0,标题/微博内容,character_count,tags,tags_count,weibo_emoji,weibo_emoji_count,supertalks,supertalk_count
0,#高校通报教师图书馆打电话声音过大出言不逊#公道自在人心，谣言自在人心 ​​,32,[#高校通报教师图书馆打电话声音过大出言不逊#],1,[],0,[],0
1,转发C,2,[],0,[],0,[],0
2,【#刘雨昕运动者联濛#河山覆冰雪，健儿迎冬奥[金牌]全能唱跳不设限，运动联濛开新年🇨🇳 期待...,61,"[#刘雨昕运动者联濛#, #运动者联濛#]",2,[<金牌>],1,[],0
3,丁程鑫//@丁程鑫后援会官博:#丁程鑫[超话]# ✨#丁程鑫 二十成金筑梦鑫世界# 大年初一...,64,[#丁程鑫 二十成金筑梦鑫世界#],1,[],0,[#丁程鑫[超话]#],1
4,诶，你们真不要脸诶。。。没资格宣传奥运。。。抵制抵制！,19,[],0,[],0,[],0


In [46]:
text = words['标题/微博内容'].apply(clean_text)

In [47]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text)

In [49]:
feature_words = vectorizer.get_feature_names_out()

In [50]:
word_freq = dict(zip(feature_words,X.sum(axis=0).A1))

In [56]:
sorted(word_freq.items(),key = lambda x:x[1], reverse=True)

[('发展', 52421),
 ('建设', 38186),
 ('中国', 37685),
 ('企业', 36472),
 ('市场', 35827),
 ('汽车', 30384),
 ('2022', 29315),
 ('增长', 26601),
 ('经济', 25205),
 ('亿元', 22701),
 ('项目', 21717),
 ('国家', 21290),
 ('政策', 21258),
 ('工作', 21089),
 ('10', 20832),
 ('服务', 19591),
 ('新能源', 19181),
 ('同比', 19083),
 ('投资', 19015),
 ('推进', 18856),
 ('行业', 18277),
 ('产业', 17422),
 ('提升', 17384),
 ('产品', 16814),
 ('数据', 16807),
 ('推动', 15882),
 ('2021', 15693),
 ('持续', 15494),
 ('技术', 14423),
 ('nbsp', 14154),
 ('全国', 13998),
 ('全球', 13897),
 ('创新', 13695),
 ('支持', 13655),
 ('疫情', 13574),
 ('11', 13129),
 ('城市', 13074),
 ('需求', 13039),
 ('加快', 12477),
 ('国际', 12346),
 ('发布', 12308),
 ('国内', 12218),
 ('12', 12139),
 ('一个', 12005),
 ('相关', 11992),
 ('实施', 11824),
 ('领域', 11744),
 ('理想', 11640),
 ('影响', 11637),
 ('生产', 11537),
 ('品牌', 11501),
 ('重点', 11477),
 ('平台', 11265),
 ('价格', 10979),
 ('销量', 10853),
 ('科技', 10636),
 ('提供', 10532),
 ('电池', 10103),
 ('情况', 10069),
 ('能力', 10003),
 ('时间', 9990),
 ('进一步', 9953),
 (

## Out Class