# 数据分析

## 词频分析

首先我对所有新闻内容进行了分词, 并对词频进行了分析.

分词的 Python 脚本位于 `./KeywordAnalyser.py`.

用 wordcloud 包建立了词云, 结果如下:

In [None]:
from wordcloud import WordCloud
import sqlite3
import re
import matplotlib.pyplot as plt

# Compiled from an empty pattern; not referenced by the rest of this cell.
PureSymbolRe = re.compile('')

# Total occurrences of every word over the articles created at or after the
# epoch timestamp 1693267200 (2023-08-29 00:00:00 UTC), most frequent first.
conn = sqlite3.connect('newsInfo.db')
cursor = conn.execute(
    "SELECT * FROM ("
    "SELECT word, SUM(count) AS cnt FROM words "
    "LEFT JOIN news ON news.docid = words.docid "
    "WHERE news.created_at >= 1693267200 GROUP BY word"
    ")t ORDER BY cnt DESC"
)

# word -> total count, keeping only words longer than one character.
tags = {word: total for word, total in cursor if len(word) > 1}

In [None]:
# Font file handed to WordCloud (a Windows path; presumably a CJK-capable
# font so the Chinese words render - confirm on other machines).
font_path = 'C:/Windows/Fonts/msyh.ttc'

# Build the cloud from the word -> count table on a 1000x1000 white canvas,
# with max_words set to 100.
cloud_options = dict(
    font_path=font_path,
    width=1000,
    height=1000,
    max_words=100,
    background_color='white',
)
wordcloud = WordCloud(**cloud_options).generate_from_frequencies(tags)

# Display the cloud in a 10x10 inch figure with the axes hidden.
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()


## 情感分析

使用 cemotion 库对部分主题的文章进行了情感分析. 由于爬取的数据包含较多苹果区的文本, 我对标题中包含 iPhone 的文章进行了情感分析:

In [12]:
from cemotion import Cemotion
import sqlite3
import pandas as pd
import re

# Pattern for the image placeholders embedded in article text.
ImagePlaceholderRe = re.compile(r'<_IMAGE_\d+_.*?_/>')

# Sentiment model instance used for the predictions in this cell.
c = Cemotion()

dbconn = sqlite3.connect('newsInfo.db')

# Title, body and creation time of every article whose title matches the
# iPhone LIKE pattern, oldest first.
iphone_rows = dbconn.execute(
    'SELECT news.title, contents.content, news.created_at FROM news '
    'LEFT JOIN contents ON news.docid = contents.docid '
    'WHERE title LIKE "%%iPhone%%" ORDER BY created_at ASC'
).fetchall()

news = pd.DataFrame(iphone_rows, columns=['title', 'content', 'created_at'])
# Drop rows with a missing value (e.g. an article with no joined content row).
news = news.dropna()
# created_at is interpreted as epoch seconds and converted to timestamps.
news['created_at'] = pd.to_datetime(news['created_at'], unit='s')

def ParseNewsContent(content: str) -> list[str]:
    """Split raw article text into cleaned, non-trivial lines.

    Image placeholders are removed, the first line is discarded, every
    remaining line is stripped of surrounding whitespace, and lines of five
    characters or fewer are dropped.

    Args:
        content: Raw article text, possibly containing image placeholders.

    Returns:
        The surviving lines, in their original order. Empty when the text
        has at most one line.
    """
    # Remove image placeholders. Pattern.sub returns a new string, so the
    # result must be assigned back for the removal to take effect.
    content = ImagePlaceholderRe.sub('', content)

    # Skip the first line: per the original author's note it is always ads,
    # the author name or something similarly unimportant.
    body_lines = content.splitlines()[1:]

    # Strip all lines and remove lines too short
    stripped_lines = (line.strip() for line in body_lines)
    return [line for line in stripped_lines if len(line) > 5]

# Replace each article's raw text with its list of cleaned lines.
cleaned_lines = news['content'].apply(ParseNewsContent)
news['content'] = cleaned_lines

def GetPrediction(data) -> float:
    """Return the sentiment score of one article row.

    The row's ``content`` lines are joined with single spaces and passed to
    ``c.predict``; that score is returned unchanged.

    NOTE: an earlier variant also scored the title and blended the two as
    ``title * 0.4 + content * 0.6``; it is disabled, so the title does not
    influence the result.
    """
    article_text = ' '.join(data['content'])
    return c.predict(article_text)


# Score the articles row by row and store the result in a new column.
emotion_scores = news.apply(GetPrediction, axis=1)
news['emotion'] = emotion_scores

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
import datetime

# fontdict passed to every label/title/text call below.
font = {'fontname': 'Microsoft YaHei'}

# Fit a straight line to the emotion score as a function of epoch seconds.
epoch_seconds = news['created_at'].apply(lambda moment: moment.timestamp())
timestamp = np.array(epoch_seconds).reshape(-1, 1)
emotion_column = np.array(news['emotion']).reshape(-1, 1)
model = LinearRegression().fit(X=timestamp, y=emotion_column)

image = plt.figure(figsize=(8, 6))
# One small dot per article, with the fitted line drawn in red on top.
plt.scatter(news['created_at'], news['emotion'], s=1)
plt.plot(news['created_at'], model.predict(timestamp), color='r')
plt.xlabel('时间', font)
plt.ylabel('情感值', font)
plt.title('iPhone 情感值散点图数据补齐版', font)
# Dashed green horizontal line at the mean emotion score.
plt.axhline(y=np.average(news['emotion']), color='g', linestyle='--')

# Translucent vertical markers, each with a text label just below zero
# (presumably the launch dates of the labelled models - confirm).
marked_dates = (
    (datetime.datetime(2020, 10, 14, 0, 0, 0), 'iPhone 12'),
    (datetime.datetime(2021, 9, 14, 0, 0, 0), 'iPhone 13'),
    (datetime.datetime(2022, 9, 8, 0, 0, 0), 'iPhone 14'),
)
for marked_at, label in marked_dates:
    plt.axvline(x=marked_at, color='#00800040', linestyle='-')
    plt.text(marked_at, -0.03, label, font)
plt.show()

## 关注程度

我们统计了 2023 年以来新浪科技对 汽车 (燃油车, 电动车等), 电脑 (笔记本, 台式机组件等), 手机, 物联网设备, 人工智能 等类型的关注度 (涉及该类型的文章占所有文章的比例):

In [None]:
import sqlite3
import matplotlib.pyplot as plt
import numpy as np

# Lower bound on news.created_at, in epoch seconds (2023-01-01 00:00:00 UTC).
StartTimestamp = 1672531200

dbconn = sqlite3.connect('newsInfo.db')

# Every article created at or after StartTimestamp, oldest first.
news_query = (
    f'SELECT news.docid, news.title, news.created_at FROM news '
    f'LEFT JOIN contents ON news.docid = contents.docid '
    f'WHERE news.created_at >= {StartTimestamp} ORDER BY created_at ASC'
)
news = dbconn.execute(news_query).fetchall()

# Keyword lists, one per topic; each is looked up in the words table.
car_keyword = ["汽车", "燃油车", "电动车", "电动汽车", "燃油"]
pc_keyword = ["CPU", "笔记本", "电脑", "主机", "显卡", "GPU"]
game_keyword = ["游戏", "游戏机", "游戏主机"]
mobile_keyword = ["手机", "平板"]
web_keyword = [
    "网站", "网页", "互联网", "TCP", "UDP",
    "运营商", "路由器", "交换机",
]
iot_keyword = [
    "物联网", "智能家居", "智能穿戴", "XR", "AR", "VR",
    "智能手表", "Watch", "扫地机器人",
]
ai_keyword = ["AI", "大模型", "人工智能"]
os_keyword = [
    "操作系统", "Windows", "Linux", "MacOS",
    "iOS", "Android", "HarmonyOS", "OS",
]
health_keyword = ["医疗", "健康", "疫苗", "医学"]

def GetCat(cat: list[str]):
    """Return the docids of recent articles containing any of the keywords.

    An article qualifies when the ``words`` table has a row for it whose
    ``word`` is one of ``cat`` and its ``news.created_at`` is at or after
    ``StartTimestamp``.

    Args:
        cat: Keywords to look up in the ``words`` table.

    Returns:
        A set holding the ``docid`` of every matching article; empty when
        ``cat`` is empty.
    """
    keywords = list(cat)
    if not keywords:
        # Nothing can match; skip the query instead of emitting "IN ()".
        return set()

    # One "?" per keyword. Binding the values keeps them out of the SQL text:
    # the previous '"keyword"' form is identifier syntax in SQL, so a keyword
    # equal to a column name was compared against that column, and a keyword
    # containing a double quote broke the statement.
    placeholders = ','.join('?' for _ in keywords)
    query = (
        'SELECT news.docid FROM words '
        'LEFT JOIN news ON news.docid = words.docid '
        f'WHERE word IN ({placeholders}) AND news.created_at >= ? '
        'GROUP BY news.docid'
    )
    rows = dbconn.execute(query, (*keywords, StartTimestamp))
    return {row[0] for row in rows}

# Set of matching docids for each topic, queried in the listed order.
(
    car_cat, pc_cat, game_cat, mobile_cat, web_cat,
    iot_cat, ai_cat, os_cat, health_cat,
) = map(GetCat, (
    car_keyword, pc_keyword, game_keyword, mobile_keyword, web_keyword,
    iot_keyword, ai_keyword, os_keyword, health_keyword,
))

# Articles that matched none of the topics above.
all_docids = {row[0] for row in news}
matched_docids = set().union(
    car_cat, pc_cat, mobile_cat, game_cat, iot_cat,
    ai_cat, web_cat, health_cat, os_cat,
)
other_cat = all_docids - matched_docids

cat_names = ['汽车', '电脑', '游戏', '手机', '物联网', 'AI', '互联网', '操作系统', '健康', '其他']
# Size of each topic divided by the number of fetched articles, in the same
# order as cat_names.
topic_sets = (
    car_cat, pc_cat, game_cat, mobile_cat, iot_cat,
    ai_cat, web_cat, os_cat, health_cat, other_cat,
)
cat_data = np.array([len(docids) for docids in topic_sets]) / len(news)

In [None]:
# Show the contents of other_cat for manual inspection.
print(other_cat)

In [None]:
import matplotlib

# Switch the default font family to SimHei for this chart.
matplotlib.rcParams['font.family'] = 'SimHei'
graph = plt.figure(figsize=(8, 6))
# One bar per topic, each annotated with its value to two decimals.
bars = plt.bar(cat_names, cat_data)
plt.bar_label(bars, fmt='%.2f')
plt.xlabel('分类', font)
# cat_data holds each topic's share of all fetched articles (count divided by
# len(news)), so the axis is labelled as a share rather than a count.
plt.ylabel('文章占比', font)
plt.title('新闻分类统计', font)
plt.show()