# 当我们谈论旅行的时候，我们在谈论什么？

- 知乎爬虫，关键词“旅行，旅游，景点”全部问题及回答
- 小红书IP被限制登录了，遂转战知乎
- 爬虫使用脚本MediaCrawler，语义情感分析使用SnowNLP（中文模型），停用词使用中文停用词表、哈工大停用词表、机器智能实验室停用词库
- 交互使用dash app，不确定是否适配网页版
- plotly rendering设置为notebook，网页端需要更改
- 所有图表颜色样式default，需要更改
- 目前进展： data description

In [1]:
import json
import pandas as pd
import altair as alt
alt.data_transformers.enable('vegafusion')
import numpy as np
from snownlp import SnowNLP
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import plotly.express as px
from plotly import io as pio
pio.renderers.default = 'notebook'
import plotly.graph_objects as go
import dash
from dash import dcc, html, Input, Output
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
from sklearn.decomposition import PCA
import re

## Data Loading & Cleaning

In [2]:
# Load the crawled comment and content records from their JSON dumps.
comments_df, contents_df = (
    pd.read_json(json_path, encoding='utf-8')
    for json_path in ('data/comments.json', 'data/contents.json')
)

In [3]:
# Keep only the columns the analyses below rely on, then drop rows that lack
# the text fields required for sentiment scoring.
#
# The comment selection is assigned back (as a bare expression its result was
# discarded, so no column was ever removed). `.copy()` makes each result an
# independent frame, so the column assignments in later cells do not write
# into a slice of the raw data.
comment_columns = ['content', 'sub_comment_count', 'publish_time', 'content_id']
content_columns = [
    'content_id', 'content_type', 'content_text', 'title', 'desc',
    'created_time', 'voteup_count', 'comment_count',
]

comments_df = comments_df[comment_columns].dropna(subset=['content']).copy()
contents_df = contents_df[content_columns].dropna(subset=['content_text', 'desc']).copy()

## Time Distribution

In [4]:
# Parse each timestamp column once, then derive the calendar date and the year
# from the parsed values.
# NOTE(review): if these columns hold bare epoch integers, pd.to_datetime needs
# an explicit `unit=` — confirm against the crawler's output format.
comment_timestamps = pd.to_datetime(comments_df['publish_time'])
content_timestamps = pd.to_datetime(contents_df['created_time'])

comments_df['publish_date'] = comment_timestamps.dt.date
comments_df['year'] = comment_timestamps.dt.year

contents_df['created_date'] = content_timestamps.dt.date
contents_df['year'] = content_timestamps.dt.year

In [None]:
# build dash app
app = dash.Dash(__name__)

# Dataset choices offered by the first dropdown: (visible label, callback value).
dataset_choices = (
    ("Comment Data", "comments"),
    ("Content Data", "contents"),
)

# app layout: page title, dataset selector, year selector (its options are
# supplied by a callback) and the heatmap graph
app.layout = html.Div([
    html.H1("Heatmap: Content & Comment"),
    dcc.Dropdown(
        id="dataset-dropdown",
        options=[{"label": label, "value": value} for label, value in dataset_choices],
        value="comments",
        placeholder="Select Dataset"
    ),
    dcc.Dropdown(id="year-dropdown", placeholder="Select Year"),
    dcc.Graph(id="heatmap"),
])

# call: updated year selection
@app.callback(
    Output("year-dropdown", "options"),
    Output("year-dropdown", "value"),
    Input("dataset-dropdown", "value")
)
def update_year_options(selected_dataset):
    """Populate the year dropdown for the dataset chosen in the first dropdown.

    Args:
        selected_dataset: ``"comments"`` selects ``comments_df``; any other
            value selects ``contents_df``.

    Returns:
        A ``(options, value)`` pair: the dropdown options (one per distinct
        year, ascending) and the year to preselect (the earliest one). When
        the dataset has no usable year, returns ``([], None)`` instead of
        raising on an empty list.
    """
    source_df = comments_df if selected_dataset == "comments" else contents_df

    # Drop missing years (unparseable timestamps) and convert the numpy scalars
    # to plain ints so labels read "2020" rather than "2020.0" and the values
    # serialize as ordinary JSON numbers.
    years = sorted(int(year) for year in source_df['year'].dropna().unique())
    if not years:
        return [], None

    options = [{"label": str(year), "value": year} for year in years]
    return options, years[0]

# call: update heatmap
@app.callback(
    Output("heatmap", "figure"),
    Input("dataset-dropdown", "value"),
    Input("year-dropdown", "value")
)
def update_heatmap(selected_dataset, selected_year):
    """Draw the date-vs-count density heatmap for one dataset and one year.

    Args:
        selected_dataset: ``"comments"`` plots ``sub_comment_count`` against
            ``publish_date`` from ``comments_df``; any other value plots
            ``voteup_count`` against ``created_date`` from ``contents_df``.
        selected_year: Year used to filter the chosen frame, or ``None`` when
            the year dropdown is empty or has been cleared.

    Returns:
        The plotly figure, or ``dash.no_update`` when no year is selected so
        the previously rendered figure is left in place.
    """
    if selected_year is None:
        return dash.no_update

    if selected_dataset == "comments":
        source_df = comments_df
        date_column, value_column = "publish_date", "sub_comment_count"
        axis_labels = {'publish_date': 'Date', 'sub_comment_count': 'Comment Count'}
        title = f"Comment Distribution by Date - {selected_year}"
    else:
        source_df = contents_df
        date_column, value_column = "created_date", "voteup_count"
        # Label keys must be the column names; a key that matches no column
        # is ignored by plotly express.
        axis_labels = {'created_date': 'Date', 'voteup_count': 'Likes'}
        title = f"Content Distribution by Date - {selected_year}"

    filtered_df = source_df[source_df['year'] == selected_year]
    fig = px.density_heatmap(
        filtered_df,
        x=date_column,
        y=value_column,
        labels=axis_labels,
        nbinsx=365,
        title=title
    )
    fig.update_layout(
        xaxis=dict(tickformat="%b-%d", title="Date"),
        yaxis_title="Value",
        title_x=0.5,
    )
    return fig

# run app
# Starts the Dash development server with debug mode on; the guard keeps the
# server from starting when this code is imported rather than executed.
# NOTE(review): newer Dash releases expose `app.run(...)` and deprecate
# `run_server` — confirm against the installed Dash version.
if __name__ == '__main__':
    app.run_server(debug=True)


# Content Emotion Analysis

In [5]:
# snownlp emotion func
def analyze_sentiment(text):
    """Return the SnowNLP sentiment score of ``text``.

    Args:
        text: The text to score. Non-string values (``None``, NaN, numbers)
            are treated as missing instead of raising ``AttributeError``.

    Returns:
        ``SnowNLP(text).sentiments`` for non-blank strings; ``None`` when the
        input is not a string or contains only whitespace.
    """
    if not isinstance(text, str) or not text.strip():
        return None
    return SnowNLP(text).sentiments

### Content Analysis

In [6]:
# Sentiment of the description text attached to each content row.
desc_scores = contents_df['desc'].apply(analyze_sentiment)
contents_df['desc_sentiment'] = desc_scores

# Sentiment of the body text. Missing bodies are replaced by '' first (a
# fillna, not a dropna); analyze_sentiment returns None for such blank text.
body_text = contents_df['content_text'].fillna('')
contents_df['content_text'] = body_text
contents_df['sentiment'] = body_text.apply(analyze_sentiment)

In [None]:
# Preview the first rows of the per-content text and sentiment columns.
content_preview_columns = ['content_id', 'title', 'desc_sentiment', 'content_text', 'sentiment']
contents_df[content_preview_columns].head()

### Comment Analysis

In [7]:
# comment analysis
# Score every comment body with analyze_sentiment and store the result in a
# new `sentiment` column.
comments_df['sentiment'] = comments_df['content'].apply(analyze_sentiment)

In [None]:
# Preview the first comment rows next to their sentiment score.
comment_preview_columns = ['content', 'sentiment']
comments_df[comment_preview_columns].head()

In [None]:
# visualization
app = dash.Dash(__name__)

# Years available in the content data: missing values removed, converted to
# plain ints and sorted ascending so the dropdown lists them in order.
available_years = sorted(int(year) for year in contents_df['year'].dropna().unique())

app.layout = html.Div([
    html.H1("Content & Comment Emotion Analysis"),

    # year selection dropdown; defaults to the earliest year, or to no
    # selection when the content data has no usable year
    dcc.Dropdown(
        id="year-selector",
        options=[{'label': str(year), 'value': year} for year in available_years],
        value=available_years[0] if available_years else None,
        multi=False
    ),

    dcc.Graph(id='content-sentiment-chart'),

    dcc.Graph(id='comment-sentiment-chart'),
])


# update by year
@app.callback(
    [Output('content-sentiment-chart', 'figure'),
     Output('comment-sentiment-chart', 'figure')],
    [Input('year-selector', 'value')]
)
def update_charts(selected_year):
    """Build the content and comment sentiment charts for one year.

    Args:
        selected_year: Year used to filter both ``contents_df`` and
            ``comments_df``.

    Returns:
        A ``(content_figure, comment_figure)`` pair: a histogram of
        ``desc_sentiment`` split into Positive (> 0.5) and Negative, and a
        scatter of comment ``sentiment`` against ``content_id``.
    """
    # Rows without a description score are dropped before categorizing, so a
    # missing score is neither labelled "Negative" nor compared as None.
    # .copy() gives an independent frame, so adding the category column below
    # does not write into a slice of contents_df.
    content_year_df = (
        contents_df[contents_df['year'] == selected_year]
        .dropna(subset=['desc_sentiment'])
        .copy()
    )
    comments_year_df = comments_df[comments_df['year'] == selected_year]

    # emotion recategory
    content_year_df['sentiment_category'] = np.where(
        content_year_df['desc_sentiment'] > 0.5, 'Positive', 'Negative'
    )

    # content
    content_sentiment_fig = px.histogram(
        content_year_df,
        x='desc_sentiment',
        color='sentiment_category',
        title=f"Content Emotion Analysis ({selected_year})",
        labels={'desc_sentiment': 'Emotion Value'},
        marginal="box"
    )

    # comment
    comment_sentiment_fig = px.scatter(
        comments_year_df,
        x='sentiment',
        y='content_id',
        color='sentiment',
        title=f"Comment Emotion Analysis ({selected_year})",
        labels={'sentiment': 'Emotion Value', 'content_id': 'Content ID'},
        hover_data=['content_id', 'publish_date', 'sentiment']
    )

    return content_sentiment_fig, comment_sentiment_fig


# launch
# Starts the Dash development server for the sentiment dashboard with debug
# mode on.
# NOTE(review): newer Dash releases expose `app.run(...)` and deprecate
# `run_server` — confirm against the installed Dash version.
if __name__ == "__main__":
    app.run_server(debug=True)


In [None]:
# histogram
# category func
def categorize_sentiment(df, sentiment_column):
    """Bucket a sentiment score column into Negative / Neutral / Positive.

    Scores up to and including 0.3 are Negative, scores above 0.3 up to and
    including 0.7 are Neutral, anything higher is Positive. The result is
    written to ``df['sentiment_category']`` in place (replacing an existing
    column of that name) and the same frame is returned.
    """
    bin_edges = [float('-inf'), 0.3, 0.7, float('inf')]
    bin_names = ['Negative', 'Neutral', 'Positive']
    scores = df[sentiment_column]
    df['sentiment_category'] = pd.cut(scores, bins=bin_edges, labels=bin_names)
    return df

In [None]:
# One category histogram per sentiment column. Each figure is built right
# after its categorization because categorize_sentiment writes to the same
# `sentiment_category` column every time it is called.
def _category_histogram(frame, title):
    """Histogram of ``frame['sentiment_category']`` with a box-plot marginal."""
    return px.histogram(
        frame,
        x='sentiment_category',
        color='sentiment_category',
        title=title,
        labels={'sentiment_category': 'Emotion Category'},
        marginal="box"
    )


# content text
contents_df = categorize_sentiment(contents_df, 'sentiment')
contents_text_hist = _category_histogram(
    contents_df, "Content Text Emotion Distribution Histogram"
)

# content desc
contents_df = categorize_sentiment(contents_df, 'desc_sentiment')
contents_desc_hist = _category_histogram(
    contents_df, "Content Description Emotion Distribution Histogram"
)

# comment
comments_df = categorize_sentiment(comments_df, 'sentiment')
comments_hist = _category_histogram(
    comments_df, "Comment Emotion Distribution Histogram"
)

for figure in (contents_text_hist, contents_desc_hist, comments_hist):
    figure.show()

In [None]:
# wordcloud

# func
# stopwords
def cut_words_without_stopwords(text, stopwords):
    """Segment ``text`` with jieba and return the tokens worth keeping.

    A token is kept when it is longer than one character and is not contained
    in ``stopwords``. Tokens are returned in the order jieba yields them.
    """
    kept_tokens = []
    for token in jieba.cut(text):
        if len(token) <= 1 or token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

def load_stopwords(file_path):
    """Read a stop-word list (one word per line) into a set.

    The file is decoded as UTF-8 with an optional byte-order mark, so a BOM
    does not end up glued to the first word. Surrounding whitespace is
    stripped from every line and blank lines are skipped, so entries with
    trailing spaces still match tokens and '' is never a stop word.

    Args:
        file_path: Path of the stop-word file.

    Returns:
        The set of distinct, non-empty stop words.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        return {word for word in stripped_lines if word}

def generate_wordcloud(text_data, title, stopwords, font_path="CNFont.ttf"):
    """Render a word cloud for a collection of Chinese texts.

    Args:
        text_data: Iterable of texts; items that are not strings (for example
            NaN from a pandas column) are ignored.
        title: Title shown above the plot.
        stopwords: Collection of words to exclude after segmentation.
        font_path: Font file used to draw the words; defaults to the
            ``CNFont.ttf`` the notebook has always used.

    Returns:
        None. The cloud is shown with matplotlib. When there is nothing to
        draw (no text, or every token filtered out) a notice is printed and
        no figure is created, instead of letting WordCloud raise on empty
        input.
    """
    raw_text = " ".join(item for item in text_data if isinstance(item, str))
    if not raw_text.strip():
        print(f"Skipping word cloud '{title}': no text to render.")
        return

    # word split (CN)
    text = " ".join(cut_words_without_stopwords(raw_text, stopwords))
    if not text.strip():
        print(f"Skipping word cloud '{title}': no words left after stop-word filtering.")
        return

    wordcloud = WordCloud(
        font_path=font_path,
        width=800,
        height=400,
        background_color="white",
        max_words=100,
        min_font_size=10
    ).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.show()

# Load the stop-word list once; it is passed to every generate_wordcloud call.
stopwords = load_stopwords("cn_stopwords.txt")

In [None]:
# Split contents (by description score) and comments (by comment score) at
# the 0.5 threshold. Rows whose score is missing satisfy neither comparison
# and therefore land in neither subset.
content_scores = contents_df['desc_sentiment']
comment_scores = comments_df['sentiment']

positive_content = contents_df[content_scores > 0.5]
negative_content = contents_df[content_scores <= 0.5]

positive_comment = comments_df[comment_scores > 0.5]
negative_comment = comments_df[comment_scores <= 0.5]

# wordcloud generate: description, body text and comments for each polarity
for polarity, content_subset, comment_subset in (
    ("Positive", positive_content, positive_comment),
    ("Negative", negative_content, negative_comment),
):
    generate_wordcloud(
        content_subset['desc'], f"{polarity} Content Title Description Wordcloud", stopwords
    )
    generate_wordcloud(
        content_subset['content_text'], f"{polarity} Content Text Wordcloud", stopwords
    )
    generate_wordcloud(
        comment_subset['content'], f"{polarity} Comment Wordcloud", stopwords
    )


## Clustering Analysis

In [None]:
# input: every non-missing content description followed by every non-missing
# comment body
desc_texts = contents_df['desc'].dropna().tolist()
comment_texts = comments_df['content'].dropna().tolist()
texts = desc_texts + comment_texts

stopwords = load_stopwords("cn_stopwords.txt")

# Chinese numeral characters consulted by preprocess_text.
chinese_digits = list("一二三四五六七八九十百千万")

def preprocess_text(text):
    """Clean one document and return its tokens joined by single spaces.

    Steps: strip digit runs, strip Latin-letter runs, segment with jieba,
    then drop tokens that are pure Chinese numerals, stop words, or a single
    character long.

    Only tokens made up *entirely* of numeral characters are removed. Dropping
    every token that merely contains one would also discard ordinary words
    and place names such as 三亚, 四川 or 九寨沟.

    Args:
        text: Raw document text.

    Returns:
        The surviving tokens joined by spaces (possibly an empty string).
    """
    # remove digits
    text = re.sub(r'\d+', '', text)
    # remove EN
    text = re.sub(r'[a-zA-Z]+', '', text)

    kept_words = []
    for word in jieba.cut(text):
        if len(word) <= 1 or word in stopwords:
            continue
        # remove CN digits (e.g. 十万, 三百) but keep words that only contain one
        if all(char in chinese_digits for char in word):
            continue
        kept_words.append(word)
    return " ".join(kept_words)

# Clean every document and drop the ones left empty: an empty document
# becomes an all-zero TF-IDF row that carries no information for clustering.
texts = [cleaned for cleaned in map(preprocess_text, texts) if cleaned.strip()]

# initialize TF-IDF
tfidf = TfidfVectorizer(max_features=5000)

# TF-IDF feature matrix
text_features = tfidf.fit_transform(texts)

# feature words
feature_names = tfidf.get_feature_names_out()
print(feature_names[:30])

# k: fit one model per candidate cluster count and score it. The fitted
# models are kept so the winner can be reused without fitting it again.
# NOTE(review): silhouette_score works on pairwise distances; for a large
# corpus consider its `sample_size` argument.
sil_scores = []
fitted_models = {}
k_values = range(2, 10)
for k in k_values:
    candidate = KMeans(n_clusters=k, random_state=42)
    labels = candidate.fit_predict(text_features)
    sil_scores.append(silhouette_score(text_features, labels))
    fitted_models[k] = candidate

# visualization
plt.plot(k_values, sil_scores, marker='o')
plt.title("Silhouette Scores for Different K")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Silhouette Score")
plt.show()

# best k: the first candidate reaching the highest silhouette score
best_k = k_values[sil_scores.index(max(sil_scores))]
kmeans = fitted_models[best_k]
clusters = kmeans.labels_

clustered_texts = pd.DataFrame({'text': texts, 'cluster': clusters})
clustered_texts.head()

In [None]:
# PCA: project the TF-IDF features onto two components for plotting.
# NOTE(review): `.toarray()` materializes the whole sparse matrix as a dense
# array — confirm that this fits in memory for the full corpus.
pca = PCA(n_components=2)
dense_features = text_features.toarray()
reduced_features = pca.fit_transform(dense_features)

reduced_cluster_df = pd.DataFrame(reduced_features, columns=["PC1", "PC2"]).assign(
    cluster=clusters
)

# Scatter of the two components, coloured by cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=reduced_cluster_df, x="PC1", y="PC2", hue="cluster", palette="viridis", ax=ax
)
ax.set_title("Cluster Visualization")
plt.show()

# Regression & Modeling

In [8]:
# sub comment count sum by content id
sub_comment_sum = comments_df.groupby('content_id', as_index=False)['sub_comment_count'].sum()

# Remove any `sub_comment_count` column left by a previous run of this cell
# before merging; otherwise a re-run yields `sub_comment_count_x` /
# `sub_comment_count_y` and the plain column name used later no longer exists.
# NOTE(review): contents with no matching comment rows get NaN here and are
# later removed by dropna() — confirm whether they should count as 0 instead.
contents_df = (
    contents_df
    .drop(columns=['sub_comment_count'], errors='ignore')
    .merge(sub_comment_sum, on='content_id', how='left')
)

<bound method NDFrame.head of       content_id content_type  \
0     2983182867       answer   
1     3554122723       answer   
2      106211663       answer   
3      633685987       answer   
4       31530201       answer   
...          ...          ...   
1130   163740657       answer   
1131  2978932676       answer   
1132   136047390      article   
1133  2850330205       answer   
1134   580250300       answer   

                                           content_text  \
0     旅游就是累的。。。全世界旅游都是累的。。。因为旅游和度假是两个东西。。。举个例子好了，巴黎，...   
1     有个朋友，女孩，朋友圈全是旅行。仿佛她世界中只有风景，美照、美食和礼物。突然有一天，她的朋友...   
2     1.我就是那种家庭条件一般，还喜欢频繁到处旅游，还喜欢发微博晒的大学生。2.我旅游没用多少家...   
3     本回答发布于2019.3.27，最新更新于2020.5.21五一假期我又去大西北了，由于时间...   
4     真的能，而且改变很大。 旅行让人变得谦逊、包容、乐观、坚韧，表现在外就是一种见过大世面的气质...   
...                                                 ...   
1130  先说明一下贵州的整体地理情况，贵州整个景点的串联会比较分散，因为它并不像旁边云南等地可以昆明...   
1131  特种兵式旅游，不仅仅是累这么简单。睡 3 小时+日均 9 万步+8 个景点+花费少，想要完成...   
1132  安徽最有名的6大景点，必去旅行目的地,网友强烈推荐的好去处哦！    安

In [9]:
# Model inputs: three engagement counts as features, the description
# sentiment score as the regression target.
features = ['voteup_count', 'comment_count', 'sub_comment_count']
target = 'desc_sentiment'

# Keep only rows where every feature and the target are present.
data = contents_df[[*features, target]].dropna()
X, y = data[features], data[target]

# split into training 80%/test 20% with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [10]:
# Fit a 100-tree random forest regressor (fixed seed) on the training split
# and predict the held-out test split.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)


In [11]:
# Regression metrics on the test split: mean squared error and R².
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')


Mean Squared Error: 0.050538825012269986
R² Score: 0.5797429164686416
