In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go


# visualize twitter data
data_path = "./Datasets/TwitterData_Joined.csv"

import pandas as pd

TwitterData = pd.read_csv(data_path)

# 计算总的用户数量，总的推文数量
total_users = TwitterData['Twitter_Account'].nunique()
total_tweets = TwitterData['Tweet_id'].nunique()

# 机器用户数量
bot_users = TwitterData[TwitterData['Label'] == 0]['Twitter_Account'].nunique()

# 人类用户数量
human_users = TwitterData[TwitterData['Label'] == 1]['Twitter_Account'].nunique()

# 机器推文数量
bot_tweets = TwitterData[TwitterData['Label'] == 0]['Tweet_id'].nunique()

# 人类推文数量
human_tweets = TwitterData[TwitterData['Label'] == 1]['Tweet_id'].nunique()

In [14]:
# 绘制饼图
labels = ['Bot', 'Human']
values = [bot_users, human_users]

# 创建饼图
fig = go.Figure(data=[go.Pie(labels=labels, values=values, textinfo='label+value')])

# 更新布局
fig.update_layout(
    title_text='Twitter Accounts',
    title_x=0.5,  # 中心对齐标题
    margin=dict(l=20, r=20, t=50, b=20),  # 边距设置
    legend=dict(x=0.2, y=0.5)  # 自定义图例位置
)

# 显示图表
fig.show()

In [15]:
# 绘制发推文的数量分布
labels = ['Bot', 'Human']
values = [bot_tweets, human_tweets]

# 创建饼图
fig = go.Figure(data=[go.Pie(labels=labels, values=values, textinfo='label+value')])

# 更新布局
fig.update_layout(
    title_text='Tweets',
    title_x=0.5,  # 中心对齐标题
    margin=dict(l=20, r=20, t=50, b=20),  # 边距设置
    legend=dict(x=0.2, y=0.5)  # 自定义图例位置
)

# 显示图表
fig.show()

In [16]:
# 统计发帖时间分布
TwitterData['Tweet_created_at'] = pd.to_datetime(TwitterData['Tweet_created_at'])

bot_tweets = TwitterData[TwitterData['Label'] == 0]
human_tweets = TwitterData[TwitterData['Label'] == 1]

daily_bot_tweet_count = bot_tweets.groupby(bot_tweets['Tweet_created_at'].dt.date).size().reset_index(name='Bot Count')
daily_human_tweet_count = human_tweets.groupby(human_tweets['Tweet_created_at'].dt.date).size().reset_index(name='Human Count')

# Create subplot with two y-axes
fig = make_subplots(rows=2, cols=1, shared_xaxes=True)

# Add bot tweet line chart to the upper subplot
fig.add_trace(
    go.Scatter(x = daily_bot_tweet_count['Tweet_created_at'], y = daily_bot_tweet_count['Bot Count'], mode ='lines', name ='Bot Tweets'),
    row = 1, col = 1
)

# Add human tweet line chart to the lower subplot
fig.add_trace(
    go.Scatter(x = daily_human_tweet_count['Tweet_created_at'], y = daily_human_tweet_count['Human Count'], mode = 'lines', name = 'Human Tweets'),
    row = 2, col = 1
)

fig.update_yaxes(title_text = 'Bot Tweets', row = 1, col = 1)
fig.update_xaxes(title_text = 'Tweet Creation Date', row = 1, col = 1)

fig.update_yaxes(title_text = 'Human Tweets', row = 2, col = 1)
fig.update_xaxes(title_text = 'Tweet Creation Date', row = 2, col =1 )

fig.update_layout(title = 'Number of Tweets Over Time', showlegend = True)

fig.show()

In [3]:
# ------------------
# Sentiment Analysis
# ------------------
from textblob import TextBlob
import plotly.express as px
import plotly.subplots as sp


def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity < 0:
        return 'negative'
    else:
        return 'neutral'

TwitterData['Sentiment'] = TwitterData['Tweet_text'].apply(get_sentiment)

bot_tweets = TwitterData[TwitterData['Label'] == 0]
human_tweets = TwitterData[TwitterData['Label'] == 1]

# Calculate sentiment distribution for bot and human tweets
bot_sentiment_distribution = bot_tweets.groupby(['Twitter_User_Name', 'Sentiment']).size().unstack().fillna(0)
human_sentiment_distribution = human_tweets.groupby(['Twitter_User_Name', 'Sentiment']).size().unstack().fillna(0)

combined_sentiment_distribution = pd.concat([bot_sentiment_distribution, human_sentiment_distribution], keys=['Bot', 'Human'])

bot_sentiment_distribution.reset_index(inplace=True)
human_sentiment_distribution.reset_index(inplace=True)

fig = sp.make_subplots(rows=2, cols=1, shared_xaxes=False, vertical_spacing = 0.2, subplot_titles=('Bot Tweets', 'Human Tweets'))

# Add grouped bar chart for bot tweets
bot_fig = px.bar(
    bot_sentiment_distribution,
    x='Twitter_User_Name',
    y=['positive', 'neutral', 'negative'],
    color_discrete_map={'positive': '#00ff00', 'neutral': '#888888', 'negative': '#ff0000'},
    barmode='group',
    title='Sentiment Distribution for Bot Tweets',
    labels={'Twitter_User_Name': 'Twitter User', 'value': 'Tweet Count'},
    height=400
)
for data in bot_fig.data:
    fig.add_trace(data, row=1, col=1)

# Add grouped bar chart for human tweets
human_fig = px.bar(
    human_sentiment_distribution,
    x='Twitter_User_Name',
    y=['positive', 'neutral', 'negative'],
    color_discrete_map={'positive': '#00ff00', 'neutral': '#888888', 'negative': '#ff0000'},
    barmode='group',
    title='Sentiment Distribution for Human Tweets',
    labels={'Twitter_User_Name': 'Twitter User', 'value': 'Tweet Count'},
    height=400
)
for data in human_fig.data:
    fig.add_trace(data, row=2, col=1)

fig.update_layout(
    title='Sentiment Distribution for Bot and Human Tweets',
    height=1500,
    showlegend=True
)

fig.show()

In [17]:
# -------------------------
# Tweet Length Distribution
# -------------------------

bot_tweets = TwitterData[TwitterData['Label'] == 0]
human_tweets = TwitterData[TwitterData['Label'] == 1]

bot_tweet_lengths = bot_tweets['Tweet_text'].apply(len)
human_tweet_lengths = human_tweets['Tweet_text'].apply(len)

fig_bot = px.histogram(bot_tweet_lengths, nbins=30, title='Tweet Length Distribution for Bots', labels={'value': 'Tweet Length', 'count': 'Frequency'})
fig_bot.update_traces(marker_color='orangered')

fig_human = px.histogram(human_tweet_lengths, nbins=30, title='Tweet Length Distribution for Humans', labels={'value': 'Tweet Length', 'count': 'Frequency'})
fig_human.update_traces(marker_color='blue')

fig_bot.show()
fig_human.show()

In [19]:
# -----------------
# Location Analysis
# -----------------
TwitterUsers = pd.read_csv("./Datasets/Twitter_Users.csv", encoding='latin1')

df = TwitterUsers.dropna()
df.reset_index(drop=True, inplace=True)
data = pd.read_csv("./Datasets/TwitterData_FE.csv",error_bad_lines=False)

human_count = 0
bot_count = 0

for i in df["Twitter_User_Name"]:
  k = 0
  for j in data["Twitter_User_Name"]:
    if(i == j):
      if(data["Label"][k] == 0.0):
        bot_count += 1
        break
      else:
        human_count += 1
        break
    k += 1
print(human_count)
print(bot_count)

numbers = [human_count , bot_count]
labels = ['Human Location', 'Bot Location']

data = {'Count': numbers, ' ': labels}
df = pd.DataFrame(data)

colors = ['blue', 'orange']

fig = px.bar(df, x=' ', y='Count', text='Count', title='Location Count',color=colors)
fig.show()


The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.





32
21
