In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime

In [None]:
# Load the tweet-level metadata CSV that lives under data/write-folder.
tweets_df = pd.read_csv(filepath_or_buffer='data/write-folder/tweet_metadata.csv')

# Preview the first rows to confirm the file parsed as expected.
tweets_df.head()

In [None]:
# Copy of the tweet table ordered by ascending 'followers_count'.
# NOTE(review): this requires tweet_metadata.csv to carry a 'followers_count'
# column -- confirm; no later cell visible here reads tweets_df_sorted.
tweets_df_sorted = tweets_df.sort_values('followers_count', ascending=True)

In [None]:
# Load the user-profile CSV that lives under data/write-folder.
users_df = pd.read_csv(filepath_or_buffer='data/write-folder/twitter_user.csv')

# Preview the first rows to confirm the file parsed as expected.
users_df.head()

In [None]:
# Order users from fewest to most followers (ascending 'followers_count').
users_df_sorted = users_df.sort_values(by='followers_count')

# sort_values() places rows with a missing 'followers_count' at the end, so
# taking tail() directly would report them as the most-followed accounts.
# Rank only the users whose follower count is known.
users_df_ranked = users_df_sorted.dropna(subset=['followers_count'])

# The 50 accounts with the most followers (largest values are last).
users_df_mostpopular = users_df_ranked.tail(50)

# The 50 accounts with the fewest followers (smallest values are first).
users_df_leastpopular = users_df_ranked.head(50)

# Original, inconsistently spelled name kept as an alias for existing callers.
user_df_leastpopular = users_df_leastpopular


In [None]:
# Inner-join the tweets to the most-followed users: only tweets whose
# 'author_id' equals an 'id' in users_df_mostpopular are kept, and each one
# gains that user's profile columns.
popular_tweets_df = pd.merge(
    tweets_df,
    users_df_mostpopular,
    left_on='author_id',
    right_on='id',
)



In [None]:
# Attach each author's 'screen_name' from the full user table. Columns that
# already exist on the left keep their name; clashing columns coming from
# users_df receive the '_user' suffix.
popular_tweets_df = popular_tweets_df.merge(
    users_df[['id', 'screen_name']],
    left_on='author_id',
    right_on='id',
    suffixes=('', '_user')
)

# Drop the redundant 'id_user' column. DataFrame.drop() returns a new frame,
# so the result has to be assigned back for the column to actually go away.
# errors='ignore' keeps the cell working when the merge produced no 'id_user'
# column (i.e. the left frame had no 'id' column for the right one to clash with).
popular_tweets_df = popular_tweets_df.drop(columns=['id_user'], errors='ignore')

# Display the updated DataFrame
popular_tweets_df.head()

In [None]:
# Keep only the first occurrence of any column label that appears more than once.
popular_tweets_df = popular_tweets_df.loc[:, ~popular_tweets_df.columns.duplicated()]

# The suffixed copy created by the screen_name merge is named
# 'screen_name_user', which is a *different* label from 'screen_name', so the
# duplicated() filter above never removes it. Drop it by name instead;
# errors='ignore' makes this a no-op when the column is absent.
# NOTE(review): assumes 'screen_name_user' carries the same values as
# 'screen_name' (both come from users_df joined on the same key) -- confirm.
popular_tweets_df = popular_tweets_df.drop(columns=['screen_name_user'], errors='ignore')

# Display the updated DataFrame
popular_tweets_df.head()

In [None]:
# Popular users' tweets ranked from most to fewest retweets.
most_retweeted_tweets_df = popular_tweets_df.sort_values(
    'retweet_count',
    ascending=False,
)

# Show the top of the ranking.
most_retweeted_tweets_df.head()

In [None]:
# Popular users' tweets ranked from most to fewest favorites.
most_liked_tweets_df = popular_tweets_df.sort_values(
    'favorite_count',
    ascending=False,
)

# Show the top of the ranking.
most_liked_tweets_df.head()


In [None]:
# Popular users' tweets ranked from most to fewest quotes.
most_quoted_tweets_df = popular_tweets_df.sort_values(
    'quote_count',
    ascending=False,
)

# Show the top of the ranking.
most_quoted_tweets_df.head()

In [None]:
# Rank rows by 'quote_count' (highest first) so that, for every 'id', the row
# met first is the one with the largest quote count.
quote_ranked_df = popular_tweets_df.sort_values(by='quote_count', ascending=False)

# Keep that first row per 'id' and discard the remaining rows sharing the id.
# NOTE(review): 'id' is presumably the user id brought in by the user merge -- confirm.
unique_users_df = quote_ranked_df.drop_duplicates(subset='id', keep='first')

# Show the top entries.
unique_users_df.head()

In [None]:
# Number of rows in popular_tweets_df per 'author_id', most frequent author first.
popular_author_ids = popular_tweets_df['author_id']
author_counts = popular_author_ids.value_counts()

# Show the most frequent authors.
author_counts.head()


In [None]:
# Turn the counts Series into a two-column frame: the index becomes
# 'author_id' and the values become 'count'.
author_counts_df = author_counts.rename_axis('author_id').reset_index(name='count')

# Inner-join against the user table to pick up each author's 'id' and 'screen_name'.
author_counts_merged = pd.merge(
    author_counts_df,
    users_df[['id', 'screen_name']],
    left_on='author_id',
    right_on='id',
)

# Show the first ten authors.
author_counts_merged.head(10)

In [None]:
# Per-author mean of retweet_count, favorite_count and quote_count. The metric
# columns are prefixed with 'avg_' so the result has the columns
# author_id, avg_retweet_count, avg_favorite_count, avg_quote_count.
average_metrics = (
    tweets_df
    .groupby('author_id')[['retweet_count', 'favorite_count', 'quote_count']]
    .mean()
    .add_prefix('avg_')
    .reset_index()
)

# Attach the averages to every tweet of the corresponding author. Any 'avg_*'
# columns left behind by a previous run of this cell are dropped first;
# otherwise re-running the cell would merge onto a frame that already has them
# and pandas would emit suffixed '_x' / '_y' duplicates.
avg_columns = [column for column in average_metrics.columns if column != 'author_id']
tweets_df = (
    tweets_df
    .drop(columns=avg_columns, errors='ignore')
    .merge(average_metrics, on='author_id', how='left')
)

# Display the updated DataFrame
tweets_df.tail()

In [None]:
# Left side: tweet count plus identity columns for each author.
author_summary = author_counts_merged[['count', 'id', 'screen_name']]

# Right side: the per-author averages.
author_averages = average_metrics[
    ['author_id', 'avg_retweet_count', 'avg_favorite_count', 'avg_quote_count']
]

# Inner-join on the user id, then discard the join key duplicated from the right side.
filtered_df = pd.merge(
    author_summary,
    author_averages,
    left_on='id',
    right_on='author_id',
).drop(columns='author_id')

# Show the first 25 rows.
filtered_df.head(25)

In [None]:
# Tunable settings for this analysis.
LOG_SCALE_THRESHOLD = 1000     # a metric whose maximum exceeds this gets a log-scaled x axis
POPULAR_USER_QUANTILE = 0.90   # users at or above this follower quantile count as "popular"

# Merge tweet and user data. Columns present in both inputs are disambiguated
# with the '_tweet' / '_user' suffixes.
# NOTE(review): the metric names below are used without a suffix, so each of
# them must exist in only one of the two inputs -- confirm against the CSVs.
tweets_users_df = tweets_df.merge(
    users_df,
    left_on='author_id',
    right_on='id',
    suffixes=('_tweet', '_user')
)

# Analyze Interaction Data for Tweets
interaction_metrics = [
    'retweet_count',
    'favorite_count',
    'quote_count',
    'reply_count',
    'followers_count'
]

# Coerce the metrics to numeric (unparseable values become NaN), then keep
# only rows where every metric is present. Assigning the result instead of
# mutating in place keeps the lineage of tweets_users_df explicit.
tweets_users_df[interaction_metrics] = tweets_users_df[interaction_metrics].apply(
    pd.to_numeric, errors='coerce'
)
tweets_users_df = tweets_users_df.dropna(subset=interaction_metrics)

# Plot the distribution of every interaction metric on a 2x3 grid.
fig = plt.figure(figsize=(18, 10))
for position, metric in enumerate(interaction_metrics, start=1):
    ax = fig.add_subplot(2, 3, position)
    values = tweets_users_df[metric]
    if values.max() > LOG_SCALE_THRESHOLD:
        # Wide-ranging metric: log-scale the x axis only.
        # NOTE(review): zero counts cannot be represented on a log axis --
        # verify how the installed seaborn version treats them.
        sns.histplot(values, bins=50, kde=True, log_scale=(True, False), ax=ax)
        ax.set_xlabel(f'{metric} (Log Scale)')
    else:
        sns.histplot(values, bins=50, kde=True, ax=ax)
        ax.set_xlabel(metric)
    ax.set_title(f'Distribution of {metric}')
fig.tight_layout()
fig.savefig('interaction_metrics_distribution.png')
plt.show()

# Analyze Popular Users
# The threshold is the follower-count quantile over *users*. tweets_users_df
# has one row per tweet, so taking the quantile over it directly would weight
# each user by how many tweets they posted rather than selecting the top 10%
# of users.
# NOTE(review): assumes 'followers_count' is constant per author_id -- confirm.
followers_per_user = tweets_users_df.drop_duplicates(subset='author_id')['followers_count']
popularity_threshold = followers_per_user.quantile(POPULAR_USER_QUANTILE)
popular_users_df = tweets_users_df[tweets_users_df['followers_count'] >= popularity_threshold]

# Mean of every metric per popular author, most-followed author first.
popular_user_metrics = (
    popular_users_df
    .groupby('author_id')[interaction_metrics]
    .mean()
    .reset_index()
    .sort_values(by='followers_count', ascending=False)
)

# Plot average interaction metrics for popular users. Authors are placed at
# evenly spaced positions in the followers ordering and labelled with their
# id; using the numeric author_id itself as the x value would re-order the
# points by id and space them by the raw id magnitude.
fig, ax = plt.subplots(figsize=(14, 8))
author_positions = list(range(len(popular_user_metrics)))
for metric in interaction_metrics[:-1]:  # 'followers_count' only defines the order
    ax.plot(author_positions, popular_user_metrics[metric].to_numpy(), label=metric)
ax.set_xticks(author_positions)
ax.set_xticklabels(popular_user_metrics['author_id'].astype(str), rotation=90)
ax.set_title('Interaction Metrics for Popular Users')
ax.set_xlabel('Author ID')
ax.set_ylabel('Average Metric Value')
ax.legend()
fig.tight_layout()
fig.savefig('popular_users_interaction_metrics.png')
plt.show()

# Additional Visualization: pairwise correlation of the interaction metrics.
correlation_matrix = tweets_users_df[interaction_metrics].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='Blues', ax=ax)
ax.set_title('Correlation Heatmap of Interaction Metrics')
fig.tight_layout()
fig.savefig('interaction_metrics_correlation_heatmap.png')
plt.show()

# Save the summary statistics to CSV
interaction_summary = tweets_users_df[interaction_metrics].describe()
interaction_summary.to_csv('interaction_summary.csv')
popular_user_metrics.to_csv('popular_user_metrics.csv', index=False)


In [None]:
# Tweets per author over the whole tweet table; value_counts() orders the
# result from the most to the least frequent author. This re-binds
# `author_counts`, which an earlier cell computed from popular_tweets_df.
author_counts = tweets_df['author_id'].value_counts()

# Original author's note: the top 10% of author_counts should be 37 authors.

# Ids of the 25 authors with the most tweets.
top_authors = list(author_counts.index[:25])
top_authors

In [None]:
# Recompute the per-author tweet counts over the full tweet table and show
# the whole Series (most frequent author first).
author_counts = tweets_df.loc[:, 'author_id'].value_counts()
author_counts

In [None]:
# Parse 'created_at' into datetimes so it can be compared chronologically.
tweets_df['created_at'] = pd.to_datetime(tweets_df['created_at'])

# Earliest 'created_at' per author, as a two-column frame (author_id, created_at).
earliest_dates = tweets_df.groupby('author_id')['created_at'].min().reset_index()

# Leave the frame as the cell's last expression so the notebook renders it
# with its rich table display instead of print()'s plain-text dump.
earliest_dates