# Data Exploration 

In [52]:
import pandas as pd

df = pd.read_json('data/sample_output_json.json ', lines=True)
# Convert 'date' column to datetime with the correct format
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')


In [53]:
df.date.min(), df.date.max()

(Timestamp('2021-06-04 00:00:00'), Timestamp('2024-03-01 00:00:00'))

In [54]:
df.sample(7)

Unnamed: 0,text,author_name,author_handle,date,lang,url,mentioned_urls,is_retweet,media_type,images_urls,num_reply,num_retweet,num_like,num_view
540,希望这群创业者里，一两年后诞生几家世界级的公司。,GooCarlos,@goocarlos,2023-08-24,zh,https://twitter.com/goocarlos/status/169464823...,[],False,Image,[https://pbs.twimg.com/media/F4SZlmLasAA333Q?f...,5,4,37,0
26,I didn't think it would work!\n\nJust uploaded...,Paige Bailey,@DynamicWebPaige,2024-02-22,en,https://twitter.com/DynamicWebPaige/status/176...,[],False,No media,,19,38,453,0
745,The moment it was over for literally everyone,¹⁰,@SxrgioSZN,2023-06-18,en,https://twitter.com/SxrgioSZN/status/167047843...,[],False,Video,,23,380,2228,0
216,Putting the „CINEMA“ back in „cinematic“ \n\nS...,phOBography,@phobography,2023-12-23,en,https://twitter.com/phobography/status/1738651...,[],False,Image,[https://pbs.twimg.com/media/GCDua9gWAAATn8J?f...,23,38,374,0
275,SpaceX is tracking to launch over 80% of all E...,Elon Musk,@elonmusk,2023-12-06,en,https://twitter.com/elonmusk/status/1732393496...,[],False,Image,[https://pbs.twimg.com/media/GAqytC-WUAAZBLc?f...,4840,8909,60179,0
386,推荐Google Deepmind的新论文：《Levels of AGI: Operatio...,宝玉,@dotey,2023-11-08,zh,https://twitter.com/dotey/status/1722323876267...,[],False,Image,[https://pbs.twimg.com/media/F-bqr81WMAAOoSx?f...,17,124,338,0
214,我本身是一个不太写总结类内容的人，但这次跟风写一回，总结我这一年AI相关的技术总结和我整体的思考。,GanymedeNil,@GanymedeNil,2023-12-25,zh,https://twitter.com/GanymedeNil/status/1739160...,"[https://t.co/znYrPTNfq6, https://t.co/znYrPTN...",False,No media,,4,13,53,0


In [6]:
df.author_name.value_counts().head(5)

author_name
Jim Fan         23
Jerry Liu       20
Yann LeCun      19
Science girl    18
Massimo         17
Name: count, dtype: int64

## Plots
Few basic plots.

In [7]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

def plot_likes_with_cumulative(df, start_date, end_date):
    df['date'] = pd.to_datetime(df['date'])
    df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]
    df['media_type'] = df['media_type'].replace({'No media': 'Text'})
    df['year_month'] = df['date'].dt.strftime('%Y-%m')

    # Prepare the monthly count DataFrame
    monthly_likes_df = df.groupby(['year_month', 'media_type']).size().reset_index(name='counts')

    # Prepare the cumulative DataFrame
    cumulative_likes_df = monthly_likes_df.copy()
    cumulative_likes_df['cumulative_counts'] = cumulative_likes_df.groupby('media_type')['counts'].cumsum()

    # Select a color palette
    color_palette = px.colors.qualitative.Pastel

    # Match colors to media type
    color_map = {media_type: color_palette[i] for i, media_type in enumerate([ 'Video', 'Image',  'Text'])}

    # Creating subplots
    fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
                        subplot_titles=("Monthly Likes by Media Type", "Cumulative Likes by Media Type"),
                        vertical_spacing=0.15)

    # Adding the monthly bar chart
    for media_type in monthly_likes_df['media_type'].unique():
        filtered_df = monthly_likes_df[monthly_likes_df['media_type'] == media_type]
        fig.add_trace(go.Bar(x=filtered_df['year_month'], y=filtered_df['counts'], opacity=0.8,
                             name=media_type, marker_color=color_map[media_type]),
                      row=1, col=1)

    # Adding the cumulative area chart
    for media_type in cumulative_likes_df['media_type'].unique():
        filtered_cumulative_df = cumulative_likes_df[cumulative_likes_df['media_type'] == media_type]
        fig.add_trace(go.Scatter(x=filtered_cumulative_df['year_month'], y=filtered_cumulative_df['cumulative_counts'],
                                 mode='lines', name=media_type, fill='tonexty',
                                 line=dict(color=color_map[media_type])),
                      row=2, col=1)

    # Update layout for clarity and visual appeal
    fig.update_layout(height=600, title_text="Likes Analysis by Media Type")
    fig.update_xaxes(title_text="Month-Year", row=2, col=1)
    fig.update_yaxes(title_text="Monthly Counts", row=1, col=1)
    fig.update_yaxes(title_text="Cumulative Counts", row=2, col=1)

    fig.show()

# Call the function with your DataFrame and a date range
plot_likes_with_cumulative(df, '2023-01-01', '2023-12-31')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['media_type'] = df['media_type'].replace({'No media': 'Text'})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year_month'] = df['date'].dt.strftime('%Y-%m')


In [8]:
def plot_combined_donut_chart_ordered_correctly(data_df, date_col = 'StrTime'):
    # Convert 'StrTime' to datetime and extract the weekday name and number
    data_df['WeekdayName'] = data_df[date_col].dt.day_name()
    # Map weekday names to numbers for sorting (Monday=0, ..., Sunday=6)
    weekdays_map = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
    data_df['WeekdayNumber'] = data_df['WeekdayName'].map(weekdays_map)
    
    # Prepare the data: count messages by WeekdayName, ignoring sender
    weekday_distribution = data_df.groupby(['WeekdayName', 'WeekdayNumber']).size().reset_index(name='Count')
    
    # Sort by WeekdayNumber to ensure correct order
    weekday_distribution.sort_values('WeekdayNumber', inplace=True)
    
    # Now that sorting is done, WeekdayNumber is no longer needed
    weekday_distribution = weekday_distribution.drop(columns=['WeekdayNumber'])
    
    # Create Donut chart with a different color scheme
    fig = px.pie(weekday_distribution, names='WeekdayName', values='Count',
                 title="Tweets Liked by Weekday",
                 color_discrete_sequence=px.colors.qualitative.Pastel,
                 hole=0.4)
    
    # Add slight margin between each day for better visualization
    fig.update_traces(textinfo='percent+label', pull=0.02)
    fig.update_traces(
        sort=False, 
        direction='clockwise'
                      )


    fig.show()

# Ensure data_df['StrTime'] is a datetime column before calling the function
plot_combined_donut_chart_ordered_correctly(df, date_col='date')


In [9]:
from plotly_calplot import calplot

def convert_to_day_df(df):
    # Convert the 'date' column to datetime if it's not already
    df['date'] = pd.to_datetime(df['date'])
    day_df = df.groupby(df['date'].dt.date).size().reset_index(name='num_tweets')
    
    return day_df

day_df = convert_to_day_df(df)
day_df.head(2)

fig = calplot(day_df[(day_df['date'].astype(str)>'2023-01-01') & (day_df['date'].astype(str) < '2024-01-01')], x="date", y="num_tweets", 
              start_month = 1,
    end_month = 12,
    )

# set height of the figure
fig.update_layout(height=250)
# add title 
fig.update_layout(title='Number of Liked Tweets per Day 2023')
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Image captions 
Proof of concept: get high level summary, detailed description and tags for first image from liked tweets. 


With GPT-4-V. 

Gemini 1.0 also works pretty great for this usecase.

Mainly, convert image to structured json data. 

In [10]:
from openai import OpenAI
from config import OPENAI_API_KEY
import os 
import json
from IPython.display import Image, display

if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

client = OpenAI()

def get_valid_json_with_gpt35(input):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125", # latest version of GPT-3.5, better JSON mode support.
        messages=[
            {
            "role": "system",
            "content": "You will be given a JSON string (that may not be completely valid), your task is to create a valid JSON results based on the input. Try keep the structure of the input JSON as much as possible."
            }, 
            {
            "role": "user",
            "content": input
            }
        ],
        temperature=0.1,
        response_format = { "type": "json_object" },
        max_tokens=2000,
        )
    return response.choices[0].message.content

def get_tweet_image_captioning(image_url, author_name, text):
    image_prompt = f"""
    This is a tweet from {author_name}, 
    with the following text:
    {text}.

    Below is an image from the tweet.
    Your task is to caption the image and provide a description. reference the text or author if necessary. 

    Output rules you MUST follow:
    - "Summary": A brief high level, big picture description of the image. What is this and what is this for?
    - "Description": A detailed description of the image. If there are text in the image, please include all of it in the description. (in its original language)
    - "Tags": a brief list of high level tags that describe the image. (e.g. "cat", "anime girl", "food")

    Return results in a JSON format (with "Answer" as key, contains list of  dictionary with exactly these key names):    
"""

    vision_response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
            "role": "user",
            "content": [
                {"type": "text", "text": image_prompt},
                {
                "type": "image_url",
                "image_url": {
                    "url": image_url,
                },
                },
            ],
            }
        ],
        max_tokens=1000,
        )
    
    # Post process the response with GPT 3.5 json mode
    json_captions = get_valid_json_with_gpt35(vision_response.choices[0].message.content)
    return json_captions


In [35]:
sample_image_df = df[df.media_type=='Image'].head(30)
sample_image_urls = sample_image_df.images_urls.tolist()
sample_authors = sample_image_df.author_name.tolist()
sample_texts = sample_image_df.text.tolist()

In [42]:
cur_idx = 26
image_url = sample_image_urls[cur_idx][0]
author_name = sample_authors[cur_idx]
text = sample_texts[cur_idx]

display(Image(url=image_url))

In [43]:

print(f"Image url: {image_url}\n"
      f"Author name: {author_name}\n"
      f"Text: {text}"   )

result = get_tweet_image_captioning(image_url, author_name, text)

Image url: https://pbs.twimg.com/media/GGQj7ObbQAAaKgl?format=jpg&name=900x900
Author name: Kars
Text: Cover for Fate/Strange Fake volume 9 which will release on March 8.

https://dengekibunko.jp/product/fate/322308000304.html… #strangefake


In [44]:
print(json.dumps(json.loads(result), indent=4))

{
    "Answer": [
        {
            "Summary": "The image is an illustrated cover art for 'Fate/Strange Fake' volume 9 which is set to release on March 8 as mentioned in the tweet by Kars.",
            "Description": "The image features an anime-style illustration of a red-haired female character with a determined expression. She is wearing a white Greek-style toga with a purple sash, which appears to be unraveling, and black and white armored gloves. The character has a red choker and earrings, and blood is visible on her shoulder. Behind her, a chaotic burst illustrates a dramatic scene with dark red visual effects that resemble flames or shards of destruction, contrasting against the brighter colors in the foreground. There is no text within the image itself.",
            "Tags": [
                "anime",
                "book cover",
                "Fate/Strange Fake",
                "illustration",
                "red-haired anime character"
            ]
        }
    ]