## Analysis about keyword: "애플페이"

In [49]:
import json
from typing import Dict
import orjson

from utils import DATA_DIR

with (DATA_DIR / "OpenAI.json").open() as f:
    statuses = json.loads(f.read())

In [50]:
tweet_created = [
    status["created_at"]
    for status in statuses
]

In [51]:
print(f"데이터에서 가장 오래된 트윗: \t{min(tweet_created)}")
print(f"데이터에서 가장 최근 트윗: \t{max(tweet_created)}")

데이터에서 가장 오래된 트윗: 	Wed Mar 15 14:47:00 +0000 2023
데이터에서 가장 최근 트윗: 	Wed Mar 15 15:14:55 +0000 2023


In [52]:
user_created = [
    status["user"]["created_at"]
    for status in statuses
]

In [53]:
print(f"트윗 작성자 중 가장 오래된 계정 생성 날짜: \t{min(tweet_created)}")
print(f"트윗 작성자 중 가장 최근 계정 생성 날짜: \t{max(tweet_created)}")

트윗 작성자 중 가장 오래된 계정 생성 날짜: 	Wed Mar 15 14:47:00 +0000 2023
트윗 작성자 중 가장 최근 계정 생성 날짜: 	Wed Mar 15 15:14:55 +0000 2023


In [54]:
how_many_tweets_by_the_user = [
    status["user"]["statuses_count"]
    for status in statuses
]

In [55]:
how_many_followers_by_the_user = [
    status["user"]["followers_count"]
    for status in statuses
]

In [56]:
import pandas as pd
df = pd.DataFrame([
    tweet_created,
    user_created,
    how_many_tweets_by_the_user,
    how_many_followers_by_the_user
]).transpose()

df.columns = ["tweet_created", "user_created", "tweets_from_user", "followers_from_user"]
df["tweet_created"] = pd.to_datetime(df["tweet_created"])
df["user_created"] = pd.to_datetime(df["user_created"])
df["tweets_from_user"] = pd.to_numeric(df["tweets_from_user"])
df["followers_from_user"] = pd.to_numeric(df["followers_from_user"])

In [57]:
df

Unnamed: 0,tweet_created,user_created,tweets_from_user,followers_from_user
0,2023-03-15 15:14:55+00:00,2016-05-31 14:45:06+00:00,97553,291
1,2023-03-15 15:14:52+00:00,2017-01-26 15:25:26+00:00,466400,3806
2,2023-03-15 15:14:50+00:00,2021-10-03 02:18:33+00:00,770,33
3,2023-03-15 15:14:48+00:00,2022-08-03 21:47:21+00:00,1475,3
4,2023-03-15 15:14:48+00:00,2009-06-26 00:07:38+00:00,8563,130
...,...,...,...,...
1078,2023-03-15 14:47:13+00:00,2018-12-23 00:04:27+00:00,7844,4052
1079,2023-03-15 14:47:12+00:00,2022-10-22 18:34:31+00:00,78,28
1080,2023-03-15 14:47:00+00:00,2023-03-03 07:24:50+00:00,61,0
1081,2023-03-15 14:47:00+00:00,2013-05-15 01:15:34+00:00,3768,144


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1083 entries, 0 to 1082
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   tweet_created        1083 non-null   datetime64[ns, UTC]
 1   user_created         1083 non-null   datetime64[ns, UTC]
 2   tweets_from_user     1083 non-null   int64              
 3   followers_from_user  1083 non-null   int64              
dtypes: datetime64[ns, UTC](2), int64(2)
memory usage: 34.0 KB


In [59]:
df.describe()

Unnamed: 0,tweets_from_user,followers_from_user
count,1083.0,1083.0
mean,45716.04,32068.37
std,179133.4,378685.3
min,1.0,0.0
25%,636.5,73.0
50%,4755.0,345.0
75%,25293.5,1715.0
max,3913302.0,9202692.0


* 사용자가 작성한 트윗
    * 평균: 4만 5716개
    * 중앙값: 4만 7550개
    * 최대: 약 390만개
* 사용자의 팔로워 수
    * 평균: 3만 2000 여 명
    * 중앙값: 345 명
    * 최대: 920만 명
    * 최소: 0 명

사용자의 팔로워 수에서 최대값이 매우 크고, 중앙값이 작은 거로 보아 평균의 오류가 발생하고 있는 데이터라고 볼수 있다.
그러면 920만명의 팔로워를 가진 사람은 누구일까.

In [60]:
list(filter(
    lambda row: row["user"]["followers_count"] > 9200000,
    statuses
))[0]["user"]

{'id': 34713362,
 'id_str': '34713362',
 'name': 'Bloomberg',
 'screen_name': 'business',
 'location': 'New York and the World',
 'description': 'The first word in business news. Newsletters: https://t.co/nWaCxHSKuU Podcasts: https://t.co/096e9xMbPz The Big Take podcast: https://t.co/UqskwXtp5Z',
 'url': 'https://t.co/MdLRpsrsk5',
 'entities': {'url': {'urls': [{'url': 'https://t.co/MdLRpsrsk5',
     'expanded_url': 'https://www.bloomberg.com/account/newsletters/politics',
     'display_url': 'bloomberg.com/account/newsle…',
     'indices': [0, 23]}]},
  'description': {'urls': [{'url': 'https://t.co/nWaCxHSKuU',
     'expanded_url': 'http://bloom.bg/newsletters',
     'display_url': 'bloom.bg/newsletters',
     'indices': [46, 69]},
    {'url': 'https://t.co/096e9xMbPz',
     'expanded_url': 'http://bloom.bg/podcasts',
     'display_url': 'bloom.bg/podcasts',
     'indices': [80, 103]},
    {'url': 'https://t.co/UqskwXtp5Z',
     'expanded_url': 'http://trib.al/jtoXfuT',
     'display

Bloomberg 공식 계정 이었다. 블룸버그의 뉴스레터니... 많을 수 밖에
팔로워 수가 많을 수 밖에 없었다.

가장 많이 나오는 hashtag는?

In [61]:
from functools import reduce

hashtags = [
    record['text']
    for record in
    list(reduce(
        lambda i, j: i+j,
        [status["entities"]["hashtags"] for status in statuses]
    ))
]


In [62]:
from collections import Counter

sorted(Counter(hashtags).items(), key=lambda i: i[1], reverse=True)

[('OpenAI', 35),
 ('GPT4', 34),
 ('ChatGPT', 33),
 ('AI', 22),
 ('gpt4', 15),
 ('Web3', 14),
 ('openai', 14),
 ('chatgpt4', 8),
 ('Microsoft', 6),
 ('GPT', 6),
 ('chatgpt', 5),
 ('nowai', 5),
 ('aiart', 4),
 ('OpenAi', 4),
 ('Ai', 4),
 ('Bing', 3),
 ('ArtificialIntelligence', 3),
 ('ai', 3),
 ('openAI', 3),
 ('digitalhealth', 3),
 ('CyberChess', 2),
 ('Chatbots', 2),
 ('ChatPBT', 2),
 ('FutureOfWork', 2),
 ('BingAI', 2),
 ('selfelaketi', 2),
 ('lluvias', 2),
 ('MAFSAU', 2),
 ('CreditSuisse', 2),
 ('ImranKhan', 2),
 ('kibe', 2),
 ('JUNGKOOK', 2),
 ('Pakistan', 2),
 ('nowaiart', 2),
 ('chatbot', 2),
 ('contentwriter', 2),
 ('innovation', 2),
 ('programming', 2),
 ('SXSW2023', 2),
 ('GPT3', 2),
 ('Tecnologia', 1),
 ('chatsing', 1),
 ('bankcollapse', 1),
 ('gpt5', 1),
 ('analytics', 1),
 ('googleads', 1),
 ('facebookads', 1),
 ('InteligenciaArtificial', 1),
 ('NewsPicks', 1),
 ('WEEKLYOCHIAI', 1),
 ('techies', 1),
 ('TechNews', 1),
 ('GPT_4', 1),
 ('WorkoutApp', 1),
 ('Trending', 1),
 ('ML

가장 많이 나온 해쉬태그는 예상대로
'OpenAI', 'GPT4', 'ChatGPT' 순 이었다.
키워드 제출하는 날 (03/15)에 이미 gpt4 가 publish되었기 때문에 유명세를 타지 않았을까?

가장 연관이 많은 빈출 단어는?

In [63]:
from functools import reduce

contents = [
    status["text"]
    for status in statuses
]

In [64]:
contents

['RT @StabilityAI_JP: オープンソースの流れ\n・ChatGLM-6Bという中国語特化のオープンソースが出現\n・スタンフォード大学がGPT-3.5並みの性能を誇るALpaca7Bを開発、発表\n・OpenChatKitと呼ばれる本当にオープンなLLMを作るセットが元…',
 '#Tecnologia #OpenAI anunció su última versión de #ChatGPT con mayor precisión y creatividad. https://t.co/8L7G3d9qf4 https://t.co/aMZa8QtEgc',
 "Ekibimizden Sarper Kaçar'ın paylaşımının ayrıntıları ve daha fazla haber https://t.co/jrKbm2mMiy'da.… https://t.co/VGO7bpEdxl",
 'OpenAI, its GPT-4\xa0chatbot https://t.co/5mZyCYIs13',
 'RT @CerfiaFR: 🚨🤖 FLASH | #OpenAI, maison mère de #ChatGPT, a annoncé la sortie de GPT-4, une version qui surpasse les capacités actuelles d…',
 'OpenAI, its GPT-4\xa0chatbot https://t.co/hGHKTutN72',
 'RT @StabilityAI_JP: GPT-4発表\nOpenAIよりGPT-4が発表された https://t.co/PHEVaofPUJ (1/5) https://t.co/kbcJChsROf',
 'RT @profoundlyyyy: You get a Twitter notification saying OpenAI released GPT-4 and look outside your window and see this…wyd? https://t.co/…',
 'RT @benmschmidt: Neural networks like GPT-4 are notoriously blac

In [65]:
import spacy
# 여기서는 트윗이 대체로 영어로 작성되어 있어 영어 spacy 모듈을 로드
nlp = spacy.load('en_core_web_sm')

In [66]:
clean_contents = [
    nlp(content)
    for content in contents
]

In [67]:
clean_contents[0]

RT @StabilityAI_JP: オープンソースの流れ
・ChatGLM-6Bという中国語特化のオープンソースが出現
・スタンフォード大学がGPT-3.5並みの性能を誇るALpaca7Bを開発、発表
・OpenChatKitと呼ばれる本当にオープンなLLMを作るセットが元…

In [68]:
import nltk

is_noun = lambda pos: pos[:2] == 'NN'

In [69]:
contents_toks = list(reduce(
    lambda i, j: i + j,
    [
        nltk.word_tokenize(str(clean_content))
        for clean_content in clean_contents
    ]
))
contents_nouns = [word for (word, pos) in nltk.pos_tag(contents_toks) if is_noun(pos)]

In [70]:
sorted(Counter(contents_nouns).items(), key=lambda i: i[1], reverse=True)

[('@', 988),
 ('https', 704),
 ('OpenAI', 662),
 ('RT', 614),
 ('GPT-4', 416),
 ('ChatGPT', 167),
 ('AI', 152),
 ('model', 133),
 ('multimodal', 79),
 ('capabilities', 71),
 ('’', 67),
 ('results', 66),
 ('alignment', 65),
 ('h…', 63),
 ('GPT4', 45),
 ('everything', 34),
 ('”', 33),
 ('version', 32),
 ('today', 29),
 ('kana_Eng_coach', 28),
 ('Maxを公開。Maxは会話の練習相手で、間違えた時は話し手がなぜその文章で話したかまで理解して、解説をくれるという優れもの。AIによって世界中…',
  28),
 ('language', 27),
 ('chatbot', 26),
 ('“', 25),
 ('thealexbanks', 24),
 ('dust', 24),
 ('GPT', 24),
 ('ChatGPT-4', 24),
 ('Bing', 23),
 ('paper', 22),
 ('%', 22),
 ('API', 22),
 ('yesterday', 21),
 ('soumithchintala', 19),
 ('StabilityAI_JP', 18),
 ('amp', 18),
 ('que', 18),
 ('Google', 18),
 ('s', 17),
 ('t', 17),
 ('la', 16),
 ('The_Delysium', 16),
 ('New', 16),
 ('release', 16),
 ('people', 15),
 ('future', 15),
 ('unusual_whales', 15),
 ('Speaking', 14),
 ('Web3', 14),
 ('bentossell', 14),
 ('examples', 14),
 ('Microsoft', 14),
 ('BI', 14),
 ('company', 13),
 (

트윗 내용중에 계정이름 앞에 붙는 '@'가 가장 많이 나온 명사
그다음은 링크 때문에 'https'가 2등.
예상대로 'OpenAI'가 3등에 662번 사용되었다.
'GPT-4'도 416번 사용되었고
'ChatGPT'도 167번 사용되었다.
chat gpt보다 gpt4가 2배 넘게 검색된 것으로 보아 사람들의 관심이 chatgpt 에서 GPT4로 이동 된것을 유추해볼 수 있었다.

## 리트윗 통계

In [71]:
print(f"중복 포함한 전체 RT 개수: {len([status for status in statuses if 'retweeted_status' in status])}")
retweets_id = {
    status["retweeted_status"]["id"]
    for status in statuses if 'retweeted_status' in status
}
retweets = {
    status["retweeted_status"]["id"]: (status['retweet_count'], status['retweeted_status']['user']['screen_name'], status['text'])
    for status in statuses if 'retweeted_status' in status
}

retweets = dict(sorted(
    retweets.items(),
    key=lambda rt: rt[1][0],
    reverse=True
))


중복 포함한 전체 RT 개수: 611


In [72]:
print(len(retweets))
retweets

268


{1635687373060317185: (16356,
  'OpenAI',
  'RT @OpenAI: Announcing GPT-4, a large multimodal model, with our best-ever results on capabilities and alignment: https://t.co/TwLFssyALF h…'),
 1635687853324902401: (4335,
  'sama',
  'RT @sama: here is GPT-4, our most capable and aligned model yet. it is available today in our API (with a waitlist) and in ChatGPT+.\n\nhttps…'),
 1630207720522678274: (3187,
  'gomezidao',
  'RT @gomezidao: Gomezi Network:\n- large open source, \n- built in the open, \n- community driven,\n- funded in a decentralised manner.\n🔵Like &amp;…'),
 1626232682882699270: (2173,
  'unusual_whales',
  "RT @unusual_whales: OpenAI's ChatGPT has reportedly predicted that the stock market will crash on March 15, 2023, per BI."),
 1635695401880760351: (2116,
  'The_Delysium',
  "RT @The_Delysium: @OpenAI Speaking of breakthroughs - here's the biggest one in #Web3  https://t.co/bi3OoAX4KE"),
 1635704687344365569: (1716,
  'thealexbanks',
  "RT @thealexbanks: OpenAI just lau

광고나 관심을 받기 위한 트윗이 있을지도 몰라 트윗의 내용으로 중복을 제거해보았다

In [73]:
duplicate_content_removed_retweets = {
    retweet[1][2]: (retweet[1][0], retweet[1][1])
    for retweet in retweets.items()
}

In [74]:
print(len(duplicate_content_removed_retweets))
duplicate_content_removed_retweets

268


{'RT @OpenAI: Announcing GPT-4, a large multimodal model, with our best-ever results on capabilities and alignment: https://t.co/TwLFssyALF h…': (16356,
  'OpenAI'),
 'RT @sama: here is GPT-4, our most capable and aligned model yet. it is available today in our API (with a waitlist) and in ChatGPT+.\n\nhttps…': (4335,
  'sama'),
 'RT @gomezidao: Gomezi Network:\n- large open source, \n- built in the open, \n- community driven,\n- funded in a decentralised manner.\n🔵Like &amp;…': (3187,
  'gomezidao'),
 "RT @unusual_whales: OpenAI's ChatGPT has reportedly predicted that the stock market will crash on March 15, 2023, per BI.": (2173,
  'unusual_whales'),
 "RT @The_Delysium: @OpenAI Speaking of breakthroughs - here's the biggest one in #Web3  https://t.co/bi3OoAX4KE": (2116,
  'The_Delysium'),
 "RT @thealexbanks: OpenAI just launched GPT-4.\n\nIt leaves ChatGPT in the dust.\n\nHere's everything you need to know: https://t.co/Oc6TrygvNO": (1716,
  'thealexbanks'),
 'RT @kana_Eng_coach: 世界一

다행인 점은 영어 트윗은 한글 트윗과 다르게 비슷한 내용을 조금씩 바꿔낸 트윗은 없었다.

In [75]:
# top 10 RTs
rt_df = pd.DataFrame(
    list(map(
        lambda i: (i[1][1], i[0], i[1][0]),
        sorted(
            duplicate_content_removed_retweets.items(),
            key=lambda rt: rt[1][0],
            reverse=True
        )[:10]
    )),
    columns=["작성자", "트윗 내용", "RT 횟수"]
)

In [81]:
rt_df

Unnamed: 0,작성자,트윗 내용,RT 횟수
0,OpenAI,"RT @OpenAI: Announcing GPT-4, a large multimod...",16356
1,sama,"RT @sama: here is GPT-4, our most capable and ...",4335
2,gomezidao,RT @gomezidao: Gomezi Network:\n- large open s...,3187
3,unusual_whales,RT @unusual_whales: OpenAI's ChatGPT has repor...,2173
4,The_Delysium,RT @The_Delysium: @OpenAI Speaking of breakthr...,2116
5,thealexbanks,RT @thealexbanks: OpenAI just launched GPT-4.\...,1716
6,kana_Eng_coach,RT @kana_Eng_coach: 世界一人気の語学アプリDuolingoが、OpenA...,1617
7,gdb,RT @gdb: We’re releasing GPT-4 — a large multi...,1343
8,spectatorindex,RT @spectatorindex: BREAKING: OpenAI's ChatGPT...,1067
9,DotCSV,RT @DotCSV: 🔴 GPT-4 YA ESTÁ AQUÍ!!!\n\nHoy era...,1052
