In [1]:
import pandas as pd

# Analiza danych i pre-processing

Wybrałam bezimienny zbiór danych, w którym znajdują się tweety na temat zmian klimatycznych, oznaczone hashtagiem `#climatechange`. Pochodzą one z okresu 20.11–05.12.2018 r., w którym udało się zgromadzić prawie $460\,000$ tweetów.

In [33]:
# Load the full tweet export; both timestamp columns are parsed to datetime64.
df = pd.read_csv(
    "climatechange_tweets_all.csv",
    parse_dates=["tweet_created_at", "user_created_at"],
    # Pin retweet_id to text instead of relying on dtype inference: the ids are
    # 19-digit numbers, and if the parser ever treats the non-id placeholder used
    # for original tweets as missing (newer pandas versions do this for the
    # literal "None"), the column would be inferred as float64 and the ids would
    # lose precision, breaking string comparisons against tweet ids.
    dtype={"retweet_id": str},
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457294 entries, 0 to 457293
Data columns (total 15 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   tweet_id                  457294 non-null  int64         
 1   tweet_created_at          457294 non-null  datetime64[ns]
 2   language                  457294 non-null  object        
 3   user_screen_name          457294 non-null  object        
 4   user_created_at           457294 non-null  datetime64[ns]
 5   user_id                   457294 non-null  int64         
 6   followers_count           457294 non-null  int64         
 7   friends_count             457294 non-null  int64         
 8   time_zone                 0 non-null       float64       
 9   utc_offset                0 non-null       float64       
 10  retweeted_status          457294 non-null  int64         
 11  retweet_id                457294 non-null  object        
 12  re

In [34]:
# Preview the first rows of the loaded tweets to inspect the available columns.
df.head()

Unnamed: 0,tweet_id,tweet_created_at,language,user_screen_name,user_created_at,user_id,followers_count,friends_count,time_zone,utc_offset,retweeted_status,retweet_id,retweet_user_screen_name,retweet_user_id,text
0,1068223635024941056,2018-11-29 19:22:15,en,JGuerreroN13,2017-06-07 20:21:48,872549171680694272,245,419,,,1,1068067040882233344,UNFCCC,17463923,RT @UNFCCC: Delegates from around the world ar...
1,1068223643765940226,2018-11-29 19:22:17,en,t_rustyboy1720,2017-09-23 03:53:34,911438362271031296,63,232,,,1,1068210734977490944,saskboy,6634632,RT @saskboy: I think we most often hear the wo...
2,1068223645330432001,2018-11-29 19:22:18,en,MissCJWright,2009-03-15 16:24:32,24544855,1383,4585,,,1,1068223231264505857,EcoSenseNow,1392956425,"RT @EcoSenseNow: It mat not be obvious, but Ei..."
3,1068223649222664194,2018-11-29 19:22:19,en,al_f,2009-07-17 00:05:50,57490401,333,377,,,1,1068181567510626310,ClimateComms,362642209,RT @ClimateComms: Most Americans now see #clim...
4,1068223660299677696,2018-11-29 19:22:21,en,Hawknana_,2013-12-21 03:16:04,2255917724,1751,3409,,,1,1068187254768590848,PortlandBEM,18030823,RT @PortlandBEM: Last week's federal climate c...


Kolumny, które są interesujące ze względu na analizę procesu Hawkesa, to:
- `tweet_id`,
- `tweet_created_at`,
- `followers_count`,
- `retweet_id`.

Dzięki nim możemy znaleźć oryginalny tweet i kaskadę, którą zapoczątkował. Kaskady zostaną utworzone dla najbardziej retweetowanych wpisów w zbiorze. Zobaczmy zatem, które to tweety.

In [35]:
# Rank every retweet_id value by the number of rows that reference it,
# most frequent first, and show the result as a two-column frame.
(
    df.groupby("retweet_id")
    .size()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,retweet_id,0
0,,138191
1,1066155330986541058,8923
2,1067149009867878400,3316
3,1065274149461987328,3108
4,1069278755041001472,2885
...,...,...
44122,1068194793874845696,1
44123,1068194561569243137,1
44124,1068194417239035904,1
44125,1068194210619289600,1


Na pierwszym miejscu grupowania znajduje się wartość `None`, która oznacza, że dany tweet jest oryginałem (nie jest retweetem), dlatego pomijamy ją w dalszej analizie. Kolejne pozycje to tweety o największej liczbie retweetów w zbiorze. Dwa pierwsze z nich to:

In [36]:
# Look up the row of the most retweeted original tweet by its id.
df.loc[df["tweet_id"].eq(1066155330986541058)]

Unnamed: 0,tweet_id,tweet_created_at,language,user_screen_name,user_created_at,user_id,followers_count,friends_count,time_zone,utc_offset,retweeted_status,retweet_id,retweet_user_screen_name,retweet_user_id,text
63445,1066155330986541058,2018-11-24 02:23:33,en,NancyPelosi,2008-08-07 15:35:02,15764644,1712266,327,,,0,,,,The climate crisis threatens both our communit...


In [37]:
# Look up the row of the second most retweeted original tweet by its id.
df.loc[df["tweet_id"].eq(1067149009867878400)]

Unnamed: 0,tweet_id,tweet_created_at,language,user_screen_name,user_created_at,user_id,followers_count,friends_count,time_zone,utc_offset,retweeted_status,retweet_id,retweet_user_screen_name,retweet_user_id,text
288435,1067149009867878400,2018-11-26 20:12:05,en,jessphoenix2018,2017-03-17 01:37:21,842550390818201600,68853,5094,,,0,,,,If you're not willing to listen to scientists ...


Stwórzmy zbiory danych reprezentujące kaskady zapoczątkowane przez te dwa tweety.

In [64]:
def build_cascade(tweets, tweet_id):
    """Build the retweet cascade started by one original tweet.

    Parameters
    ----------
    tweets : pd.DataFrame
        Frame with the columns ``tweet_id``, ``tweet_created_at``,
        ``followers_count`` and ``retweet_id``.
    tweet_id : int
        Id of the original tweet that seeds the cascade.

    Returns
    -------
    pd.DataFrame
        One row per event: the original tweet first, then its retweets in
        chronological order. ``time`` is the number of seconds elapsed since
        the first row, ``magintude`` is the author's ``followers_count``.

    Raises
    ------
    ValueError
        If ``tweet_id`` is not present in ``tweets``; without the original
        tweet the first retweet would silently become the t=0 event.
    """
    original = tweets[tweets["tweet_id"] == tweet_id]
    if original.empty:
        raise ValueError(f"original tweet {tweet_id} not found in the dataset")

    # retweet_id holds text values, so the numeric id is compared as a string.
    retweets = tweets[tweets["retweet_id"] == str(tweet_id)]
    # mergesort is stable: retweets sharing a timestamp keep their dataset order.
    retweets = retweets.sort_values(
        by="tweet_created_at", ascending=True, kind="mergesort"
    )

    events = pd.concat([original, retweets], ignore_index=True)
    elapsed = (
        events["tweet_created_at"] - events.iloc[0]["tweet_created_at"]
    ).dt.total_seconds().fillna(0)

    # NOTE: "magintude" (sic) is kept as-is because it is the column name
    # written to the exported CSV files.
    return pd.DataFrame({"time": elapsed, "magintude": events["followers_count"]})


# Ids of the two most retweeted original tweets in the dataset.
CASCADE_TWEET_IDS = [1066155330986541058, 1067149009867878400]

# Select the needed columns once instead of copying them on every iteration.
cascade_columns = df[["tweet_id", "tweet_created_at", "followers_count", "retweet_id"]]

sub_dfs = []

for tweet_id in CASCADE_TWEET_IDS:
    sub_df = build_cascade(cascade_columns, tweet_id)
    sub_dfs.append(sub_df)
    # One CSV per cascade, named after the id of the original tweet.
    sub_df.to_csv(f"{tweet_id}.csv")

In [65]:
# Show the cascade built for the first tweet id processed above.
sub_dfs[0]

Unnamed: 0,time,magintude
0,0.0,1712266
1,13.0,6305
2,14.0,818
3,14.0,451
4,15.0,2799
...,...,...
8919,590628.0,8
8920,592464.0,2401
8921,600019.0,1738
8922,846051.0,4643


In [66]:
# Show the cascade built for the second tweet id processed above.
sub_dfs[1]

Unnamed: 0,time,magintude
0,0.0,68853
1,12.0,167
2,20.0,262
3,28.0,3906
4,45.0,359
...,...,...
3312,636555.0,34571
3313,636904.0,698
3314,637615.0,1341
3315,676583.0,561
