In [1]:
import pandas as pd
import numpy as np
import warnings
from typing import Dict, Iterator, Any
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [20, 10]
import os
from tqdm.auto import tqdm
# register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
tqdm.pandas()
from enum import StrEnum
import math
#from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class OrigDataSchema(StrEnum):
    IS_RETWEET = "is_retweet"
    TIMESTAMP = "tweetcreatedts"
    ID = "tweetid"
    TEXT = "text"


class HashedDataSchema(StrEnum):
    HASH = "hash"
    ID = "tweetid"
    TIMESTAMP = "tweetcreatedts"
    DATE = "date"

def iterate_dataframes(path: str) -> Iterator[pd.DataFrame]:
    """
    Iterates over all .csv files in path as pd.DataFrame
    """
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)
        csvs = [path + x for x in os.listdir(path) if "csv" in x]
    
        for csv in tqdm(csvs):
            yield pd.read_csv(csv,  lineterminator='\n')

def aggregate_dataframe(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Aggregates dataframe to dict.
    """
    df_la = df.groupby("language").count().reset_index()
    languages = df_la['language'].tolist()
    lang_counts = df_la['username'].tolist()
    df['dupl'] = df.duplicated(subset="text")
    languages_dupl = [la + "_dupl" for la in languages]
    lang_dupl_counts = df.groupby("language")['dupl'].sum().tolist()
    unique_user_count = df['userid'].unique().shape[0]
    row_count = df.shape[0]
    duplicated_count = df.duplicated(subset="text").sum()
    date = df.iloc[0]['tweetcreatedts'][:10]
    aggregation = {'unique_users': unique_user_count,
                  'row_count': row_count,
                  'text_duplicated_count': duplicated_count,
                  'date':date}
    aggregation = {**dict(zip(languages, lang_counts)), **aggregation, **dict(zip(languages_dupl, lang_dupl_counts))}
    return aggregation
    


    

def get_all_aggregated_data() -> pd.DataFrame:
    """
    Loads or creates all aggregated data.
    """
    path_2023 = "/Users/robinfeldmann/TopicAnalysisRUWTweets/Data/2023_agg.csv"
    path_2022 = "/Users/robinfeldmann/TopicAnalysisRUWTweets/Data/2022_agg.csv"

    df_agg_2023 = pd.read_csv(path_2023)
    df_agg_2022 = pd.read_csv(path_2022)

    df_agg = df_agg = pd.concat([df_agg_2022,df_agg_2023]).fillna(0)
    return df_agg


def create_hashes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Creates dataframes containing only hashes and tweetids from not duplicates.
    """
    if OrigDataSchema.IS_RETWEET in df.columns:
        df = df[~df.duplicated(subset=OrigDataSchema.TEXT) & ~df[OrigDataSchema.IS_RETWEET]]
    else:
        df =df[~df.duplicated(subset=OrigDataSchema.TEXT)]
        
    df_hashed = df[[OrigDataSchema.TEXT, OrigDataSchema.ID,OrigDataSchema.TIMESTAMP]].copy()
    
    df_hashed[HashedDataSchema.HASH] = df_hashed[OrigDataSchema.TEXT].apply(hash)
    
    
    #df_hashed[HashedDataSchema.DATE] = df_hashed[OrigDataSchema.TIMESTAMP].apply(lambda ts: pd.to_datetime(ts[:10]))

    df_hashed = df_hashed.drop(OrigDataSchema.TEXT, axis=1).set_index(HashedDataSchema.ID)#.drop(OrigDataSchema.TIMESTAMP, axis=1)
    return df_hashed


def aggregate_hash_data(dir_path: str, target_path: str) -> pd.DataFrame:
    """
    Creates aggregated data frame and saves it as csv.
    """

    hashed_dfs = []
    for df in iterate_dataframes(dir_path):
        hashed = create_hashes(df)
        hashed_dfs.append(hashed)

    hashed_df = pd.concat(hashed_dfs)
    hashed_df.to_csv(target_path)

def get_all_hashed_data() -> pd.DataFrame:
    """
    Loads all hashed dataframes.
    """
    target_2022 = "/Users/robinfeldmann/TopicAnalysisRUWTweets/Data/2022_hashed.csv"
    target_2023 = "/Users/robinfeldmann/TopicAnalysisRUWTweets/Data/2023_hashed.csv"

    df_hash_2023 = pd.read_csv(path_2023)
    df_hash_2022 = pd.read_csv(path_2022)

    df_hash  = pd.concat([df_hash_2022,df_hash_2023])
    return df_hash

In [None]:
path_2022_data = "/Users/robinfeldmann/TopicAnalysisRUWTweets/Data/2022/"
target_2022 = "/Users/robinfeldmann/TopicAnalysisRUWTweets/Data/2022_hashed.csv"
aggregate_hash_data(path_2022_data, target_2022)

 54%|██████████████████████                   | 167/311 [06:02<04:23,  1.83s/it]

In [None]:
path_2023_data = "/Users/robinfeldmann/TopicAnalysisRUWTweets/Data/2023/"
target_2023 = "/Users/robinfeldmann/TopicAnalysisRUWTweets/Data/2023_hashed.csv"
df_23 = aggregate_hash_data(path_2023_data, target_2023)

In [7]:
df = pd.concat([df_22,df_23])

In [8]:
df[]

Unnamed: 0_level_0,tweetcreatedts,hash
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
1525264628711936000,2022-05-14 00:00:00.000000,614946501457347476
1525264628715880449,2022-05-14 00:00:00.000000,-5100654424024256760
1525264628925624325,2022-05-14 00:00:00.000000,1138897545845033386
1525264628934234120,2022-05-14 00:00:00.000000,-5316821414530685413
1525264629626191880,2022-05-14 00:00:00.000000,-7631953033647462987


In [10]:
df = df.

1611412

In [11]:
df.shape

(30110382, 2)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30110382 entries, 1525264628711936000 to 1631081855142318083
Data columns (total 2 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tweetcreatedts  object
 1   hash            int64 
dtypes: int64(1), object(1)
memory usage: 689.2+ MB


In [87]:
df

Unnamed: 0_level_0,hash
tweetid,Unnamed: 1_level_1
1525264628711936000,7512023704103795790
1525264628715880449,-1632070416298900772
1525264628925624325,8837051791152875786
1525264628934234120,-8947177930304202717
1525264629626191880,-1186198326629031726
...,...
1631081836112642048,7417735237431925445
1631081844127965185,4297768191860994750
1631081845906448387,3482933603353950004
1631081850507608066,8544894820816301394
