In [1]:
import pandas as pd
import os

In [2]:
def move_files_into_one_dir(archive_path: str) -> None:
    """
    Moves all the files that are in a nested directory up to be in on the same level. And deletes empty directories.
    """
    if not os.path.isdir(path_to_archive+ "/UkraineWar") or not os.path.isdir(path_to_archive+ "/UkraineWar/UkraineWar"):
        raise FileNotFoundError("Archive doesnt have expected structure")
    for file in os.listdir(archive_path + "/UkraineWar/UkraineWar"):
        os.rename(archive_path + "/UkraineWar/UkraineWar/"+file, archive_path+"/" + file)
    os.rmdir(path_to_archive+ "/UkraineWar/UkraineWar")
    os.rmdir(path_to_archive+ "/UkraineWar")
        

In [3]:
def split_csv_by_date(data_path: str) -> None:
    """
    Splits csv files in data_path that span multiple days into single files, one for each day. Then removes the old csvs.
    """
    if not data_path[-1] == "/":
        data_path = data_path + "/"
    for path in [data_path+x for x in sorted(os.listdir(data_path)) if "to" in x and ".csv" in x]:
        df = pd.read_csv(path)
        df['date'] = df['tweetcreatedts'].apply(lambda x: x[:10])
        dfs = {"".join(date.split('-')) + '_UkraineCombinedTweetsDeduped.csv' : df[df['date']==date] for date in df['date'].unique() }
        for name,df_date  in dfs.items():
            df_date.to_csv(data_path + name)
    for path in [data_path+x for x in sorted(os.listdir(data_path)) if "to" in x]:
        os.remove(path)        

In [4]:
def merge_splitted_files(data_path: str) -> None:
    """
    Files for Feb28 are splitted into two parts. Merges them into one.
    """
    if not data_path[-1] == "/":
        data_path = data_path + "/"
    if not "UkraineCombinedTweetsDeduped_FEB28_part2.csv" in os.listdir(data_path) or not "UkraineCombinedTweetsDeduped_FEB28_part1.csv" in os.listdir(data_path):#
        raise FileNotFoundError("Files to merge not found")
    df_1 = pd.read_csv(data_path+"UkraineCombinedTweetsDeduped_FEB28_part1.csv")#
    df_2 = pd.read_csv(data_path+"UkraineCombinedTweetsDeduped_FEB28_part2.csv")
    df = pd.concat([df_1,df_1])
    df.to_csv("UkraineCombinedTweetsDeduped_FEB28.csv")
    os.remove(data_path+"UkraineCombinedTweetsDeduped_FEB28_part1.csv")
    os.remove(data_path+"UkraineCombinedTweetsDeduped_FEB28_part2.csv")


In [5]:
def rename_files(data_path: str) -> None:
    """
    Renames files to YYYYMMDD.csv.
    """
    if not data_path[-1] == "/":
        data_path = data_path + "/"
    for file in os.listdir(data_path):
        if not "Ukraine" in file:
            continue
        print(file)
        df = pd.read_csv(data_path + file,lineterminator='\n')
        #df['date'] = df['tweetcreatedts'].apply(lambda x: x[:10])
        name = df.iloc[0]['tweetcreatedts'][:10]
        name = "".join(name.split('-'))
        os.rename(data_path + file, data_path + name + ".csv")

In [6]:
rename_files("../Data/archive")

20220627_UkraineCombinedTweetsDeduped.csv
20221129_UkraineCombinedTweetsDeduped.csv
20220728_UkraineCombinedTweetsDeduped.csv
20221011_UkraineCombinedTweetsDeduped.csv
20230415_UkraineCombinedTweetsDeduped.csv
20220610_UkraineCombinedTweetsDeduped.csv
20230312_UkraineCombinedTweetsDeduped.csv
20221101_UkraineCombinedTweetsDeduped.csv
20230325_UkraineCombinedTweetsDeduped.csv
20230505_UkraineCombinedTweetsDeduped.csv
20221019_UkraineCombinedTweetsDeduped.csv
20220618_UkraineCombinedTweetsDeduped.csv
20221116_UkraineCombinedTweetsDeduped.csv
20230512_UkraineCombinedTweetsDeduped.csv


  df = pd.read_csv(data_path + file,lineterminator='\n')


20220717_UkraineCombinedTweetsDeduped.csv
20230305_UkraineCombinedTweetsDeduped.csv
20221121_UkraineCombinedTweetsDeduped.csv
20230525_UkraineCombinedTweetsDeduped.csv
20220720_UkraineCombinedTweetsDeduped.csv
20221109_UkraineCombinedTweetsDeduped.csv
20220708_UkraineCombinedTweetsDeduped.csv
20230222_UkraineCombinedTweetsDeduped.csv
20221006_UkraineCombinedTweetsDeduped.csv
20230402_UkraineCombinedTweetsDeduped.csv
20220607_UkraineCombinedTweetsDeduped.csv
20230215_UkraineCombinedTweetsDeduped.csv
20221031_UkraineCombinedTweetsDeduped.csv
20220630_UkraineCombinedTweetsDeduped.csv
20220501_UkraineCombinedTweetsDeduped.csv
20220321_UkraineCombinedTweetsDeduped.csv
20230124_UkraineCombinedTweetsDeduped.csv
20220809_UkraineCombinedTweetsDeduped.csv
20220316_UkraineCombinedTweetsDeduped.csv
20230113_UkraineCombinedTweetsDeduped.csv
20220906_UkraineCombinedTweetsDeduped.csv
20220411_UkraineCombinedTweetsDeduped.csv
20220821_UkraineCombinedTweetsDeduped.csv
20221210_UkraineCombinedTweetsDedu

  df = pd.read_csv(data_path + file,lineterminator='\n')


20220919_UkraineCombinedTweetsDeduped.csv
20220426_UkraineCombinedTweetsDeduped.csv
20221227_UkraineCombinedTweetsDeduped.csv
20220816_UkraineCombinedTweetsDeduped.csv
20221214_UkraineCombinedTweetsDeduped.csv
20220825_UkraineCombinedTweetsDeduped.csv
20220415_UkraineCombinedTweetsDeduped.csv
20220812_UkraineCombinedTweetsDeduped.csv
20221223_UkraineCombinedTweetsDeduped.csv
20220422_UkraineCombinedTweetsDeduped.csv


  df = pd.read_csv(data_path + file,lineterminator='\n')


20230108_UkraineCombinedTweetsDeduped.csv
20220325_UkraineCombinedTweetsDeduped.csv
20230120_UkraineCombinedTweetsDeduped.csv
20220505_UkraineCombinedTweetsDeduped.csv
20220902_UkraineCombinedTweetsDeduped.csv
20220312_UkraineCombinedTweetsDeduped.csv


  df = pd.read_csv(data_path + file,lineterminator='\n')


20230117_UkraineCombinedTweetsDeduped.csv
20230406_UkraineCombinedTweetsDeduped.csv
20220603_UkraineCombinedTweetsDeduped.csv


  df = pd.read_csv(data_path + file,lineterminator='\n')


20221002_UkraineCombinedTweetsDeduped.csv
20230226_UkraineCombinedTweetsDeduped.csv
20230509_UkraineCombinedTweetsDeduped.csv


  df = pd.read_csv(data_path + file,lineterminator='\n')


20230329_UkraineCombinedTweetsDeduped.csv
20230211_UkraineCombinedTweetsDeduped.csv
20230516_UkraineCombinedTweetsDeduped.csv
20220713_UkraineCombinedTweetsDeduped.csv


  df = pd.read_csv(data_path + file,lineterminator='\n')


20221112_UkraineCombinedTweetsDeduped.csv
20230419_UkraineCombinedTweetsDeduped.csv
20230521_UkraineCombinedTweetsDeduped.csv
20220724_UkraineCombinedTweetsDeduped.csv
20221125_UkraineCombinedTweetsDeduped.csv
20230301_UkraineCombinedTweetsDeduped.csv
