# Notebook for Reorganizing Data in Coding DH

We realized that we needed to improve how we archive our data so that we can keep historic snapshots of it. This notebook reorganizes the existing data files accordingly.

In [3]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import rich
from rich.console import Console
console = Console()

In [41]:
current_path = "../../datasets/"
older_path = "../../datasets/older_files/"
file_paths_dfs = []

# Pair every current CSV with each archived copy that shares its base name.
for current_dir, _, file_names in os.walk(current_path):
    # Mirror of the current directory inside the archive tree
    archive_dir = current_dir.replace(current_path, older_path)
    if not os.path.exists(archive_dir):
        continue
    archived_names = os.listdir(archive_dir)
    for csv_name in (name for name in file_names if name.endswith(".csv")):
        stem = os.path.splitext(csv_name)[0]
        # Everything before the first "_202" in an archived name is compared to the current stem
        file_paths_dfs.extend(
            {
                'file_path': os.path.join(current_dir, csv_name),
                'subset_file': stem,
                'dir_path': current_dir,
                'older_file_path': os.path.join(archive_dir, archived_name),
            }
            for archived_name in archived_names
            if archived_name.split("_202")[0] == stem
        )
files_df = pd.DataFrame(file_paths_dfs)

In [47]:
# Name of the first folder below "datasets/" for every file
files_df['grouped_dir_path'] = (
    files_df['dir_path']
    .str.split("datasets/").str.get(1)
    .str.split("/").str.get(0)
)

files_df.loc[:, ['grouped_dir_path', 'subset_file']].drop_duplicates()

Unnamed: 0,grouped_dir_path,subset_file
0,temp,missing_repos
1,temp,missing_users
2,temp,missing_orgs_dataset
3,temp,repo_dataset_updated
4,derived_files,updated_search_queries_user_join_subset_dh_dat...
...,...,...
181,user_data,users_searched_Humanidades+digitales
183,large_files,repo_subscribers_join_dataset
184,large_files,repo_stargazers_join_dataset
186,large_files,users_dataset


In [48]:
# Distinct top-level dataset folders that have archived copies
files_df['grouped_dir_path'].unique()

array(['temp', 'derived_files', 'repo_data', 'join_files', 'entity_files',
       'user_data', 'large_files'], dtype=object)

In [30]:
def process_and_group_files(file_group):
    """
    Merge the current copy of one dataset with its archived snapshots and write the result to disk.

    Every row is stamped with a ``coding_dh_date``: the hard-coded current date for the
    live file, and the date embedded in the file name for each archived file. Rows are
    then grouped by ``full_name``. Rows that are identical in everything except
    ``coding_dh_date`` and ``search_query`` are collapsed onto their earliest snapshot,
    and the surviving rows of each group are numbered chronologically in ``coding_dh_id``.

    If the combined data has no ``search_query`` column, only the current file (with its
    date column) is written and the archived snapshots are not merged.

    The output path is the current file's path with "repo_data" replaced by
    "searched_repo_data"; a path that does not contain "repo_data" is therefore
    overwritten in place.

    Parameters:
    file_group (pd.DataFrame): Rows for a single ``subset_file`` with the columns
        ``file_path``, ``subset_file`` and ``older_file_path``. The first row supplies
        the current file.

    Returns:
    None: the function is called for its side effect of writing a CSV.
    """

    def format_file(file_path, date):
        """
        Read a CSV file and add a formatted date column.
        """
        file_df = pd.read_csv(file_path)
        file_df['coding_dh_date'] = pd.to_datetime(date)
        return file_df

    def snapshot_date(older_file_path):
        """
        Rebuild the "202Y-MM-DD" date that follows "_202" in an archived file name.
        """
        # Only the base name is parsed, so a "_202" in a directory name cannot be mistaken for the date.
        date_part = os.path.basename(older_file_path).split('_202')[1]
        return "202" + date_part.replace("_", "-").split(".")[0]

    def write_output(output_df, output_path):
        """
        Log the destination, create its directory if needed, and write the CSV.
        """
        console.print(output_path)
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        output_df.to_csv(output_path, index=False)

    first_row = file_group.iloc[0]
    console.print(f"Processing file: {first_row['subset_file']}", style="bold green")
    # Process the current file
    current_date = "2024-01-13"
    existing_file = format_file(first_row['file_path'], current_date)

    # Process older files
    older_files = []
    for _, row in file_group.iterrows():
        older_date = snapshot_date(row['older_file_path'])
        # Echo the parsed date and its length so a malformed file name is easy to spot
        print(older_date, len(older_date))
        older_files.append(format_file(row['older_file_path'], older_date))

    # Combine and group files
    combined_files = pd.concat([existing_file] + older_files)
    final_path = first_row['file_path'].replace("repo_data", "searched_repo_data")
    if 'search_query' not in combined_files.columns:
        # No query column to normalise: only the dated current file is written.
        write_output(existing_file, final_path)
        return

    # Undo URL-encoding of quotes and colons, drop the quotes, and cut off the paging suffix
    combined_files['cleaned_search_query'] = (
        combined_files.search_query
        .str.replace('%22', '"', regex=False)
        .str.replace('"', '', regex=False)
        .str.replace('%3A', ':', regex=False)
        .str.split('&page').str[0]
    )

    # Oldest first (stable sort), so that drop_duplicates -- which keeps the first
    # occurrence -- retains the earliest snapshot of a row instead of the newest one.
    combined_files = combined_files.sort_values('coding_dh_date', kind='mergesort')
    compare_cols = combined_files.columns.difference(['coding_dh_date', 'search_query']).tolist()

    processed_files = []
    for _, group in combined_files.groupby('full_name'):
        # Drop duplicates based on all columns except for 'coding_dh_date' and 'search_query'
        group = group.drop_duplicates(subset=compare_cols)
        if (group[compare_cols].nunique() > 1).any():
            # The entry changed over time: keep every version, numbered chronologically
            group = group.assign(coding_dh_id=np.arange(len(group)))
        else:
            # Nothing changed: keep only the oldest entry
            group = group.iloc[0:1].assign(coding_dh_id=0)
        processed_files.append(group)

    final_df = pd.concat(processed_files).reset_index(drop=True)
    write_output(final_df, final_path)

# Register tqdm's progress_apply on pandas objects so the per-file loop shows a progress bar
tqdm.pandas(desc="Processing files")
# Apply the function to the grouped DataFrame
# (one group per subset_file; the function writes its CSV itself and returns None)
files_df.groupby('subset_file').progress_apply(process_and_group_files)


Processing files:   0%|          | 0/50 [00:00<?, ?it/s]

2023-03-14 10


Processing files:   4%|▍         | 2/50 [00:00<00:08,  5.73it/s]

2023-03-14 10


Processing files:   6%|▌         | 3/50 [00:01<00:34,  1.35it/s]

2023-03-14 10


Processing files:   8%|▊         | 4/50 [00:03<00:45,  1.02it/s]

2023-03-14 10
2022-12-10 10
2023-03-18 10


Processing files:  10%|█         | 5/50 [00:06<01:20,  1.79s/it]

2023-03-14 10


Processing files:  12%|█▏        | 6/50 [00:07<01:03,  1.44s/it]

2023-03-14 10


Processing files:  14%|█▍        | 7/50 [00:08<00:56,  1.32s/it]

2023-03-14 10


Processing files:  16%|█▌        | 8/50 [00:09<00:48,  1.16s/it]

2023-03-14 10


Processing files:  18%|█▊        | 9/50 [00:09<00:39,  1.03it/s]

2022-12-11 10
2023-03-14 10


2023-03-14 10
2023-03-18 10


Processing files:  22%|██▏       | 11/50 [00:20<01:54,  2.93s/it]

2023-03-14 10
2023-03-18 10


2023-03-14 10


Processing files:  26%|██▌       | 13/50 [00:20<01:09,  1.87s/it]

2023-03-14 10


2023-03-14 10


Processing files:  30%|███       | 15/50 [00:20<00:43,  1.25s/it]

2023-03-14 10


Processing files:  32%|███▏      | 16/50 [00:21<00:37,  1.11s/it]

2023-03-14 10


Processing files:  34%|███▍      | 17/50 [00:21<00:29,  1.13it/s]

2023-03-14 10


2023-03-14 10


Processing files:  38%|███▊      | 19/50 [00:22<00:19,  1.59it/s]

2023-03-14 10


Processing files:  40%|████      | 20/50 [00:22<00:17,  1.73it/s]

2022-12-11 10
2023-03-09 10
2023-03-10 10
2023-03-14 10


Processing files:  42%|████▏     | 21/50 [00:27<00:45,  1.58s/it]

2023-03-14 10


Processing files:  44%|████▍     | 22/50 [00:27<00:35,  1.25s/it]

2022-11-22 10
2022-11-18 10
2022-11-27 10
2022-12-11 10
2023-03-14 10
2022-11-02 10
2022-11-01 10
2022-11-12 10


Processing files:  46%|████▌     | 23/50 [00:28<00:29,  1.10s/it]

2023-03-14 10


Processing files:  48%|████▊     | 24/50 [00:29<00:26,  1.02s/it]

2023-03-18 10
2022-12-11 10


2022-12-11 10
2023-03-14 10


Processing files:  52%|█████▏    | 26/50 [00:30<00:18,  1.27it/s]

2022-12-11 10
2022-11-18 10
2022-11-27 10
2022-11-02 10
2022-11-01 10
2022-11-12 10
2023-03-14 10


Processing files:  54%|█████▍    | 27/50 [00:30<00:17,  1.33it/s]

2023-03-14 10


Processing files:  56%|█████▌    | 28/50 [00:31<00:15,  1.41it/s]

2023-03-14 10
2022-12-11 10


Processing files:  58%|█████▊    | 29/50 [00:31<00:15,  1.39it/s]

2023-03-14 10


Processing files:  60%|██████    | 30/50 [00:32<00:11,  1.74it/s]

2023-03-16 10
2023-03-09 10
2023-03-18 10


Processing files:  62%|██████▏   | 31/50 [00:36<00:30,  1.60s/it]

2023-03-16 10


Processing files:  64%|██████▍   | 32/50 [00:36<00:21,  1.18s/it]

2023-03-16 10


2023-03-16 10


Processing files:  68%|██████▊   | 34/50 [00:36<00:11,  1.44it/s]

2023-03-16 10


Processing files:  70%|███████   | 35/50 [00:39<00:19,  1.32s/it]

2022-12-11 10
2023-03-16 10


Processing files:  72%|███████▏  | 36/50 [00:40<00:16,  1.19s/it]

2022-12-18 10
2023-03-16 10
2022-12-11 10


Processing files:  74%|███████▍  | 37/50 [00:42<00:17,  1.32s/it]

2023-03-16 10


Processing files:  76%|███████▌  | 38/50 [00:44<00:16,  1.41s/it]

2022-12-18 10


Processing files:  78%|███████▊  | 39/50 [00:44<00:13,  1.20s/it]

2023-03-16 10
2022-12-18 10


Processing files:  80%|████████  | 40/50 [00:47<00:15,  1.52s/it]

2023-03-16 10
2022-12-18 10


Processing files:  82%|████████▏ | 41/50 [00:47<00:11,  1.28s/it]

2022-12-18 10


Processing files:  84%|████████▍ | 42/50 [00:51<00:15,  1.91s/it]

2023-03-16 10
2022-12-18 10


Processing files:  86%|████████▌ | 43/50 [00:53<00:13,  1.91s/it]

2023-03-16 10


2023-03-14 10


Processing files:  90%|█████████ | 45/50 [00:53<00:05,  1.11s/it]

2022-12-11 10


Processing files:  92%|█████████▏| 46/50 [00:54<00:03,  1.03it/s]

2023-03-14 10
2023-03-14 10
2022-11-01 10
2022-11-12 10
2022-11-22 10
2022-11-27 10
2022-12-11 10
2023-03-09 10


Processing files:  94%|█████████▍| 47/50 [01:02<00:09,  3.05s/it]

2023-03-14 10
2022-12-11 10


Processing files:  96%|█████████▌| 48/50 [01:03<00:04,  2.46s/it]

2023-03-14 10


2023-03-16 10


Processing files: 100%|██████████| 50/50 [01:04<00:00,  1.43s/it]

2023-03-16 10


Processing files: 100%|██████████| 50/50 [01:04<00:00,  1.28s/it]


In [None]:
def process_files(rows):
    """
    Combine the current copy of a dataset with its archived snapshots, grouped by ``full_name``.

    Parameters:
    rows (pd.DataFrame): Rows for a single ``subset_file`` with the columns ``file_path``,
        ``subset_file`` and ``older_file_path``. The first row supplies the current file.

    Returns:
    pd.DataFrame: For each ``full_name`` whose rows differ in any column other than
        ``coding_dh_date``, all rows sorted by date and numbered in ``unique_id``;
        otherwise only the oldest row. An empty DataFrame if there are no groups.
    """
    current_date = "2024-01-13"
    # .iloc[0] yields the path string; a one-row slice would hand read_csv a Series
    existing_file = pd.read_csv(rows.iloc[0]['file_path'])
    existing_file['coding_dh_date'] = pd.to_datetime(current_date)

    snapshots = [existing_file]
    for _, row in rows.iterrows():
        # The text after the subset name in the file name carries the date, e.g. "_2022_11_22.csv".
        # Strip only the leading separator: replacing every "-2" would also remove
        # separators inside the date (2022-11-22 became "2022-1122").
        date_suffix = os.path.basename(row.older_file_path).split(row.subset_file)[1]
        older_date = date_suffix.split(".")[0].replace("_", "-").lstrip("-")
        older_file = pd.read_csv(row.older_file_path)
        older_file['coding_dh_date'] = pd.to_datetime(older_date)
        snapshots.append(older_file)
    combined_files = pd.concat(snapshots)

    # Collect the per-name results and concatenate once instead of growing a frame in the loop
    histories = []
    for _, group in combined_files.groupby('full_name'):
        group = group.sort_values('coding_dh_date')
        # Check if any column except coding_dh_date holds more than one distinct value
        if (group.drop(columns='coding_dh_date').nunique() > 1).any():
            # Keep every version and assign a chronological id to each row
            group = group.assign(unique_id=np.arange(len(group)))
        else:
            # Keep only the oldest row
            group = group.iloc[0:1]
        histories.append(group)

    if not histories:
        return pd.DataFrame()
    return pd.concat(histories).reset_index(drop=True)

# Run process_files once per subset_file and combine the returned frames
new_files_df = files_df.groupby('subset_file').apply(process_files)

In [13]:
# Scratch check on the first file pair: stamp the current and archived copies with their dates.
for _, row in files_df[0:1].iterrows():
    # The text after the subset name in the archived file name appears to be "_YYYY_MM_DD.csv".
    # Strip only the leading separator: replacing every "-2" would also remove
    # separators inside the date (2022-11-22 became "2022-1122").
    date_suffix = os.path.basename(row.older_file_path).split(row.subset_file)[1]
    older_date = date_suffix.split(".")[0].replace("_", "-").lstrip("-")
    current_date = "2024-01-13"
    existing_file = pd.read_csv(row.file_path)
    existing_file['coding_dh_date'] = current_date
    older_file = pd.read_csv(row.older_file_path)
    older_file['coding_dh_date'] = older_date
    combined_file = pd.concat([existing_file, older_file])

2023-03-16


In [4]:
# Load the first current/archived pair by hand; only the current copy gets a date stamp here
test = pd.read_csv(files_df.loc[0, 'file_path'])
test = test.assign(coding_dh_date="2024-01-13")
test2 = pd.read_csv(files_df.loc[0, 'older_file_path'])
files_df.loc[0, 'subset_file']

In [8]:
# Stack the current rows on top of the archived rows with a fresh 0..n-1 index
testing = pd.concat((test, test2), ignore_index=True)

In [9]:
# Display the stacked current + archived rows for visual comparison
testing

Unnamed: 0,id,node_id,name,full_name,private,html_url,description,fork,url,forks_url,...,license.key,license.name,license.spdx_id,license.url,license.node_id,search_query,search_term,search_term_source,natural_language,search_type
0,443788827,R_kgDOGnOuGw,CCU-Deprecated,ItalianDudes/CCU-Deprecated,False,https://github.com/ItalianDudes/CCU-Deprecated,Carte Contro l'Umanità (CCU),False,https://api.github.com/repos/ItalianDudes/CCU-...,https://api.github.com/repos/ItalianDudes/CCU-...,...,,,,,,"https://api.github.com/search/repositories?q=""...",Umanità,Humanities,co,searched
1,163746787,MDEwOlJlcG9zaXRvcnkxNjM3NDY3ODc=,Miei-Pensieri-di-varia-Umanita_58579,GITenberg/Miei-Pensieri-di-varia-Umanita_58579,False,https://github.com/GITenberg/Miei-Pensieri-di-...,Miei Pensieri di varia Umanità by Giovanni Pas...,False,https://api.github.com/repos/GITenberg/Miei-Pe...,https://api.github.com/repos/GITenberg/Miei-Pe...,...,other,Other,NOASSERTION,,MDc6TGljZW5zZTA=,"https://api.github.com/search/repositories?q=""...",Umanità,Humanities,co,searched
2,381767394,MDEwOlJlcG9zaXRvcnkzODE3NjczOTQ=,SdD-progettoFinale,odinsseo/SdD-progettoFinale,False,https://github.com/odinsseo/SdD-progettoFinale,"Progetto finale del corso ""Fondamenti di scien...",False,https://api.github.com/repos/odinsseo/SdD-prog...,https://api.github.com/repos/odinsseo/SdD-prog...,...,,,,,,"https://api.github.com/search/repositories?q=""...",Umanità,Humanities,co,searched
3,163746787,MDEwOlJlcG9zaXRvcnkxNjM3NDY3ODc=,Miei-Pensieri-di-varia-Umanita_58579,GITenberg/Miei-Pensieri-di-varia-Umanita_58579,False,https://github.com/GITenberg/Miei-Pensieri-di-...,Miei Pensieri di varia Umanità by Giovanni Pas...,False,https://api.github.com/repos/GITenberg/Miei-Pe...,https://api.github.com/repos/GITenberg/Miei-Pe...,...,other,Other,NOASSERTION,,MDc6TGljZW5zZTA=,https://api.github.com/search/repositories?q=U...,Umanità,Humanities,co,searched
4,381767394,MDEwOlJlcG9zaXRvcnkzODE3NjczOTQ=,SdD-progettoFinale,odinsseo/SdD-progettoFinale,False,https://github.com/odinsseo/SdD-progettoFinale,"Progetto finale del corso ""Fondamenti di scien...",False,https://api.github.com/repos/odinsseo/SdD-prog...,https://api.github.com/repos/odinsseo/SdD-prog...,...,,,,,,https://api.github.com/search/repositories?q=U...,Umanità,Humanities,co,searched


In [31]:
# Load a file from searched_repo_data (presumably one written by process_and_group_files) to spot-check it
df = pd.read_csv("../../datasets/searched_repo_data/digital_history/repos_searched_Digital+History.csv")

In [35]:
# Every stored version of one repository
df.loc[df['full_name'] == "C2DH/journal-of-digital-history"]

Unnamed: 0,id,node_id,name,full_name,private,html_url,description,fork,url,forks_url,...,permissions.pull,license,search_query,search_term,search_term_source,natural_language,search_type,coding_dh_date,cleaned_search_query,coding_dh_id
25,259946094,MDEwOlJlcG9zaXRvcnkyNTk5NDYwOTQ=,journal-of-digital-history,C2DH/journal-of-digital-history,False,https://github.com/C2DH/journal-of-digital-his...,frontend app for our Digital Journal,False,https://api.github.com/repos/C2DH/journal-of-d...,https://api.github.com/repos/C2DH/journal-of-d...,...,True,,https://api.github.com/search/repositories?q=D...,Digital History,Digital History,"bs, en, ky, la, sn, fy, yo",searched,2023-03-14,https://api.github.com/search/repositories?q=D...,0
26,259946094,MDEwOlJlcG9zaXRvcnkyNTk5NDYwOTQ=,journal-of-digital-history,C2DH/journal-of-digital-history,False,https://github.com/C2DH/journal-of-digital-his...,frontend app for our Digital Journal,False,https://api.github.com/repos/C2DH/journal-of-d...,https://api.github.com/repos/C2DH/journal-of-d...,...,True,,"https://api.github.com/search/repositories?q=""...",Digital History,Digital History,"bs, en, ky, la, sn, fy, yo",searched,2023-03-18,https://api.github.com/search/repositories?q=D...,1
27,259946094,MDEwOlJlcG9zaXRvcnkyNTk5NDYwOTQ=,journal-of-digital-history,C2DH/journal-of-digital-history,False,https://github.com/C2DH/journal-of-digital-his...,frontend app for our Digital Journal,False,https://api.github.com/repos/C2DH/journal-of-d...,https://api.github.com/repos/C2DH/journal-of-d...,...,True,,"https://api.github.com/search/repositories?q=""...",Digital History,Digital History,"bs, en, ky, la, sn, fy, yo",searched,2024-01-13,https://api.github.com/search/repositories?q=D...,2


In [60]:
# Load both copies of the same user's file (redo_users vs. temp_users) to compare them
test = pd.read_csv("../../datasets/temp/redo_users/kamangir_potential_users.csv")
test2 = pd.read_csv("../../datasets/temp/temp_users/kamangir_potential_users.csv")

In [63]:
# Stack the two copies with a fresh 0..n-1 index
tests = pd.concat((test, test2), ignore_index=True)

In [67]:
# Compare the follower counts recorded in the two copies
tests['followers']

0    20
1    23
Name: followers, dtype: int64

In [62]:
# Columns present in the redo_users copy but missing from the temp_users copy
set(test.columns) - set(test2.columns)

set()

In [None]:
import datetime
def get_file_date(file_path):
    """
    Return the best available creation timestamp of a file as a naive local datetime.

    Parameters:
    file_path (str): Path of the file to inspect.

    Returns:
    datetime.datetime: The file's birth time where the platform reports one, otherwise
        its ``st_ctime`` (the value ``os.path.getctime`` returns).

    Raises:
    OSError: If the file does not exist or cannot be accessed.
    """
    file_stat = os.stat(file_path)
    try:
        # True creation time; only exposed on some platforms (e.g. macOS/BSD).
        creation_time = file_stat.st_birthtime
    except AttributeError:
        # On Unix st_ctime is the last metadata change (it moves on rename/chmod),
        # so it is only a fallback when no birth time is available.
        creation_time = file_stat.st_ctime

    # Convert the timestamp to a datetime object
    return datetime.datetime.fromtimestamp(creation_time)

In [59]:
import os
import shutil

source_dir = "../../datasets/temp/redo_users"
target_dir = "../../datasets/temp/temp_users"

# Get a list of all files in the source directory
# (taken once up front, because files are moved or removed while iterating)
source_files = os.listdir(source_dir)

for file in source_files:
    source_file_path = os.path.join(source_dir, file)
    target_file_path = os.path.join(target_dir, file)

    # Check if the file exists in the target directory
    if not os.path.exists(target_file_path):
        # If it doesn't exist, move it from the source to the target directory
        shutil.move(source_file_path, target_file_path)
        # The source is gone now, so there is nothing left to merge for this file
        continue

    if not file.endswith(".csv"):
        # Only CSV files can be merged with pandas. A recorded run of this cell stopped with
        # a UnicodeDecodeError, presumably on a non-CSV file; such files are left in place.
        continue

    df = pd.read_csv(target_file_path)
    df2 = pd.read_csv(source_file_path)
    # Rows that are identical in both copies count once
    dfs = pd.concat([df, df2]).drop_duplicates()
    if len(dfs) > 1:
        # The copies differ: keep both, each stamped with its own file date and an id
        target_time_created = get_file_date(target_file_path)
        source_time_created = get_file_date(source_file_path)
        df['coding_dh_date'] = target_time_created
        df['coding_dh_id'] = 0
        df2['coding_dh_date'] = source_time_created
        df2['coding_dh_id'] = 1
        dfs = pd.concat([df, df2])
    else:
        # The copies are identical: keep the single row, dated from the target file
        dfs['coding_dh_date'] = get_file_date(target_file_path)
        dfs['coding_dh_id'] = 0
    dfs.to_csv(target_file_path, index=False)
    os.remove(source_file_path)

../../datasets/temp/temp_users/kamangir_potential_users.csv
../../datasets/temp/temp_users/GabrielDancause_potential_users.csv
../../datasets/temp/temp_users/axellelecroq_potential_users.csv
../../datasets/temp/temp_users/zmuhls_potential_users.csv
../../datasets/temp/temp_users/Felegz_potential_users.csv
../../datasets/temp/temp_users/bmix_potential_users.csv
../../datasets/temp/temp_users/peterjaric_potential_users.csv
../../datasets/temp/temp_users/lizuoyue_potential_users.csv
../../datasets/temp/temp_users/vinoddalvi_potential_users.csv
../../datasets/temp/temp_users/aslishah_potential_users.csv
../../datasets/temp/temp_users/mattiafilosa22_potential_users.csv
../../datasets/temp/temp_users/mfmcc1_potential_users.csv
../../datasets/temp/temp_users/khatvangi_potential_users.csv
../../datasets/temp/temp_users/andrewbattista_potential_users.csv
../../datasets/temp/temp_users/lishka_potential_users.csv
../../datasets/temp/temp_users/lin-du_potential_users.csv
../../datasets/temp/temp_u

  dfs = pd.concat([df, df2])


../../datasets/temp/temp_users/johnfonner_potential_users.csv
../../datasets/temp/temp_users/ThomasG77_potential_users.csv
../../datasets/temp/temp_users/SonjaNilson_potential_users.csv
../../datasets/temp/temp_users/mingsquall_potential_users.csv
../../datasets/temp/temp_users/LShining_potential_users.csv
../../datasets/temp/temp_users/travisbrown_potential_users.csv
../../datasets/temp/temp_users/crmin_potential_users.csv
../../datasets/temp/temp_users/tamer1an_potential_users.csv
../../datasets/temp/temp_users/dlr1251_potential_users.csv
../../datasets/temp/temp_users/awagner-mainz_potential_users.csv
../../datasets/temp/temp_users/jmurty_potential_users.csv
../../datasets/temp/temp_users/mojavelinux_potential_users.csv
../../datasets/temp/temp_users/grasshoff_potential_users.csv
../../datasets/temp/temp_users/evesala_potential_users.csv
../../datasets/temp/temp_users/jbaiter_potential_users.csv
../../datasets/temp/temp_users/DavydovichYana_potential_users.csv
../../datasets/temp/te

  dfs = pd.concat([df, df2])


../../datasets/temp/temp_users/jboynyc_potential_users.csv
../../datasets/temp/temp_users/symac_potential_users.csv
../../datasets/temp/temp_users/recife25_potential_users.csv
../../datasets/temp/temp_users/gbstringer_potential_users.csv
../../datasets/temp/temp_users/datapolitan_potential_users.csv
../../datasets/temp/temp_users/pleonard212_potential_users.csv
../../datasets/temp/temp_users/gkthiruvathukal_potential_users.csv
../../datasets/temp/temp_users/grahamearley_potential_users.csv
../../datasets/temp/temp_users/ad-si_potential_users.csv
../../datasets/temp/temp_users/thgie_potential_users.csv
../../datasets/temp/temp_users/trisongz_potential_users.csv
../../datasets/temp/temp_users/Ned2191_potential_users.csv
../../datasets/temp/temp_users/chpollin_potential_users.csv
../../datasets/temp/temp_users/sros-UNED_potential_users.csv
../../datasets/temp/temp_users/SKrywinski_potential_users.csv
../../datasets/temp/temp_users/lizziehop_potential_users.csv
../../datasets/temp/temp_use

  dfs = pd.concat([df, df2])


../../datasets/temp/temp_users/azleslie_potential_users.csv
../../datasets/temp/temp_users/tansengming_potential_users.csv
../../datasets/temp/temp_users/iamapunkmonkey_potential_users.csv
../../datasets/temp/temp_users/swissspidy_potential_users.csv
../../datasets/temp/temp_users/DanilSko_potential_users.csv
../../datasets/temp/temp_users/amandamiotto_potential_users.csv
../../datasets/temp/temp_users/hmishra2250_potential_users.csv
../../datasets/temp/temp_users/emansom_potential_users.csv
../../datasets/temp/temp_users/nscyclone_potential_users.csv
../../datasets/temp/temp_users/dhyxy_potential_users.csv
../../datasets/temp/temp_users/Seanny123_potential_users.csv
../../datasets/temp/temp_users/fghaas_potential_users.csv
../../datasets/temp/temp_users/ImanHashemi_potential_users.csv
../../datasets/temp/temp_users/igauravsehrawat_potential_users.csv
../../datasets/temp/temp_users/danieleborghe_potential_users.csv
../../datasets/temp/temp_users/pbd84_potential_users.csv
../../datasets

  dfs = pd.concat([df, df2])


../../datasets/temp/temp_users/ateucher_potential_users.csv
../../datasets/temp/temp_users/philgooch_potential_users.csv
../../datasets/temp/temp_users/rlarson20_potential_users.csv
../../datasets/temp/temp_users/jimsmithm3_potential_users.csv
../../datasets/temp/temp_users/nmaynes_potential_users.csv
../../datasets/temp/temp_users/christofs_potential_users.csv
../../datasets/temp/temp_users/davidderoure_potential_users.csv
../../datasets/temp/temp_users/davidmcclure_potential_users.csv
../../datasets/temp/temp_users/DavidKi_potential_users.csv
../../datasets/temp/temp_users/BillMills_potential_users.csv
../../datasets/temp/temp_users/ontoligent_potential_users.csv
../../datasets/temp/temp_users/naomiyaki_potential_users.csv
../../datasets/temp/temp_users/jamotilla_potential_users.csv
../../datasets/temp/temp_users/fandinod_potential_users.csv
../../datasets/temp/temp_users/fredgibbs_potential_users.csv
../../datasets/temp/temp_users/ondich_potential_users.csv
../../datasets/temp/temp_

  dfs = pd.concat([df, df2])


../../datasets/temp/temp_users/jphalip_potential_users.csv
../../datasets/temp/temp_users/navateja_potential_users.csv
../../datasets/temp/temp_users/xianminx_potential_users.csv
../../datasets/temp/temp_users/nemobis_potential_users.csv
../../datasets/temp/temp_users/luciapiff_potential_users.csv
../../datasets/temp/temp_users/rundimeco_potential_users.csv
../../datasets/temp/temp_users/mej_potential_users.csv
../../datasets/temp/temp_users/blrtvs_potential_users.csv
../../datasets/temp/temp_users/KelleyYin_potential_users.csv
../../datasets/temp/temp_users/lesteve_potential_users.csv
../../datasets/temp/temp_users/shawn2306_potential_users.csv
../../datasets/temp/temp_users/iacobucci_potential_users.csv
../../datasets/temp/temp_users/rbeagrie_potential_users.csv
../../datasets/temp/temp_users/thars3n_potential_users.csv
../../datasets/temp/temp_users/justinallen_potential_users.csv
../../datasets/temp/temp_users/federica1994_potential_users.csv
../../datasets/temp/temp_users/jkrajnia

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte