<a href="https://colab.research.google.com/github/DarioneNazionale/KickLearning/blob/main/working_on_data/extracting_storic_information.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cloning the repository

In [1]:
# Fetch the project repository and make it the working directory for the cells below.
! git clone https://github.com/DarioneNazionale/KickLearning.git
%cd KickLearning

Cloning into 'KickLearning'...
remote: Enumerating objects: 106, done.[K
remote: Counting objects: 100% (106/106), done.[K
remote: Compressing objects: 100% (88/88), done.[K
remote: Total 106 (delta 40), reused 50 (delta 12), pack-reused 0[K
Receiving objects: 100% (106/106), 229.81 KiB | 3.15 MiB/s, done.
Resolving deltas: 100% (40/40), done.
/content/KickLearning


# Building the historical information

In [None]:
# data_path: folder scanned for the raw ".csv" dumps; destination_path: folder the generated CSV files are written to.
# NOTE(review): both paths are relative to the current working directory, which the clone cell
# switches to the cloned repository with %cd - confirm the Drive folder is reachable from there.
data_path = "./drive/MyDrive/Second Semester/SL/Project/Data/zip"
destination_path = "./drive/MyDrive/Second Semester/SL/Project/Data/datasets"

In [None]:
import os
import re
import pandas as pd
import json
import time
from collections import defaultdict

## Functions to retrieve categories, URLs and creator IDs

In [None]:
def get_categories(entry):
    """Extract the category (and sub-category, when present) from one row.

    The row's "category" field is parsed as JSON. When the parsed object has a
    "parent_name" key, that parent becomes "category" and the object's own
    "name" becomes "sub_category"; otherwise "name" alone is the "category".

    Args:
        entry: a row-like mapping (e.g. a pandas Series) exposing ``keys()``.

    Returns:
        A pandas Series with "category" and optionally "sub_category"; an
        empty Series when the row has no "category" field at all.
    """
    # Nothing to parse for rows that do not carry the field.
    if "category" not in entry.keys():
        return pd.Series()

    parsed = json.loads(entry["category"])

    # Top-level category: no parent, so there is no sub-category to report.
    if "parent_name" not in parsed.keys():
        return pd.Series({"category": parsed["name"]})

    return pd.Series({
        "category": parsed["parent_name"],
        "sub_category": parsed["name"],
    })

def get_urls(entry):
    """Return the project's web URL from the JSON stored in the row's "urls" field.

    Args:
        entry: a row-like mapping whose "urls" value is a JSON string holding
            a "web" object with a "project" key.

    Returns:
        A one-element pandas Series labelled "project_url".
    """
    url_tree = json.loads(entry["urls"])
    project_link = url_tree["web"]["project"]
    return pd.Series({"project_url": project_link})

def get_creator(entry):
    """Pull the numeric creator id out of the row's raw "creator" string.

    The id is located with a regular expression (the digits that follow
    ``"id":`` and precede a comma) rather than by parsing the JSON.

    Args:
        entry: a row-like mapping whose "creator" value is a string.

    Returns:
        A one-element pandas Series labelled "creator_id" holding an int.

    Raises:
        AttributeError: when the pattern is not found (``re.search`` returns None).
    """
    id_match = re.search(r"(?<=\"id\":)\d+(?=,)", entry["creator"])
    creator_id = int(id_match.group(0))
    return pd.Series({"creator_id": creator_id})

In [3]:
def preprocessing(df, categories_dict):
    """Build a tidy frame from one raw dump and fold its categories into ``categories_dict``.

    The tidy frame keeps the whitelisted raw columns that exist in ``df`` and
    appends the columns produced by ``get_categories``, ``get_urls`` and
    ``get_creator`` (so ``df`` must provide the "urls" and "creator" columns
    those helpers read, plus an "id" column).

    Rows whose "id" is already a key of ``categories_dict`` are removed from
    the returned frame, but their string category / sub-category values are
    still appended to that id's lists.

    Args:
        df: raw DataFrame for one input file.
        categories_dict: mapping ``id -> {"category": [...], "sub_category": [...]}``;
            it must create missing entries on access (e.g. a ``defaultdict``).
            It is mutated in place.

    Returns:
        Tuple ``(tidy_df, categories_dict)``.
    """
    columns_to_hold = ["id", "backers_count", "country", "fx_rate", "pledged", "usd_pledged", "currency", "goal",
                        "state", "state_changed_at", "created_at", "launched_at", "deadline", "disable_communication"] # creator.id, categories, urls

    # Select with a list: pandas >= 2.0 rejects set indexers, and a set also made
    # the column order of the output arbitrary from run to run.
    kept_columns = [column for column in columns_to_hold if column in df.columns]

    tidy_df = pd.concat(
        (
            df[kept_columns],
            df.apply(get_categories, axis=1),
            df.apply(get_urls, axis=1),
            df.apply(get_creator, axis=1),
        ),
        axis=1,
    )

    # Collect the labels of already-seen projects and drop them once after the
    # loop instead of mutating the frame while it is being iterated.
    already_seen_labels = []
    for row_label, row in tidy_df.iterrows():
        proj_id = row["id"]
        if proj_id in categories_dict:
            already_seen_labels.append(row_label)

        # NOTE(review): an id only becomes a key once a string category or
        # sub-category is appended below, so rows lacking both are never
        # treated as duplicates - confirm this is intended.
        if "category" in row.keys() and isinstance(row["category"], str):
            categories_dict[proj_id]["category"].append(row["category"])
        if "sub_category" in row.keys() and isinstance(row["sub_category"], str):
            categories_dict[proj_id]["sub_category"].append(row["sub_category"])

    tidy_df = tidy_df.drop(index=already_seen_labels)

    return tidy_df, categories_dict

In [None]:
# Extraction loop: read the raw dumps newest-first, tidy each one, and write the
# result in fixed-size CSV chunks until enough datapoints have been collected
# (or the input files run out).

# Rows per output chunk and total number of rows after which the run stops.
CHUNK_ROWS = 15000
MAX_DATAPOINTS = 250000

# project id -> every category / sub-category seen for it across the files.
indexes = defaultdict(lambda: {"category": [], "sub_category": []})
df_to_save = pd.DataFrame() # pd.read_csv(os.path.join(destination_path, f"before_error_on_{2340}.csv"))
# Holds the raw frame that made preprocessing fail, for inspection afterwards.
problematic_dfs = None

file_list = sorted(filter(lambda file: file[-4:]==".csv", os.listdir(data_path)), reverse=True)
file_id = 0
datapoints = 0
i = 0
failed = False

# Report progress roughly every 1% of the files; clamp to 1 so that fewer than
# 50 files does not turn the modulo below into a division by zero.
progress_step = max(1, round(len(file_list)/100))

start = time.perf_counter()
for i in range(len(file_list)):
    file = file_list[i]

    df = pd.read_csv(os.path.join(data_path, file))

    # Projects that are still live have no final outcome yet.
    df = df.drop(df[df["state"]=="live"].index)

    try:
        df, indexes = preprocessing(df, indexes)
    except Exception as error:
        # Catch Exception only, so a manual interrupt still stops the cell,
        # and report what actually went wrong before bailing out.
        print(f"error encountered at file {i}, saving what we have untill now")
        print(f"{type(error).__name__}: {error}")
        df_to_save.to_csv(os.path.join(destination_path, f"before_error_on_{i}.csv"))
        problematic_dfs=df
        failed = True
        break

    # The year is taken from the leading digits of the file name (before the first "-").
    df["year"] = int(re.match(r"\d+(?=-)", file).group())

    df_to_save = pd.concat([df_to_save, df], axis=0)
    datapoints += len(df)

    # Flush every complete chunk, not just one, so the buffer never carries
    # more than CHUNK_ROWS rows into the next iteration.
    while len(df_to_save) >= CHUNK_ROWS:
        df_to_save.iloc[:CHUNK_ROWS].to_csv(os.path.join(destination_path, f"file_{str(file_id).zfill(4)}.csv"))
        df_to_save = df_to_save.iloc[CHUNK_ROWS:]
        file_id += 1

    # once we reach the number of observations we want we stop; the final save happens below
    if datapoints >= MAX_DATAPOINTS:
        break

    if i>0 and i % progress_step == 0:
        print(f"Time elapsed: {time.perf_counter()-start}; remaning time: {(time.perf_counter()-start)/i*(len(file_list)-i)}")
        print(f"Done {round(i/len(file_list)*100)}% untill now, in total {i} files, datapoints: {datapoints}", end="\n\n")

# Final save: runs both when the target was reached and when the files ran out
# first, so the buffered rows and the category table are never silently lost.
if file_list and not failed:
    df_to_save.to_csv(os.path.join(destination_path, f"file_{str(file_id).zfill(4)}.csv"))
    pd.DataFrame.from_dict(indexes, orient="index").to_csv(os.path.join(destination_path, f"categories_df_untill_{file[:-4]}.csv"))
    print("------->  End, done untill", file[:-4])