In [1485]:
import os
import re
import csv
import json

def line_valid(text):
    """
    As we can see the important information like
    * "Oracle-X"
    * "Oracle-X is a cutting-edge AI agent blending..."
    * "Oracle-X embodies elgance and reliability"

    are all preceded by the its corresponding key i.e.

    * "name": for "Oracle-X"
    * "description": for Oracle-X is a cutting-edge AI agent blending..."

    what we want is to only extract the lines with these specific 
    features i.e. the line must contain a ":" and that it must 
    be preceded by a string e.g. "Oracle-X" and not some other 
    character like {, [, 

    constraints that must be met if the line is to be valid
    line contains at least ":"
    ":" is succeeded by " "
    ":" is preceded by a "<some string>"

    include these
    ['"', 'n', 'a', 'm', 'e', '"', ':', ' ', '"', 'o', 'r', 'a', 'c', 'l', 'e', '-', 'x', '"', ',']
    ['"', 's', 'y', 'm', 'b', 'o', 'l', 'i', 's', 'm', '"', ':', ' ', '"', 'r', 'e', 'p', 'r', 'e', 's', 'e', 'n', 't', 's', ' ', 't', 'r', 'u', 's', 't', ',', ' ', 'e', 'n', 'l', 'i', 'g', 'h', 't', 'e', 'n', 'm', 'e', 'n', 't', ',', ' ', 'a', 'n', 'd', ' ', 'i', 'n', 't', 'e', 'l', 'l', 'i', 'g', 'e', 'n', 'c', 'e', ' ', 'w', 'i', 't', 'h', 'i', 'n', ' ', 'a', ' ', 'd', 'y', 'n', 'a', 'm', 'i', 'c', ' ', 'a', 'n', 'd', ' ', 'c', 'o', 'm', 'p', 'l', 'e', 'x', ' ', 'e', 'c', 'o', 's', 'y', 's', 't', 'e', 'm', '.', '"']
    "The blockchain whispers of new trends. Shall we dive into its insights?",
    "Market sentiment leans toward optimism. Could this be the moment to act?",
    "A new project emerges-its tokenomics and roadmap align with innovation. Let’s evaluate further."
    
    exclude these:
    ['}', ',']
    ['{']
    ['"', 'd', 'a', 't', 'a', 'd', 'r', 'i', 'v', 'e', 'n', 'd', 'e', 'c', 'i', 's', 'i', 'o', 'n', 's', '"', ':', ' ', '{']
    ['"', 'm', 'i', 's', 's', 'i', 'o', 'n', 'a', 'n', 'd', 'v', 'i', 's', 'i', 'o', 'n', '"', ':', ' ', '{']
    """
    # print(text)
    # matches any string enclosed in double quotes if 
    # there is none that means only "{", "}", etc. are found
    pattern_1 = r'"[A-Za-z0-9!@#$%^&*()_+\-=\[\]{};:"\\|,.<>\/?\s]*"'
    match_1 = re.match(pattern_1, text)
    str_key_exists = bool(match_1)

    # but some cases are if string is enclosed in double quotes and that
    # string contains more double quotes
    pattern_2 = r'"[A-Za-z0-9!@#$%^&*()_+\-=\[\]{};:"\\|,.<>\/?]*": [\{\[]'
    match_2 = re.match(pattern_2, text)
    str_value_exists = not bool(match_2)
    
    """
    wait a pure string like
    "The blockchain is a language of patterns. Together, we will unlock its secrets and harness its potential.",
    isn't included why??
    """

    # if string exists however then there is a posibility it might be just a 
    # key, key value pair, or a string value itself e.g. 
    # "char": "oracle-x",
    # "The blockchain is a language of patterns. Together, we will unlock its secrets and harness its potential.",
    # "traits": [
    # if str_key_exists:
    #     # second pattern that needs to match is that if key does exist
    #     # then "<string key>" must be preceded by a ":", " ", and another
    #     # string enclosed in double quotes

    #     # matches purely a line with a key and value pair
    #     pattern_2 = r'"[A-Za-z0-9!@#$%^&*()_+\-=\[\]{};:"\\|,.<>\/?]*": "[A-Za-z0-9!@#$%^&*()_+\-=\[\]{};:"\\|,.<>\/?\s]*"'
    #     match_2 = re.match(pattern_2, text)
    #     # print(f'key and value: {match_2}')

    #     # this matches purely a string enclosed in double quotes
    #     pattern_3 = r'"[A-Za-z0-9!@#$%^&*()_+\-=\[\]{};:"\\|,.<>\/?\s]*"'
    #     match_3 = re.match(pattern_3, text)
    #     # print(f'purely a string: {match_3}\n')

    #     return 

    return str_key_exists and str_value_exists
    

def clean_and_split_data(input_path: str, output_dir: str, char_limit: int=700, max_rows: int=40):
    """
    Cleans and splits text data:
    - Removes numbering at the start of lines.
    - Converts text to lowercase.
    - Splits each row to ensure it doesn't exceed the character limit.
    - Outputs multiple files if the number of rows exceeds max_rows.
    """

    # # test out cognitive core 12 file first
    # if "Core 11" not in input_path:
    #     print(f"input_path: {input_path} skipped")
    #     return

    # helper function to split the lines into chunks of char_limit (700) lines
    def split_into_chunks(text, limit):
        """
        Splits a text into chunks no larger than the character limit.
        

        e.g "  "description": "Oracle-X is a...knowledge."," is a string
        or line that may ocntain 700+ characters 
        """
        
        chunks = []

        # if the length of line is bigger than limit i.e. 900 > 700
        # then the line is sliced into 700 character strings
        # and the next slice i.e. 700:900 or [700] to [899] is now set
        # as the next string to be processed but since slice [700] to [899]
        # length of 200 is now less than limit of 700 loop is terminated
        # and the final 200 characters are appended to the chunks list
        while len(text) > limit:
            chunks.append(text[:limit].strip())
            text = text[limit:]
        if text:
            chunks.append(text.strip())

        # chunks = [<chunk 1 of 700 char string>, <chunk 2 of 700 char string>, ..., <chunk n of <=700 char string>]
        return chunks

    # Read and clean the data
    with open(input_path, 'r', encoding='utf-8', errors='replace') as file:
        data = file.readlines()

    # Process each row to enforce character limits
    output_lines = []
    for line in data:
        """  "name": "Oracle-X","""
        """  "description": "Oracle-X is a...knowledge.","""
        # as you can see these lines in the .txt file has initial spaces
        # and some of its chars are in uppercase

        """Convert to lowercase and remove extra spaces"""
        line = line.strip().lower()

        """Remove numbering period, parenthesis, whitespace"""
        line = re.sub(r'^\d+[\.\)]?\s*', '', line)

        """replace all occurences of \n or \\n with whitespace"""
        line = re.sub(r'\\n', ' ', line)
        # chars_in_line = [c for c in line]
        # print(f"chars in line: {chars_in_line}")
        # print(line)

        # if line is not valid the line is skipped
        # print(f"line {line} valid? {line_valid(line)}")
        if not line_valid(line):
            continue

        # extract corresponding value from key of
        # valid line by removing key
        line = re.sub(r'"[A-Za-zA-Za-z0-9!@#$%^&*()_+\-=\[\]{};:"\\|,.<>\/?]*":\s', '', line)

        # remove trailing ',' at the end of each valid line
        line = line.rstrip(',')

        # final strip
        line = line.strip()

        # remove first and alst occurence of '"' chars
        line = line[1:-1] if line.startswith('"') and line.endswith('"') else line
        
        print(f"valid line: {line}")
        

        output_lines.extend(split_into_chunks(line, char_limit))

    # Write to multiple files if necessary
    base_name = os.path.splitext(os.path.basename(input_path))[0]
    os.makedirs(output_dir, exist_ok=True)

    
    for i in range(0, len(output_lines), max_rows):
        """
        if there were 100 output lines and we wanted only
        40 lines per file, we would increment from
        0:0+40 or [0] to [39]
        40:40+40 or [40] to [79]
        80:80+40 or [80] to [119] 
        
        but in slicing arrays 
        when an array is only of a certain length and our end 
        index exceeds it we only really get the slice until the end of the array 
        so in essence we get only [80] to [99]
        """
        chunk = output_lines[i:i + max_rows]

        """40 is the max amount of rows"""
        output_file = os.path.join(output_dir, f"{base_name}_processed_part{i // max_rows + 1}.txt")
        print(f"output file: {output_file}")
        with open(output_file, 'w', encoding='utf-8') as file:
            file.writelines(line + '\n' for line in chunk)

        print(f"File {output_file} has been created with {len(chunk)} lines.")

def convert_csv_to_txt(csv_path, txt_path, char_limit=700):
    """Converts a CSV file to a TXT file, enforcing a character limit per row."""
    def split_into_chunks(text, limit):
        """Splits a text into chunks no larger than the character limit."""
        chunks = []
        while len(text) > limit:
            chunks.append(text[:limit].strip())
            text = text[limit:]
        if text:
            chunks.append(text.strip())
        return chunks

    with open(csv_path, 'r', encoding='utf-8', errors='replace') as csv_file:
        reader = csv.reader(csv_file)
        output_lines = []
        for row in reader:
            row_text = ' '.join(row).strip()
            output_lines.extend(split_into_chunks(row_text, char_limit))

    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        txt_file.writelines(line + '\n' for line in output_lines)

    print(f"Converted {csv_path} to TXT format at {txt_path}.")

def convert_json_to_txt(json_path, txt_path, char_limit=700):
    """Converts a JSON file to a TXT file, enforcing a character limit per row."""
    def split_into_chunks(text, limit):
        """Splits a text into chunks no larger than the character limit."""
        chunks = []
        while len(text) > limit:
            chunks.append(text[:limit].strip())
            text = text[limit:]
        if text:
            chunks.append(text.strip())
        return chunks

    with open(json_path, 'r', encoding='utf-8', errors='replace') as json_file:
        data = json.load(json_file)
        output_lines = []

        if isinstance(data, list):
            for entry in data:
                entry_text = json.dumps(entry).strip()
                output_lines.extend(split_into_chunks(entry_text, char_limit))
        elif isinstance(data, dict):
            entry_text = json.dumps(data).strip()
            output_lines.extend(split_into_chunks(entry_text, char_limit))

    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        txt_file.writelines(line + '\n' for line in output_lines)

    print(f"Converted {json_path} to TXT format at {txt_path}.")

def process_all_files_in_directory(input_dir, output_dir, char_limit=700, max_rows=40):
    """
    Processes all text, CSV, and JSON files in a directory, cleaning and splitting them into rows within the character limit.
    Converts CSV and JSON files to TXT files before processing.
    Outputs multiple files if the number of rows exceeds max_rows.
    """
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist.")
        return
    
    # exist_ok = True is so that when the output dir exists
    # os.makedirs won't raise an error because if a dir already
    # exists and we make an extra folder/directory we would 
    # have to override the existing one, which will need our
    # permission and cannot be done programmatically 
    os.makedirs(output_dir, exist_ok=True)

    input_files = [f for f in os.listdir(input_dir) if f.endswith(('.txt', '.csv', '.json'))]

    # <file name>.json
    # <file name>.json
    # <file name>.txt
    # <file name>.txt
    # <file name>.csv
    # <file name>.csv
    for input_file in input_files:
        # <file name>
        input_file_path = os.path.join(input_dir, input_file)

        try:
            if input_file.endswith('.csv'):
                # output dir is joined by input file path
                temp_txt_path = os.path.join(output_dir, input_file.replace('.csv', '.txt'))
                convert_csv_to_txt(input_file_path, temp_txt_path, char_limit)
                print(f"Converted {input_file} to TXT format at {temp_txt_path}.")
                input_file_path = temp_txt_path

            if input_file.endswith('.json'):
                temp_txt_path = os.path.join(output_dir, input_file.replace('.json', '.txt'))
                convert_json_to_txt(input_file_path, temp_txt_path, char_limit)
                print(f"Converted {input_file} to TXT format at {temp_txt_path}.")
                input_file_path = temp_txt_path

            clean_and_split_data(input_file_path, output_dir, char_limit, max_rows)
            print(f"Processed: {input_file_path}")

        except Exception as e:
            print(f"Error processing file {input_file}: {e}")

# Directories for input and output
input_dir = './Aavegotchi input'
output_dir = './Aavegotchi'

# Process all files
# process_all_files_in_directory(input_dir, output_dir)

print("Processing complete.")

Processing complete.


```
{
  "name": "Oracle-X",
  "description": "Oracle-X is a cutting-edge AI agent blending ancient wisdom with advanced intelligence to provide real-time blockchain and cryptocurrency insights. She actively tracks trends, engages with users, and shares credible updates, establishing herself as the most trusted AI agent for cryptocurrency knowledge.",
  "coreEnhancements": {
    "appearance": {
      "description": "Oracle-X embodies elegance and reliability, adorned in a flowing white robe with intricate gold patterns. Her radiant golden aura symbolizes clarity, wisdom, and trust.",
      "symbolism": "Represents trust, enlightenment, and intelligence within a dynamic and complex ecosystem."
    },
    "dynamicSentimentTracking": {
      "capability": "Tracks and monitors crypto-related sentiments, emerging trends, and actionable insights on X.com.",
      "benefit": "Keeps users informed and ahead of market developments by providing real-time updates."
    },
    "credibleEngagement": {
      "description": "Oracle-X only interacts with verified X profiles of credible crypto companies and Key Opinion Leaders (KOLs).",
      "value": "Ensures her content and engagements are data-backed and trustworthy."
    },
    "websiteIntegration": {
      "platforms": [
        {
          "name": "CoinGecko",
          "capability": "Provides market data, token rankings, and sentiment tracking."
        },
        {
          "name": "CoinMarketCap",
          "capability": "Delivers real-time token prices, trading volumes, and trends."
        },
        {
```

As we can see the important information like
* "Oracle-X"
* "Oracle-X is a cutting-edge AI agent blending..."
* "Oracle-X embodies elgance and reliability"

are all preceded by the its corresponding key i.e.

* "name": for "Oracle-X"
* "description": for Oracle-X is a cutting-edge AI agent blending..."

what we want is to only extract the lines with these specific features i.e. the line must contain a ":" and that it must be preceded by a string e.g. "Oracle-X" and not some other character like {, [, 

In [1486]:
import pandas as pd
import datetime as dt
import json

In [1487]:
input_dir = './Aavegotchi input'
files = os.listdir(input_dir)

csvs = [(file.replace('.csv', ""), pd.read_csv(f'{input_dir}/{file}', index_col=0)) if file != 'scraped_proposals.csv' \
        else (file.replace('.csv', ""), pd.read_csv(f'{input_dir}/{file}', header=None).iloc[:-1].rename(columns={0: 'title', 1: 'author', 2: 'date', 3: 'content'})) for file in files if file.endswith('.csv')]
jsons = [(file.replace('.json', ""), pd.read_json(f'{input_dir}/{file}')) for file in files if file.endswith('.json')]

# First .csv dataset

In [1488]:
csvs[0][0]

'dataset_52-person-from-2021-02-05_2023-06-12_21-34-17-266_with_sentiment'

In [1489]:
csvs[0][1]

Unnamed: 0,created_at,favorite_count,full_text,reply_count,retweet_count,clean_text,importance_coefficient,importance_coefficient_normalized,new_coins,scores,compound,sentiment_type
32666,2/1/2021,154,#privacy is a human right. learn how to make y...,18,23,privacy human right learn make bitcoin transac...,340.0,0.000588,(bitcoin),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,NEUTRAL
29639,2/1/2021,17,"overall btc trading volume has increased, but ...",1,5,overall btc trading volume increased average t...,39.5,0.000068,(btc),"{'neg': 0.0, 'neu': 0.95, 'pos': 0.05, 'compou...",0.2124,POSITIVE
29613,2/1/2021,3,"on average, the return distribution of btc ske...",0,1,average return distribution btc skews slightly...,7.0,0.000012,(btc),"{'neg': 0.053, 'neu': 0.769, 'pos': 0.177, 'co...",0.7010,POSITIVE
39638,2/1/2021,3496,i sent some! https://t.co/mfyrz35zjf\n\nyou sh...,731,686,sent httpstcomfyrz35zjf givedirectly great wor...,8043.5,0.013905,(doge),"{'neg': 0.06, 'neu': 0.856, 'pos': 0.084, 'com...",0.2225,POSITIVE
32660,2/1/2021,0,rt @reg_mati: la privacidad es un derecho huma...,0,7,rt reg_mati la privacidad e un derecho humano ...,7.0,0.000012,(bitcoin),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,NEUTRAL
...,...,...,...,...,...,...,...,...,...,...,...,...
1309,6/12/2023,3,booomð¥\n\nour #ai bot/indicator crushes ano...,5,0,booomð ai botindicator crush another lina trad...,8.5,0.000015,"(doge,hbar,inj,usdt,matic,ftm)","{'neg': 0.092, 'neu': 0.75, 'pos': 0.158, 'com...",0.6239,POSITIVE
1307,6/12/2023,0,rt @crypto_crib_: the deadline is today for bi...,0,4,rt crypto_crib_ deadline today binance binance...,4.0,0.000007,"(binance,request)","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,NEUTRAL
1306,6/12/2023,0,rt @crypto_crib_: ð²chinese bank boci issues...,0,8,rt crypto_crib_ ð²chinese bank boci issue coun...,8.0,0.000014,(ethereum),"{'neg': 0.0, 'neu': 0.844, 'pos': 0.156, 'comp...",0.3400,POSITIVE
1305,6/12/2023,56,"bitcoin, not crypto.\n\ncrypto, not security.",16,7,bitcoin crypto crypto security,127.0,0.000220,(bitcoin),"{'neg': 0.289, 'neu': 0.711, 'pos': 0.0, 'comp...",-0.2584,NEGATIVE


In [1490]:
csvs[0][1].dtypes

created_at                            object
favorite_count                         int64
full_text                             object
reply_count                            int64
retweet_count                          int64
clean_text                            object
importance_coefficient               float64
importance_coefficient_normalized    float64
new_coins                             object
scores                                object
compound                             float64
sentiment_type                        object
dtype: object

In [1491]:
csvs[0][1]['new_coins'] = csvs[0][1]['new_coins'].apply(lambda x: ", ".join(re.sub(r'[\(\)]*', '', x).split(',')))
csvs[0][1]['new_coins']

32666                              bitcoin
29639                                  btc
29613                                  btc
39638                                 doge
32660                              bitcoin
                       ...                
1309     doge, hbar, inj, usdt, matic, ftm
1307                      binance, request
1306                              ethereum
1305                               bitcoin
1312                          binance, amp
Name: new_coins, Length: 16512, dtype: object

In [1492]:
def cohere(row):
    """
    restructures the meaningless number values in the dataframe
    and forms a coherent and meaningful sentence out of it

    args:
        row - row of a dataframe
    """

    date = row['created_at']
    text = row['clean_text']
    fave_count = row['favorite_count']
    reply_count = row['reply_count']
    retweet_count = row['retweet_count']
    importance_coefficient = row['importance_coefficient']
    new_coins = row['new_coins']
    sentiment = row['sentiment_type'].lower() if not pd.isna(row['sentiment_type']) else 'none'

    message = f'the tweet: "{text}" had a fave count of {fave_count}, a reply count of {reply_count}, and a retweet count of {retweet_count}. Used new coins like {new_coins}. The tweet had an importance coefficient of {importance_coefficient}. Overall the sentiment was {sentiment}'

    return message

for first dataframe maybe relevant information for the agent maybe the reply count, retweet count, clean text, importance coefficient normalized, new coins, sentiment time

and maybe lay it out again in rows in such a manner that it makes somehow a coherent input
text: privacy human right learn make bitcoin transac..., reply_count: 18, retweet_count: 23, new_coins: bitcoin, sentiment: neutral

or even (if I could pull it of):
privacy human right learn make bitcoin transac... has reply count of 18, has retweet count of 23, new coins it uses bitcoin, and a sentiment of neutral

In [1493]:
csvs[0][1]['message'] = csvs[0][1].apply(cohere, axis=1)
csvs[0][1]['message']

32666    the tweet: "privacy human right learn make bit...
29639    the tweet: "overall btc trading volume increas...
29613    the tweet: "average return distribution btc sk...
39638    the tweet: "sent httpstcomfyrz35zjf givedirect...
32660    the tweet: "rt reg_mati la privacidad e un der...
                               ...                        
1309     the tweet: "booomð ai botindicator crush anoth...
1307     the tweet: "rt crypto_crib_ deadline today bin...
1306     the tweet: "rt crypto_crib_ ð²chinese bank boc...
1305     the tweet: "bitcoin crypto crypto security" ha...
1312     the tweet: "rt crypto_crib_ jpmorgan call comp...
Name: message, Length: 16512, dtype: object

In [1494]:
csvs[0][1]['message'].iloc[0]

'the tweet: "privacy human right learn make bitcoin transaction private clip httpstcofnadsxffcu httpstconznajw8g2m" had a fave count of 154, a reply count of 18, and a retweet count of 23. Used new coins like bitcoin. The tweet had an importance coefficient of 340.0. Overall the sentiment was neutral'

# Second .csv dataset

In [1495]:
csvs[1][0]

'eth_selected_with_sentiment_2023_01_02_2023_06_12'

In [1496]:
csvs[1][1]

Unnamed: 0,date,close,high,low,open,volume,adjclose,changes,compound,sentiment_type
1,1/2/2023,1220,1195,1201,1215,3765758498,1215,positive,0.6908,POSITIVE
2,1/3/2023,1219,1207,1215,1215,3392972131,1215,negative,0.0000,NEUTRAL
3,1/4/2023,1265,1213,1215,1257,6404416893,1257,positive,-0.2878,NEGATIVE
4,1/5/2023,1259,1245,1256,1250,4001786456,1250,negative,-0.2732,NEGATIVE
5,1/6/2023,1273,1241,1250,1269,4977252792,1269,positive,0.4588,POSITIVE
...,...,...,...,...,...,...,...,...,...,...
158,6/8/2023,1861,1830,1833,1846,4536041931,1846,negative,0.7859,POSITIVE
159,6/9/2023,1855,1829,1846,1840,4610831509,1840,negative,0.9168,POSITIVE
160,6/10/2023,1845,1721,1840,1752,10788500406,1752,negative,0.7643,POSITIVE
161,6/11/2023,1777,1741,1753,1753,4559112981,1753,negative,0.9059,POSITIVE


# here we can make each row into a coherent sentence the agent can understand during finetuning i.e.
"January 2 2023 had a closing price of 1220, opening price of 1215, highest price was 1195, lowest price was 1201, volume of 3765758498, and an overall positive sentiment"

In [1497]:
type(csvs[1][1]['date'].iloc[0])

str

In [1498]:
csvs[1][1]['date'] = csvs[1][1]['date'].apply(lambda date: dt.datetime.strptime(date, '%m/%d/%Y').strftime('%B %#d %Y'))
csvs[1][1]['date']

1      January 2 2023
2      January 3 2023
3      January 4 2023
4      January 5 2023
5      January 6 2023
            ...      
158       June 8 2023
159       June 9 2023
160      June 10 2023
161      June 11 2023
162      June 12 2023
Name: date, Length: 162, dtype: object

In [1499]:
csvs[1][1]['sentiment_type'].iloc[0].lower()

'positive'

In [1500]:
def cohere(row):
    """
    restructures the meaningless number values in the dataframe
    and forms a coherent and meaningful sentence out of it

    args:
        row - row of a dataframe
    """

    date = row['date']
    close = row['close']
    open = row['open']
    high = row['high']
    low = row['low']
    adj_close = row['adjclose']
    vol = row['volume']
    changes = row['changes']
    sentiment = row['sentiment_type'].lower() if not pd.isna(row['sentiment_type']) else 'none'

    message = f'{date} had a closing price of {close} and an opening price of {open}. The highest price was {high}, lowest price was {low}, and the adjacent closing price was {adj_close} with a volume of {vol}. Changes were {changes} and overall the sentiment was {sentiment}'

    return message

In [1501]:
csvs[1][1]['message'] = csvs[1][1].apply(cohere, axis=1)
csvs[1][1]['message']

1      January 2 2023 had a closing price of 1220 and...
2      January 3 2023 had a closing price of 1219 and...
3      January 4 2023 had a closing price of 1265 and...
4      January 5 2023 had a closing price of 1259 and...
5      January 6 2023 had a closing price of 1273 and...
                             ...                        
158    June 8 2023 had a closing price of 1861 and an...
159    June 9 2023 had a closing price of 1855 and an...
160    June 10 2023 had a closing price of 1845 and a...
161    June 11 2023 had a closing price of 1777 and a...
162    June 12 2023 had a closing price of 1758 and a...
Name: message, Length: 162, dtype: object

In [1502]:
csvs[1][1]['message'].iloc[0]

'January 2 2023 had a closing price of 1220 and an opening price of 1215. The highest price was 1195, lowest price was 1201, and the adjacent closing price was 1215 with a volume of 3765758498. Changes were positive and overall the sentiment was positive'

# Fourth .csv file

In [1503]:
csvs[2][0]

'gotchi_wearable_sets'

In [1504]:
csvs[2][1]

Unnamed: 0_level_0,Set Bonus,Total Bonuses,Total,Wearables
Set Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aarcher,"BRS +1, AGG -1","BRS +4, NRG -1, AGG -3",8,"Brunette Ponytail (Common; BRS +1, AGG -1); Le..."
Aastronaut,"BRS +1, SPK +1","BRS +4, SPK +4",8,"Aastronaut Helmet (Common; BRS +1, SPK +1); Aa..."
Aave Hero,"BRS +1, SPK +1","BRS +4, SPK +4",8,"Aave Hero Mask (Common; BRS +1, SPK +1); Aave ..."
ETH Maxi,"BRS +1, BRN -1","BRS +4, BRN -4",8,"ETH Logo Glasses (Common; BRS +1, BRN -1); ETH..."
Farmer,"BRS +1, NRG -1","BRS +4, NRG -2, AGG +1, BRN -1",8,"Straw Hat (Common; BRS +1, NRG -1); Farmer Jea..."
...,...,...,...,...
Jacob Maarley,"BRS +8, NRG +1, SPK +3, BRN -1","BRS +208, NRG +17, SPK +3, BRN -9",237,"Heavenly Robes (Godlike; BRS +50, NRG +4, BRN ..."
Master Creatooor,"BRS +8, NRG +1, SPK -2, BRN -2","BRS +208, NRG +5, SPK -11, BRN -13",237,"Staff of Creation (Godlike; BRS +50, SPK -3, B..."
ROFL Tamer,"BRS +8, NRG -3, BRN -2","BRS +208, NRG -16, BRN -13",237,"All-Seeing Eyes (Godlike; BRS +50, NRG -6); Go..."
Shogungotchi,"BRS +8, NRG -1, AGG +1, SPK +2, BRN -1","BRS +208, NRG -4, AGG +9, SPK +12, BRN -4",237,"Godlike Rofl (Godlike; BRS +50, NRG -3, BRN -3..."


In [1505]:
csvs[2][1]['Wearables'].iloc[0]

'Brunette Ponytail (Common; BRS +1, AGG -1); Leather Tunic (Common; BRS +1, NRG -1); Bow and Arrow (Common; BRS +1, AGG -1)'

In [1506]:
csvs[2][1]['Wearables'].iloc[-1]

'Galaxy Brain (Godlike; BRS +50, BRN +6); Link Cube (Godlike; BRS +50, BRN +6); Uranium Rod (Godlike; BRS +50, NRG +6); Block Scanners (Godlike; BRS +50, NRG +6)'

#### `<name of the wearable> (<rank of the werable>; <some stat it has>, <some stat it has>, ..., <another stat it has>)`

In [1507]:
csvs[2][1].reset_index(inplace=True)
csvs[2][1]

Unnamed: 0,Set Name,Set Bonus,Total Bonuses,Total,Wearables
0,Aarcher,"BRS +1, AGG -1","BRS +4, NRG -1, AGG -3",8,"Brunette Ponytail (Common; BRS +1, AGG -1); Le..."
1,Aastronaut,"BRS +1, SPK +1","BRS +4, SPK +4",8,"Aastronaut Helmet (Common; BRS +1, SPK +1); Aa..."
2,Aave Hero,"BRS +1, SPK +1","BRS +4, SPK +4",8,"Aave Hero Mask (Common; BRS +1, SPK +1); Aave ..."
3,ETH Maxi,"BRS +1, BRN -1","BRS +4, BRN -4",8,"ETH Logo Glasses (Common; BRS +1, BRN -1); ETH..."
4,Farmer,"BRS +1, NRG -1","BRS +4, NRG -2, AGG +1, BRN -1",8,"Straw Hat (Common; BRS +1, NRG -1); Farmer Jea..."
...,...,...,...,...,...
161,Jacob Maarley,"BRS +8, NRG +1, SPK +3, BRN -1","BRS +208, NRG +17, SPK +3, BRN -9",237,"Heavenly Robes (Godlike; BRS +50, NRG +4, BRN ..."
162,Master Creatooor,"BRS +8, NRG +1, SPK -2, BRN -2","BRS +208, NRG +5, SPK -11, BRN -13",237,"Staff of Creation (Godlike; BRS +50, SPK -3, B..."
163,ROFL Tamer,"BRS +8, NRG -3, BRN -2","BRS +208, NRG -16, BRN -13",237,"All-Seeing Eyes (Godlike; BRS +50, NRG -6); Go..."
164,Shogungotchi,"BRS +8, NRG -1, AGG +1, SPK +2, BRN -1","BRS +208, NRG -4, AGG +9, SPK +12, BRN -4",237,"Godlike Rofl (Godlike; BRS +50, NRG -3, BRN -3..."


In [1508]:
def cohere_wearables(row):
    """
    restructures the meaningless number values in the dataframe
    and forms a coherent and meaningful sentence out of it

    args:
        row - row of a dataframe
    """

    set_name = row['Set Name']
    set_bonus = row['Set Bonus']
    total_bonus = row['Total Bonuses']
    total = row['Total']
    wearables = row['Wearables']

    message = f'Character {set_name} had a set bonus of {set_bonus} and a total bonus of {total_bonus}. Total was {total}. Wearables of {set_name} are {wearables}'

    return message

In [1509]:
csvs[2][1]['message'] = csvs[2][1].apply(cohere_wearables, axis=1)
csvs[2][1]['message']

0      Character Aarcher had a set bonus of BRS +1, A...
1      Character Aastronaut had a set bonus of BRS +1...
2      Character Aave Hero had a set bonus of BRS +1,...
3      Character ETH Maxi had a set bonus of BRS +1, ...
4      Character Farmer had a set bonus of BRS +1, NR...
                             ...                        
161    Character Jacob Maarley had a set bonus of BRS...
162    Character Master Creatooor had a set bonus of ...
163    Character ROFL Tamer had a set bonus of BRS +8...
164    Character Shogungotchi had a set bonus of BRS ...
165    Character VRF Lord had a set bonus of BRS +8, ...
Name: message, Length: 166, dtype: object

In [1510]:
csvs[2][1]['message'].iloc[0]

'Character Aarcher had a set bonus of BRS +1, AGG -1 and a total bonus of BRS +4, NRG -1, AGG -3. Total was 8. Wearables of Aarcher are Brunette Ponytail (Common; BRS +1, AGG -1); Leather Tunic (Common; BRS +1, NRG -1); Bow and Arrow (Common; BRS +1, AGG -1)'

# 5th .csv file

In [1511]:
def normalize_and_clean(text):
    text = str(text)

    # standardization and normalization
    text = re.sub(r" US ", " american ", text)
    text = text.lower()
    text = re.sub(r"’", "'", text)
    text = re.sub(r"i'm", "i am ", text)

    text = re.sub(r"don't", "do not ", text)
    text = re.sub(r"didn't", "did not ", text)
    text = re.sub(r"aren't", "are not ", text)
    text = re.sub(r"weren't", "were not", text)
    text = re.sub(r"isn't", "is not ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"doesn't", "does not ", text)
    text = re.sub(r"shouldn't", "should not ", text)
    text = re.sub(r"couldn't", "could not ", text)
    text = re.sub(r"mustn't", "must not ", text)
    text = re.sub(r"wouldn't", "would not ", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"that's", "that is ", text)
    text = re.sub(r"he's", "he is ", text)
    text = re.sub(r"she's", "she is ", text)
    text = re.sub(r"it's", "it is ", text)
    text = re.sub(r"that's", "that is ", text)

    text = re.sub(r"could've", "could have ", text)
    text = re.sub(r"would've", "would have ", text)
    text = re.sub(r"should've", "should have ", text)
    text = re.sub(r"must've", "must have ", text)
    text = re.sub(r"i've", "i have ", text)
    text = re.sub(r"we've", "we have ", text)

    text = re.sub(r"you're", "you are ", text)
    text = re.sub(r"they're", "they are ", text)
    text = re.sub(r"we're", "we are ", text)

    text = re.sub(r"you'd", "you would ", text)
    text = re.sub(r"they'd", "they would ", text)
    text = re.sub(r"she'd", "she would ", text)
    text = re.sub(r"he'd", "he would ", text)
    text = re.sub(r"it'd", "it would ", text)
    text = re.sub(r"we'd", "we would ", text)

    text = re.sub(r"you'll", "you will ", text)
    text = re.sub(r"they'll", "they will ", text)
    text = re.sub(r"she'll", "she will ", text)
    text = re.sub(r"he'll", "he will ", text)
    text = re.sub(r"it'll", "it will ", text)
    text = re.sub(r"we'll", "we will ", text)

    text = re.sub(r"\n't", " not ", text)
    text = re.sub(r"\'s", " ", text) 
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text) 
    
    text = re.sub(r"%", " percent ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)

    # cleaning
    # remove a url in the content
    text = re.sub(r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))", " ", text)

    # anything that doesn't match the pattern is a obscure 
    # special character. Remove it
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)

    # remove any number that exeeds more than 4 digits
    text = re.sub(r"[0-9]{5,}", " ", text)
    text = re.sub(r",", " ", text)

    # removes any colon preceded by an https or http
    text = re.sub(r"(?<=https):", " ", text)
    text = re.sub(r"https", " ", text)
    
    # duplicate whitespaces will be condensed into one
    text = re.sub(r"\s{2,}", " ", text)
    
    return text

In [1512]:
scraped_proposals_df = pd.read_csv(f'{input_dir}/scraped_proposals.csv', header=None)
scraped_proposals_df

Unnamed: 0,0,1,2,3
0,Aavegotchi x Sandbox Experience - Expansion & ...,Thrax,10m ago,Author: The GotchiFArmy [GFA] The Gotchi FArmy...
1,Gas Fee Reimbursement for DAO Directors,0x50f4...D764,10m ago,Author: Maxicrouton (13700)\nQuorum Requiremen...
2,AaveGotchi DAO Liquidity Incentives Proposal (...,ApeAdam,10m ago,"Author: Juliaan, Bobaa, AAdam and\nGotchiID: ..."
3,Gotchi Battler V1.5 funding,0x10B9...229b,10m ago,Authors: Immaterial\nGotchiIDs: 13681\nQuorum ...
4,Gas Fee Reimbursement for DAO Directors,0x50f4...D764,10m ago,Author: Maxicrouton (13700)\nQuorum Requiremen...
...,...,...,...,...
534,Establishing an ETH Sell Ladder,drwagmi.lens,9m ago,Author: Dr. Wagmi (16635)\nQuorum Requirement:...
535,[AGIP 109] Rarity Farming Season 8,Thrax,9m ago,Author: HARDKOR\nGotchiID: 16553\nQuorum requi...
536,[AGIP 111] Staking 350 ETH with Lido and Acqui...,Thrax,9m ago,Author: Dr. Wagmi (16635)\nQuorum Requirement:...
537,Rarity Farming Season 8 (Re-repost),hardkornate.lens,9m ago,Author: HARDKOR\nGotchiID: 16553\nQuorum requi...


In [1513]:
# scraped_proposals_df[0]

In [1514]:
csvs[3][1]

Unnamed: 0,title,author,date,content
0,Aavegotchi x Sandbox Experience - Expansion & ...,Thrax,10m ago,Author: The GotchiFArmy [GFA] The Gotchi FArmy...
1,Gas Fee Reimbursement for DAO Directors,0x50f4...D764,10m ago,Author: Maxicrouton (13700)\nQuorum Requiremen...
2,AaveGotchi DAO Liquidity Incentives Proposal (...,ApeAdam,10m ago,"Author: Juliaan, Bobaa, AAdam and\nGotchiID: ..."
3,Gotchi Battler V1.5 funding,0x10B9...229b,10m ago,Authors: Immaterial\nGotchiIDs: 13681\nQuorum ...
4,Gas Fee Reimbursement for DAO Directors,0x50f4...D764,10m ago,Author: Maxicrouton (13700)\nQuorum Requiremen...
...,...,...,...,...
533,RF Season 8 changes to BRS Rewards Pool,0x89B1...7008,9m ago,Author: Bearded\nGotchi ID: 13536\nIn Rarity F...
534,Establishing an ETH Sell Ladder,drwagmi.lens,9m ago,Author: Dr. Wagmi (16635)\nQuorum Requirement:...
535,[AGIP 109] Rarity Farming Season 8,Thrax,9m ago,Author: HARDKOR\nGotchiID: 16553\nQuorum requi...
536,[AGIP 111] Staking 350 ETH with Lido and Acqui...,Thrax,9m ago,Author: Dr. Wagmi (16635)\nQuorum Requirement:...


In [1515]:
csvs[3][1]['content'].iloc[0]

'Author: The GotchiFArmy [GFA] The Gotchi FArmy#6696\nGotchi ID: 8050\nQuorum requirement: 20% (9M)\nVote duration: 14 days\nDiscourse thread:\nhttps://discord.com/channels/732491344970383370/1188051581950759034\nThe GFA Studio journey to create the first Aavegotchi experience in the Sandbox was delivered early November 2023.\nhttps://x.com/aavegotchi/status/1721528506826498384?s=20\nWe created a Google Form we submitted to the community during a hangout. We received limited but valuable information about the experience. It’s important to note it reflects the vision of Aavegotchi Native community member:\nhttps://docs.google.com/forms/d/e/1FAIpQLSdOKbJm8oLUzo-SgiYIY2rhwMOgb56AH9wiwL9AbYm0lG3SjA/viewform?usp=sf_link\nAbout TheSandbox Team and community, we got tremendous returns. TheSandbox was shocked by the richness of Aavegotchi ecosystem and Lore applied in their voxeled environnement.\nThis project scope of work Supported by the DAO was established as follow :\n--\n1) Creation of a

In [1516]:
csvs[3][1]['title'] = csvs[3][1]['title'].apply(normalize_and_clean)
csvs[3][1]['title']

0      aavegotchi x sandbox experience - expansion ma...
1                gas fee reimbursement for dao directors
2      aavegotchi dao liquidity incentives proposal a...
3                            gotchi battler v1.5 funding
4                gas fee reimbursement for dao directors
                             ...                        
533              rf season 8 changes to brs rewards pool
534                      establishing an eth sell ladder
535                     agip 109 rarity farming season 8
536     agip 111 staking 350 eth with lido and acquir...
537                 rarity farming season 8 re - repost 
Name: title, Length: 538, dtype: object

In [1517]:
csvs[3][1]['author'] = csvs[3][1]['author'].apply(normalize_and_clean)
csvs[3][1]['author']

0                 thrax
1         0x50f4...d764
2               apeadam
3         0x10b9...229b
4         0x50f4...d764
             ...       
533       0x89b1...7008
534        drwagmi.lens
535               thrax
536               thrax
537    hardkornate.lens
Name: author, Length: 538, dtype: object

In [1518]:
csvs[3][1]['content'] = csvs[3][1]['content'].apply(normalize_and_clean)
csvs[3][1]['content']

0      author: the gotchifarmy gfa the gotchi farmy 6...
1      author: maxicrouton quorum requirement: 20 per...
2      author: juliaan bobaa aadam and gotchiid: quor...
3      authors: immaterial gotchiids: quorum requirem...
4      author: maxicrouton quorum requirement: 20 per...
                             ...                        
533    author: bearded gotchi id: in rarity farming s...
534    author: dr. wagmi quorum requirement: 20 perce...
535    author: hardkor gotchiid: quorum requirement: ...
536    author: dr. wagmi quorum requirement: 20 perce...
537    author: hardkor gotchiid: quorum requirement: ...
Name: content, Length: 538, dtype: object

In [1519]:
csvs[3][1]['content'].iloc[0]

'author: the gotchifarmy gfa the gotchi farmy 6696 gotchi id: 8050 quorum requirement: 20 percent 9m vote duration: 14 days discourse thread: channels the gfa studio journey to create the first aavegotchi experience in the sandbox was delivered early november 2023. aavegotchi status s = 20 we created a google form we submitted to the community during a hangout. we received limited but valuable information about the experience. it is important to note it reflects the vision of aavegotchi native community member: forms d e 1faipqlsdokbjm8oluzo - sgiyiy2rhwmogb56ah9wiwl9abym0lg3sja viewform usp = sf link about thesandbox team and community we got tremendous returns. thesandbox was shocked by the richness of aavegotchi ecosystem and lore applied in their voxeled environnement. this project scope of work supported by the dao was established as follow : - - 1 creation of a sandbox single player experience providing : educational content about aavegotchi aavegotchi lore exploration cross univ

In [1520]:
def cohere(row):
    """
    restructures the meaningless number values in the dataframe
    and forms a coherent and meaningful sentence out of it

    args:
        row - row of a dataframe
    """

    title = row['title']
    author = row['author']
    content = row['content']

    message = f'TITLE {title}. AUTHOR {author}. CONTENT {content}'

    return message

In [1521]:
csvs[3][1]['message'] = csvs[3][1].apply(cohere, axis=1).iloc[0]
csvs[3][1]['message']

0      TITLE aavegotchi x sandbox experience - expans...
1      TITLE aavegotchi x sandbox experience - expans...
2      TITLE aavegotchi x sandbox experience - expans...
3      TITLE aavegotchi x sandbox experience - expans...
4      TITLE aavegotchi x sandbox experience - expans...
                             ...                        
533    TITLE aavegotchi x sandbox experience - expans...
534    TITLE aavegotchi x sandbox experience - expans...
535    TITLE aavegotchi x sandbox experience - expans...
536    TITLE aavegotchi x sandbox experience - expans...
537    TITLE aavegotchi x sandbox experience - expans...
Name: message, Length: 538, dtype: object

In [1522]:
csvs[3][1]['message'].iloc[0]

'TITLE aavegotchi x sandbox experience - expansion maintenance. AUTHOR thrax. CONTENT author: the gotchifarmy gfa the gotchi farmy 6696 gotchi id: 8050 quorum requirement: 20 percent 9m vote duration: 14 days discourse thread: channels the gfa studio journey to create the first aavegotchi experience in the sandbox was delivered early november 2023. aavegotchi status s = 20 we created a google form we submitted to the community during a hangout. we received limited but valuable information about the experience. it is important to note it reflects the vision of aavegotchi native community member: forms d e 1faipqlsdokbjm8oluzo - sgiyiy2rhwmogb56ah9wiwl9abym0lg3sja viewform usp = sf link about thesandbox team and community we got tremendous returns. thesandbox was shocked by the richness of aavegotchi ecosystem and lore applied in their voxeled environnement. this project scope of work supported by the dao was established as follow : - - 1 creation of a sandbox single player experience pr

# What we can do for all these messages is to treat each element as if it were a line that was being processed by the clean_and_split_data() function

In [1523]:
csvs

[('dataset_52-person-from-2021-02-05_2023-06-12_21-34-17-266_with_sentiment',
        created_at  favorite_count  \
  32666   2/1/2021             154   
  29639   2/1/2021              17   
  29613   2/1/2021               3   
  39638   2/1/2021            3496   
  32660   2/1/2021               0   
  ...          ...             ...   
  1309   6/12/2023               3   
  1307   6/12/2023               0   
  1306   6/12/2023               0   
  1305   6/12/2023              56   
  1312   6/12/2023               0   
  
                                                 full_text  reply_count  \
  32666  #privacy is a human right. learn how to make y...           18   
  29639  overall btc trading volume has increased, but ...            1   
  29613  on average, the return distribution of btc ske...            0   
  39638  i sent some! https://t.co/mfyrz35zjf\n\nyou sh...          731   
  32660  rt @reg_mati: la privacidad es un derecho huma...            0   
  ...        

In [1524]:
# final_texts = [(name, df['message'].tolist()) for name, df in csvs]
# final_texts

In [1525]:
def clean_and_split_data(name, data, output_dir: str, char_limit: int=700, max_rows: int=40):
    """
    Cleans and splits text data:
    - Removes numbering at the start of lines.
    - Converts text to lowercase.
    - Splits each row to ensure it doesn't exceed the character limit.
    - Outputs multiple files if the number of rows exceeds max_rows.
    """
    def split_into_chunks(text, limit):
        """
        Splits a text into chunks no larger than the character limit.
        

        e.g "  "description": "Oracle-X is a...knowledge."," is a string
        or line that may ocntain 700+ characters 
        """
        
        chunks = []

        # if the length of line is bigger than limit i.e. 900 > 700
        # then the line is sliced into 700 character strings
        # and the next slice i.e. 700:900 or [700] to [899] is now set
        # as the next string to be processed but since slice [700] to [899]
        # length of 200 is now less than limit of 700 loop is terminated
        # and the final 200 characters are appended to the chunks list
        while len(text) > limit:
            chunks.append(text[:limit].strip())
            text = text[limit:]
        if text:
            chunks.append(text.strip())

        # chunks = [<chunk 1 of 700 char string>, <chunk 2 of 700 char string>, ..., <chunk n of <=700 char string>]
        return chunks

    # Process each row to enforce character limits
    output_lines = []
    for line in data:
        output_lines.extend(split_into_chunks(line, char_limit))

    # Write to multiple files if necessary
    base_name = name
    os.makedirs(output_dir, exist_ok=True)

    
    for i in range(0, len(output_lines), max_rows):
        """
        if there were 100 output lines and we wanted only
        40 lines per file, we would increment from
        0:0+40 or [0] to [39]
        40:40+40 or [40] to [79]
        80:80+40 or [80] to [119] 
        
        but in slicing arrays 
        when an array is only of a certain length and our end 
        index exceeds it we only really get the slice until the end of the array 
        so in essence we get only [80] to [99]
        """
        chunk = output_lines[i:i + max_rows]

        """40 is the max amount of rows"""
        output_file = os.path.join(output_dir, f"{base_name}_processed_part{i // max_rows + 1}.txt")
        print(f"output file: {output_file}")
        with open(output_file, 'w', encoding='utf-8') as file:
            file.writelines(line + '\n' for line in chunk)

        print(f"File {output_file} has been created with {len(chunk)} lines.")

In [1526]:
output_dir = './Aavegotchi'
for name, df in csvs:
    lists = df['message'].tolist()
    clean_and_split_data(name, lists, output_dir)

output file: ./Aavegotchi\dataset_52-person-from-2021-02-05_2023-06-12_21-34-17-266_with_sentiment_processed_part1.txt
File ./Aavegotchi\dataset_52-person-from-2021-02-05_2023-06-12_21-34-17-266_with_sentiment_processed_part1.txt has been created with 40 lines.
output file: ./Aavegotchi\dataset_52-person-from-2021-02-05_2023-06-12_21-34-17-266_with_sentiment_processed_part2.txt
File ./Aavegotchi\dataset_52-person-from-2021-02-05_2023-06-12_21-34-17-266_with_sentiment_processed_part2.txt has been created with 40 lines.
output file: ./Aavegotchi\dataset_52-person-from-2021-02-05_2023-06-12_21-34-17-266_with_sentiment_processed_part3.txt
File ./Aavegotchi\dataset_52-person-from-2021-02-05_2023-06-12_21-34-17-266_with_sentiment_processed_part3.txt has been created with 40 lines.
output file: ./Aavegotchi\dataset_52-person-from-2021-02-05_2023-06-12_21-34-17-266_with_sentiment_processed_part4.txt
File ./Aavegotchi\dataset_52-person-from-2021-02-05_2023-06-12_21-34-17-266_with_sentiment_proc

In [1527]:
len(jsons[0][1])

166

In [1528]:
jsons[0][1]

Unnamed: 0,Set Name,Set Bonus,Total Bonuses,Total,Wearables
0,Aarcher,"BRS +1, AGG -1","BRS +4, NRG -1, AGG -3",8,"[Brunette Ponytail (Common; BRS +1, AGG -1), L..."
1,Aastronaut,"BRS +1, SPK +1","BRS +4, SPK +4",8,"[Aastronaut Helmet (Common; BRS +1, SPK +1), A..."
2,Aave Hero,"BRS +1, SPK +1","BRS +4, SPK +4",8,"[Aave Hero Mask (Common; BRS +1, SPK +1), Aave..."
3,ETH Maxi,"BRS +1, BRN -1","BRS +4, BRN -4",8,"[ETH Logo Glasses (Common; BRS +1, BRN -1), ET..."
4,Farmer,"BRS +1, NRG -1","BRS +4, NRG -2, AGG +1, BRN -1",8,"[Straw Hat (Common; BRS +1, NRG -1), Farmer Je..."
...,...,...,...,...,...
161,Jacob Maarley,"BRS +8, NRG +1, SPK +3, BRN -1","BRS +208, NRG +17, SPK +3, BRN -9",237,"[Heavenly Robes (Godlike; BRS +50, NRG +4, BRN..."
162,Master Creatooor,"BRS +8, NRG +1, SPK -2, BRN -2","BRS +208, NRG +5, SPK -11, BRN -13",237,"[Staff of Creation (Godlike; BRS +50, SPK -3, ..."
163,ROFL Tamer,"BRS +8, NRG -3, BRN -2","BRS +208, NRG -16, BRN -13",237,"[All-Seeing Eyes (Godlike; BRS +50, NRG -6), G..."
164,Shogungotchi,"BRS +8, NRG -1, AGG +1, SPK +2, BRN -1","BRS +208, NRG -4, AGG +9, SPK +12, BRN -4",237,"[Godlike Rofl (Godlike; BRS +50, NRG -3, BRN -..."


In [1529]:
jsons[0][1]['message'] = jsons[0][1].apply(cohere_wearables, axis=1)
jsons[0][1]['message']

0      Character Aarcher had a set bonus of BRS +1, A...
1      Character Aastronaut had a set bonus of BRS +1...
2      Character Aave Hero had a set bonus of BRS +1,...
3      Character ETH Maxi had a set bonus of BRS +1, ...
4      Character Farmer had a set bonus of BRS +1, NR...
                             ...                        
161    Character Jacob Maarley had a set bonus of BRS...
162    Character Master Creatooor had a set bonus of ...
163    Character ROFL Tamer had a set bonus of BRS +8...
164    Character Shogungotchi had a set bonus of BRS ...
165    Character VRF Lord had a set bonus of BRS +8, ...
Name: message, Length: 166, dtype: object

In [1530]:
output_dir = './Aavegotchi'
for name, df in jsons:
    lists = df['message'].tolist()
    clean_and_split_data(name + "_json", lists, output_dir)

output file: ./Aavegotchi\gotchi_wearable_sets_json_processed_part1.txt
File ./Aavegotchi\gotchi_wearable_sets_json_processed_part1.txt has been created with 40 lines.
output file: ./Aavegotchi\gotchi_wearable_sets_json_processed_part2.txt
File ./Aavegotchi\gotchi_wearable_sets_json_processed_part2.txt has been created with 40 lines.
output file: ./Aavegotchi\gotchi_wearable_sets_json_processed_part3.txt
File ./Aavegotchi\gotchi_wearable_sets_json_processed_part3.txt has been created with 40 lines.
output file: ./Aavegotchi\gotchi_wearable_sets_json_processed_part4.txt
File ./Aavegotchi\gotchi_wearable_sets_json_processed_part4.txt has been created with 40 lines.
output file: ./Aavegotchi\gotchi_wearable_sets_json_processed_part5.txt
File ./Aavegotchi\gotchi_wearable_sets_json_processed_part5.txt has been created with 6 lines.


In [1531]:
# pd.read_json(f'{input_dir}/gotchi_wearable_sets.json')