In [13]:
!pip install pandas seaborn google-cloud-aiplatform tdqm google-generativeai tiktoken
!gcloud auth application-default login

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl.metadata (40 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading tiktoken-0.6.0-cp312-cp312-macosx_11_0_arm64.whl (922 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m922.4/922.4 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hDownloading regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl (278 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.5/278.5 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: regex, tiktoken
Successfully installed regex-2024.4.28 tiktoken-0.6.0
Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?res

In [15]:
import pandas as pd
import glob
import csv
import re
import seaborn as sns
import seaborn.objects as so
import matplotlib.pyplot as plt
import google.generativeai as genai
import os
import time
from tqdm import tqdm
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, FinishReason
import vertexai.preview.generative_models as generative_models
from google.api_core.exceptions import InternalServerError, ResourceExhausted
import json
from tqdm.auto import tqdm
import re
import tiktoken
tqdm.pandas()

In [2]:
vertexai.init(project="yolo-410704", location="us-central1")
gemini_pro_long = GenerativeModel("gemini-1.5-pro-preview-0409")
gemini_pro = GenerativeModel("gemini-pro")

In [3]:
dfs = []


def read_file(file):
    try:
        return open(directory + file + ".txt", "r").read()
    except:
        return None


for info_file in glob.glob("RusLit/**/info.csv", recursive=True):
    try:
        directory = info_file[:-8]
        info = pd.read_csv(info_file)
        info["author"] = directory.split("/")[2]
        info["type"] = directory.split("/")[1]
        info["text"] = info["name"].map(read_file)
        dfs.append(info)
    except:
        pass

df = pd.concat(dfs, ignore_index=True).dropna()
df["year"] = (
    df["year"].astype(str).map(lambda x: x.split("-")[0].split(".")[0]).astype(int)
)
df = df.dropna()

In [31]:
enc = tiktoken.encoding_for_model("gpt-4")
def try_generate(row):
    if len(enc.encode(row["text"])) < 64000:
        return True
    else:
        print(row["name"], len(enc.encode(row["text"])))
        return False

df_tolstoy = df[df["author"] == "Tolstoy"]
df_tolstoy["generate"] = df_tolstoy.progress_apply(try_generate, axis=1)
df_tolstoy = df_tolstoy[df_tolstoy["generate"] == True]

records = df_tolstoy[["name", "text"]].to_dict(orient='records')
custom_dict = {record['name']: record['text'] for record in records}

with open('tolstoy.json', 'w') as file:
    json.dump(custom_dict, file)

  0%|          | 0/68 [00:00<?, ?it/s]

Анна Каренина 866231
Война и мир. Том 1 400775
Война и мир. Том 2 418988
Война и мир. Том 3 443668
Война и мир. Том 4 427155
Воскресение 438084
Детство 93667
Казаки 147503
Крейцерова соната 73834
Отрочество 71896
Семейное счастье 79661
Хаджи-Мурат 115360
Юность 152104


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tolstoy["generate"] = df_tolstoy.progress_apply(try_generate, axis=1)


In [33]:
enc = tiktoken.encoding_for_model("gpt-4")
def try_generate(row):
    if len(enc.encode(row["text"])) < 64000:
        return True
    else:
        print(row["name"], len(enc.encode(row["text"])))
        return False

df_chekhov = df[df["author"] == "Chekhov"]
df_chekhov["generate"] = df_chekhov.progress_apply(try_generate, axis=1)
df_chekhov = df_chekhov[df_chekhov["generate"] == True]

records = df_chekhov[["name", "text"]].to_dict(orient='records')
custom_dict = {record['name']: record['text'] for record in records}

with open('chekhov.json', 'w') as file:
    json.dump(custom_dict, file)

  0%|          | 0/76 [00:00<?, ?it/s]

Дуэль 96846


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chekhov["generate"] = df_chekhov.progress_apply(try_generate, axis=1)


In [None]:
retry_limit = 5


def get_genres(text, name):
    try:
        for attempt in range(retry_limit):
            try:
                return (
                    gemini_pro_long.generate_content(
                        text
                        + "Please provide a comma-separated list of single-word genres that best describe the text above: "
                    )
                    .candidates[0]
                    .content.parts[0]
                    .text
                )
            except ResourceExhausted:
                time.sleep(60)
    except InternalServerError:
        print(f"InternalServerError on {name} text")
    except IndexError:
        print(f"IndexError on {name} text")
    for attempt in range(retry_limit):
        try:
            return (
                gemini_pro_long.generate_content(
                    name
                    + "Please provide a comma-separated list of single-word genres that best describe the text above: "
                )
                .candidates[0]
                .content.parts[0]
                .text
            )
        except ResourceExhausted:
            time.sleep(60)
    print(f"failed on {name}")
    return None


try:
    df = pd.read_csv("gemini.csv")
except:
    df["genres"] = [
        get_genres(text, title)
        for text, title in tqdm(zip(df["text"], df["name"]), desc="Generating genres")
    ]
    df.to_csv("gemini.csv", index=False)

In [None]:
df = df.dropna()
df["genres"] = df["genres"].map(lambda x: x.strip().lower().split(", "))

In [None]:
df_explode = df.explode("genres")
df_explode = df_explode[df_explode["genres"] != ""]
genre_trends = df_explode.groupby(["year", "genres"]).size().unstack(fill_value=0)
filtered_genre_trends = genre_trends[genre_trends.sum().sort_values()[-10:].index]
percentage_filtered_genre_trends = (
    filtered_genre_trends.divide(filtered_genre_trends.sum(axis=1), axis=0) * 100
)
percentage_filtered_genre_trends.dropna()
percentage_filtered_genre_trends = percentage_filtered_genre_trends.reset_index()
melted_percentage_filtered_genre_trends = percentage_filtered_genre_trends.melt(
    id_vars="year", var_name="genre", value_name="popularity"
)

In [None]:
so.Plot(
    melted_percentage_filtered_genre_trends, "year", "popularity", color="genre"
).add(so.Area(alpha=0.7), so.Stack())

In [None]:
window_size = 7

melted_percentage_filtered_genre_trends["popularity_smoothed"] = (
    melted_percentage_filtered_genre_trends.groupby("genre")["popularity"].transform(
        lambda x: x.rolling(window=window_size, center=True).mean()
    )
)

so.Plot(
    melted_percentage_filtered_genre_trends,
    "year",
    "popularity_smoothed",
    color="genre",
).add(so.Area(alpha=0.7), so.Stack()).layout(size=(16, 9)).save(
    "gemini.png", bbox_inches="tight"
)

In [None]:
# gemini_pro_long.generate_content(
#     "\"The play portrays the visit of an elderly professor and his glamorous, much younger second wife, Yelena, to the rural estate that supports their urban lifestyle. Two friends—Vanya, brother of the professor's late first wife, who has long managed the estate, and Astrov, the local doctor—both fall under Yelena's spell while bemoaning the ennui of their provincial existence. Sonya, the professor's daughter by his first wife, who has worked with Vanya to keep the estate going, suffers from her unrequited feelings for Astrov. Matters are brought to a crisis when the professor announces his intention to sell the estate, Vanya and Sonya's home, with a view to investing the proceeds to achieve a higher income for himself and his wife.\"\nRepeat each sentance with a catigorization after: horror short story humor social commentary satire fiction historical tragedy romance drama"
# ).candidates[0].content.parts[0].text

In [None]:
# gemini_pro_long.generate_content(
#     "\"The play portrays the visit of an elderly professor and his glamorous, much younger second wife, Yelena, to the rural estate that supports their urban lifestyle. Two friends—Vanya, brother of the professor's late first wife, who has long managed the estate, and Astrov, the local doctor—both fall under Yelena's spell while bemoaning the ennui of their provincial existence. Sonya, the professor's daughter by his first wife, who has worked with Vanya to keep the estate going, suffers from her unrequited feelings for Astrov. Matters are brought to a crisis when the professor announces his intention to sell the estate, Vanya and Sonya's home, with a view to investing the proceeds to achieve a higher income for himself and his wife.\"\nRewrite the abstract like Uncle Vanya was a political commentary satire of Russia in 1870:"
# ).candidates[0].content.parts[0].text