In [15]:
import google.generativeai as genai

In [11]:
import getpass
import os

# Never store credentials in the notebook itself: read the key from the
# GOOGLE_API_KEY environment variable and fall back to an interactive,
# non-echoing prompt when the variable is unset or empty.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Google API key: ")
genai.configure(api_key=GOOGLE_API_KEY)

# Create a model instance (Gemini 1.5 Flash, latest revision)
model = genai.GenerativeModel('gemini-1.5-flash-latest')

In [14]:
# Example: generate a response to a single prompt and print the reply text.
# Running this right after configuration also shows whether the model is reachable.
response = model.generate_content("Write a hello world message")
print(response.text)

```
Hello, world!
```


In [16]:
from sklearn.datasets import fetch_20newsgroups

# Load the predefined "train" and "test" subsets of the 20 Newsgroups dataset.
newsgroups_train = fetch_20newsgroups(subset="train")
newsgroups_test = fetch_20newsgroups(subset="test")

# View list of class names for dataset
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [17]:
# Inspect one raw post (headers followed by the body) before any cleaning is applied.
print(newsgroups_train.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [18]:
import email
import re
import pandas as pd

def preprocess_newsgroup_row(data, max_chars=5000):
    """Reduce one raw newsgroup post to its subject line and body text.

    The post is parsed as an email-style message, so every header other than
    ``Subject`` is dropped. Anything that looks like an email address is then
    removed and the result is truncated.

    Args:
        data: Raw post text (headers, a blank line, then the body).
        max_chars: Maximum length of the returned text. Defaults to 5,000
            characters, the limit that used to be hard-coded here.

    Returns:
        The subject, a blank line and the body as one string, with email
        addresses stripped and cut to at most ``max_chars`` characters. A post
        without a ``Subject`` header gets an empty subject rather than the
        literal text "None".
    """
    msg = email.message_from_string(data)

    # A missing header is returned as None; do not let that leak into the text.
    subject = msg["Subject"] or ""

    # get_payload() returns a list of sub-messages for multipart posts, which
    # would otherwise be formatted as a list repr. walk() yields the message
    # itself when it is not multipart, so plain posts are handled as before.
    body = "\n".join(
        part.get_payload() for part in msg.walk() if not part.is_multipart()
    )

    # Extract only the subject and body
    text = f"{subject}\n\n{body}"
    # Strip any remaining email addresses
    text = re.sub(r"[\w\.-]+@[\w\.-]+", "", text)
    # Truncate each entry to max_chars characters
    return text[:max_chars]


def preprocess_newsgroup_data(newsgroup_dataset):
    """Turn a fetched newsgroup split into a cleaned, labelled DataFrame.

    Args:
        newsgroup_dataset: Object exposing ``data`` (raw posts), ``target``
            (integer labels) and ``target_names`` (indexed by label).

    Returns:
        DataFrame with the columns "Text" (each post passed through
        ``preprocess_newsgroup_row``), "Label" (the integer target) and
        "Class Name" (the ``target_names`` entry for that label).
    """

    def class_name_for(label_index):
        # Match label to target name index
        return newsgroup_dataset.target_names[label_index]

    # Put data points into dataframe
    frame = pd.DataFrame(
        {"Text": newsgroup_dataset.data, "Label": newsgroup_dataset.target}
    )
    # Clean up the text
    frame["Text"] = frame["Text"].apply(preprocess_newsgroup_row)
    frame["Class Name"] = frame["Label"].map(class_name_for)
    return frame

In [19]:
# Apply preprocessing function to training and test datasets
# (each call builds a new frame with "Text", "Label" and "Class Name" columns).
df_train = preprocess_newsgroup_data(newsgroups_train)
df_test = preprocess_newsgroup_data(newsgroups_test)

# Preview the first rows of the cleaned training frame.
df_train.head()

Unnamed: 0,Text,Label,Class Name
0,WHAT car is this!?\n\n I was wondering if anyo...,7,rec.autos
1,SI Clock Poll - Final Call\n\nA fair number of...,4,comp.sys.mac.hardware
2,"PB questions...\n\nwell folks, my mac plus fin...",4,comp.sys.mac.hardware
3,Re: Weitek P9000 ?\n\nRobert J.C. Kyanko () wr...,1,comp.graphics
4,Re: Shuttle Launch Question\n\nFrom article <>...,14,sci.space


In [20]:
def sample_data(df, num_samples, classes_to_keep, random_state=None):
    """Draw a class-balanced sample and keep only the matching classes.

    Args:
        df: Frame with at least the columns "Label" and "Class Name".
        num_samples: Number of rows to draw, without replacement, per "Label".
        classes_to_keep: Pattern handed to ``Series.str.contains``; rows whose
            "Class Name" matches it are kept.
        random_state: Optional seed (or random state) forwarded to the sampler
            so the draw can be reproduced. The default ``None`` keeps the
            previous, unseeded behavior.

    Returns:
        A new frame (the input is not modified) holding the kept rows, with
        "Class Name" converted to a categorical and an added "Encoded Label"
        column containing the category codes. The index keeps the positions
        from the balanced sample; it is not renumbered after filtering.
    """
    # Sample rows, selecting num_samples of each Label.
    sampled = (
        df.groupby("Label")
        .sample(n=num_samples, random_state=random_state)
        .reset_index(drop=True)
    )

    # Take an explicit copy of the filtered rows: adding columns to a slice of
    # another frame is what triggers pandas' chained-assignment warning.
    kept = sampled[sampled["Class Name"].str.contains(classes_to_keep)].copy()

    # We have fewer categories now, so re-calibrate the label encoding.
    kept["Class Name"] = kept["Class Name"].astype("category")
    kept["Encoded Label"] = kept["Class Name"].cat.codes

    return kept

In [21]:
# Rows drawn per class for each split.
TRAIN_NUM_SAMPLES = 100
TEST_NUM_SAMPLES = 25
CLASSES_TO_KEEP = "sci"  # Class name should contain 'sci' to keep science categories

# NOTE: both names are rebound to their sampled versions, so re-running this cell
# samples again from the already-filtered frames rather than from the full data.
df_train = sample_data(df_train, TRAIN_NUM_SAMPLES, CLASSES_TO_KEEP)
df_test = sample_data(df_test, TEST_NUM_SAMPLES, CLASSES_TO_KEEP)

In [22]:
# Check the class balance of the sampled training frame.
df_train.value_counts("Class Name")

Class Name
sci.crypt          100
sci.electronics    100
sci.med            100
sci.space          100
Name: count, dtype: int64

In [23]:
# Check the class balance of the sampled test frame.
df_test.value_counts("Class Name")

Class Name
sci.crypt          25
sci.electronics    25
sci.med            25
sci.space          25
Name: count, dtype: int64

In [24]:
from tqdm.auto import tqdm

# Enable the `progress_apply` method that create_embeddings calls on a pandas Series.
tqdm.pandas()

from google.api_core import retry


@retry.Retry(timeout=300.0)
def embed_fn(text: str) -> list[float]:
    """Embed a single text with the ``models/text-embedding-004`` model.

    The call is wrapped in ``retry.Retry`` with a 300 second timeout.

    Args:
        text: The text to embed.

    Returns:
        The value stored under the "embedding" key of the API response.
    """
    # You will be performing classification, so set task_type accordingly.
    return genai.embed_content(
        model="models/text-embedding-004",
        content=text,
        task_type="classification",
    )["embedding"]


def create_embeddings(df):
    """Return a copy of ``df`` with an added "Embeddings" column.

    Every value of the "Text" column is passed to ``embed_fn`` through
    ``progress_apply``, so a progress bar is shown while the rows are embedded.

    The input frame is left untouched: the column is attached with
    ``DataFrame.assign``, which returns a new frame. This avoids silently
    mutating the caller's data and avoids assigning into a frame that may be a
    filtered slice of another one.

    Args:
        df: Frame with a "Text" column.

    Returns:
        A new frame with the same rows and columns plus "Embeddings".
    """
    return df.assign(Embeddings=df["Text"].progress_apply(embed_fn))

In [25]:
# Embed every text in both splits. Each row results in one embed_fn call, so this
# is a slow cell (see the progress bars in its output).
df_train = create_embeddings(df_train)
df_test = create_embeddings(df_test)

100%|██████████| 400/400 [02:06<00:00,  3.15it/s]
100%|██████████| 100/100 [00:31<00:00,  3.19it/s]


In [31]:
# Show full cell contents instead of truncating long values. This changes the
# pandas display option globally, i.e. for every later cell as well.
pd.set_option('display.max_colwidth', None)

# Inspect two rows, including the new "Embeddings" column.
df_train.head(2)

Unnamed: 0,Text,Label,Class Name,Encoded Label,Embeddings
1100,"Re: Once tapped, your code is no good any more.\n\nIn article <> (douglas craig holland) writes:\n>\tWith E-Mail, if they can't break your PGP encryption, they'll just\n>call up one of their TEMPEST trucks and read the electromagnetic emmisions\n>from your computer or terminal. Note that measures to protect yourself from\n>TEMPEST surveillance are still classified, as far as I know.\n\nNote that TEMPEST is the name of the shielding standard. TEMPEST is not\nthe name of the surveillance technique.\n\nKen Shirriff\t\t\t\t\nDisclaimer: this is what I've heard and it's in the sci.crypt FAQ, so it's\nprobably true but I can't guarantee it. I'd like to know if I'm wrong.\n",11,sci.crypt,0,"[-0.0043086354, 0.014946195, -0.04718869, 0.038918644, 0.026516398, 0.07706348, 0.10642429, 0.049502425, -0.013975333, -0.018014356, 0.062174253, 0.05050502, 0.023072755, 0.033149973, 0.054371137, -0.0548496, 0.08523746, 0.047518972, -0.0003643629, -0.050271448, -0.025574224, 0.019547164, -0.009532954, 0.0051346673, -0.010429473, -0.0008891799, 0.0012947439, -0.0246474, 0.0068962136, -0.017097741, 0.03163274, 0.015851222, -0.018555803, -0.0083137825, 0.051839203, -0.0059056072, 0.0038899009, 0.021635285, 0.0025033378, -0.06797125, -0.047640763, -0.0012215399, -0.043856185, 0.06743114, -0.042062163, -0.026372904, -0.03635169, -0.058605645, -0.07490101, 0.01896286, -0.023904743, 0.032075237, -0.0017136263, 0.039858103, -0.008706433, -0.041990276, 0.019131372, -0.07795157, 0.0073262397, -0.06462448, -0.0042621507, -0.015266651, 0.03303322, -0.018320683, -0.010633533, -0.024478858, -0.02866816, 0.025612757, -0.05977793, -0.008514277, 0.01163799, 0.06699485, -0.041104812, 0.028408406, 0.02129292, -0.008280964, 0.0869091, -0.01033378, 0.044564925, 0.04092834, -0.034560334, -0.03273403, 0.041548397, -0.007906608, -0.008581105, -0.05207929, 0.029798444, 0.024593176, -0.07719749, -0.051054936, 0.07416974, 0.01987089, -0.030529829, 0.012194375, 0.06359984, -0.009392929, 0.01087187, 
-0.06341449, 0.04224102, 0.022851307, ...]"
1101,"DES: init vector as additional key?\n\nThe recent discussion in this news group suggests that a key search attack \nagainst DES is quite feasible now. But normally DES is applied in CBC or CFB \nmode where one chooses a random init vector of 8 bytes. Questions:\n\n - Makes it sense to handle the init vector as an additional key? Then we have \na 56 + 64 = 120 bit key.\n \n - If yes: Is anything known about the security of this key scheme? Can we \nbreak it faster than by exhaustive search through the 120 bit key space?\n\n--\nKlaus Pommerening\nInstitut fuer Medizinische Statistik und Dokumentation\nder Johannes-Gutenberg-Universitaet\nObere Zahlbacher Strasse 69, W-6500 Mainz, Germany\n",11,sci.crypt,0,"[0.0052515334, -0.024516884, -0.022230675, 0.03033998, 0.0034571097, 0.06945318, 0.07900606, 0.017766396, -0.011564752, -0.0055645434, 0.0800191, 0.028026406, 0.011098125, 0.003142284, 0.058238015, -0.06528971, 0.10194961, 0.025307294, 0.010203706, -0.039929863, -0.040476304, 0.0071391375, -0.0066721896, 0.024076223, -0.011510883, 0.014300542, 0.0074101, -0.034654137, 0.020016504, -0.011501476, 0.01991005, 0.030683937, -0.008233792, -0.032788854, 0.025400637, -0.016371952, 0.02223255, 0.033161785, -0.011772858, -0.04757878, -0.05254649, -0.008974971, -0.023268113, 0.032900754, -0.032418188, -0.03576506, -0.0005484355, -0.022237595, -0.074608274, 0.011349422, -0.013468872, 0.0011192947, -0.030095484, 0.024018854, 0.012187186, -0.010653717, 0.030060446, -0.019402646, 0.01535876, -0.066677265, 0.008329106, -0.013744917, 0.012467578, -0.012481877, 0.010764515, -0.023325682, -0.020720158, 0.010331271, -0.05240565, 0.0018682295, 0.05617261, 0.09089013, -0.027483238, 0.051388618, 0.0043779816, 0.02593068, 0.08645078, 0.0036815861, 0.054931894, 0.054556802, -0.029818267, -0.05374424, 0.05202867, 0.012696309, 0.01351062, -0.024774952, 0.013146285, -0.00841141, -0.053881302, -0.0060386676, 0.07039838, 0.057442598, -0.032227162, 0.036698375, 0.05217372, -0.013699033, 
0.017393427, -0.11009749, 0.06170662, 0.019858098, ...]"


In [34]:
import keras
from keras import layers


def build_classification_model(input_size: int, num_classes: int) -> keras.Model:
    """Build a small feed-forward classifier that takes embeddings as input.

    Args:
        input_size: Length of each input embedding vector; also used as the
            width of the hidden layer.
        num_classes: Number of output units of the softmax layer.

    Returns:
        An uncompiled ``keras.Sequential`` model: input -> Dense(relu) ->
        Dense(softmax).
    """
    embedding_inputs = layers.Input([input_size], name="embedding_inputs")
    hidden = layers.Dense(input_size, activation="relu", name="hidden")
    output_probs = layers.Dense(num_classes, activation="softmax", name="output_probs")

    return keras.Sequential([embedding_inputs, hidden, output_probs])

In [35]:
# Derive the embedding size from observing the data. The embedding size can also be specified
# with the `output_dimensionality` parameter to `embed_content` if you need to reduce it.
embedding_size = len(df_train["Embeddings"].iloc[0])

# One output unit per distinct class name present in the training frame.
classifier = build_classification_model(
    embedding_size, len(df_train["Class Name"].unique())
)
classifier.summary()

# Sparse categorical cross-entropy is used with the integer "Encoded Label"
# targets passed to fit() in the training cell.
classifier.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

### Train the model


In [36]:
import numpy as np

NUM_EPOCHS = 20
BATCH_SIZE = 32

# Split the x and y components of the train and validation subsets.
# np.stack turns the column of per-row embedding lists into one 2-D array.
# NOTE: the validation arrays are built from df_test, i.e. the test split
# doubles as validation data in this notebook.
y_train = df_train["Encoded Label"]
x_train = np.stack(df_train["Embeddings"])
y_val = df_test["Encoded Label"]
x_val = np.stack(df_test["Embeddings"])

# Specify that it's OK to stop early if accuracy stabilises.
# NOTE(review): monitor="accuracy" is the training metric, not "val_accuracy" --
# confirm that stopping on the training metric is intended.
early_stop = keras.callbacks.EarlyStopping(monitor="accuracy", patience=3)

# Train the model for the desired number of epochs.
history = classifier.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    callbacks=[early_stop],
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
)

Epoch 1/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.3341 - loss: 1.3755 - val_accuracy: 0.3600 - val_loss: 1.2940
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5885 - loss: 1.2275 - val_accuracy: 0.6200 - val_loss: 1.1684
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8309 - loss: 1.0611 - val_accuracy: 0.8300 - val_loss: 1.0125
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9623 - loss: 0.8723 - val_accuracy: 0.8400 - val_loss: 0.8535
Epoch 5/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9136 - loss: 0.7155 - val_accuracy: 0.8800 - val_loss: 0.7204
Epoch 6/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9376 - loss: 0.5405 - val_accuracy: 0.8700 - val_loss: 0.6458
Epoch 7/20
[1m13/13[0m [32m━━━━

### Evaluate model performance

Use Keras <a href="https://www.tensorflow.org/api_docs/python/tf/keras/Model#evaluate"><code>Model.evaluate</code></a> to calculate the loss and accuracy on the test dataset.

In [37]:
# return_dict=True yields the metrics as a {name: value} dict (see the output below).
classifier.evaluate(x=x_val, y=y_val, return_dict=True)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9283 - loss: 0.2809


{'accuracy': 0.9300000071525574, 'loss': 0.3079419732093811}

## Try a custom prediction

Now that you have a trained model with good evaluation metrics, you can try to make a prediction with new, hand-written data. Use the provided example or try your own data to see how the model performs.

In [38]:
# This example avoids any space-specific terminology to see if the model avoids
# biases towards specific jargon.
new_text = """
First-timer looking to get out of here.

Hi, I'm writing about my interest in travelling to the outer limits!

What kind of craft can I buy? What is easiest to access from this 3rd rock?

Let me know how to do that please.
"""
# Embed the text with the same function that produced the training embeddings.
# Note that this text is passed as-is, without the newsgroup preprocessing.
embedded = embed_fn(new_text)
print(embedded)

[0.019432709, -0.0134165445, -0.038569827, 0.027099848, 0.030647298, 0.0552539, 0.089664415, 0.040378496, -0.010588484, 0.009992522, 0.031105185, 0.041578043, 0.029177554, -0.0058882097, 0.049694806, -0.04701471, 0.08547439, 0.06041958, -0.041273843, -0.02578345, -0.0055946195, 0.018798409, -0.0111317085, 0.011139821, -0.0051660193, 0.030108875, 0.04181174, -0.012075451, 0.0108866645, -0.015594199, 0.017580828, 0.04662013, 0.04908052, -0.016169475, 0.051437464, 0.007593721, -0.0024948507, -0.010638129, 0.027199747, -0.07219817, -0.05390348, 0.022033947, -0.0032580693, 0.03200955, -0.04087985, -0.049519826, -0.05676126, -0.02587544, -0.06374949, 0.010307776, 0.0033229957, 0.057755854, -0.047802165, 0.0480191, 0.0029020114, -0.020711409, -0.0018620442, -0.038993567, 0.037544888, -0.093531534, -0.005849261, -0.024610138, 0.03038399, -0.010527783, -0.013461092, -0.053257346, -0.031193351, 0.03940577, -0.038611542, -0.0012961501, 0.002042963, 0.08961567, -0.04722155, 0.05110101, 0.025155822

In [39]:
# Remember that the model takes embeddings as input, and the input must be batched,
# so here they are passed as a list to provide a batch of 1.
inp = np.array([embedded])
[result] = classifier.predict(inp)

# The classifier was fitted on df_train's "Encoded Label" codes, so output position
# idx corresponds to the idx-th category of the *training* frame. Reading the names
# from df_train keeps the printed labels correct even if the test split's categories
# ever differ from the training ones.
for idx, category in enumerate(df_train["Class Name"].cat.categories):
    print(f"{category}: {result[idx] * 100:0.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
sci.crypt: 0.01%
sci.electronics: 0.44%
sci.med: 0.09%
sci.space: 99.47%
