# Introduction to LLMs Notebook

To use this notebook, you will need to install the following packages:

```bash
!pip install pandas
!pip install tqdm
!pip install altair
!pip install scikit-learn
!pip install ollama
!pip install rich
```

## Install and Import Libraries

In [4]:
# Importing libraries
from rich.console import Console
import pandas as pd
from tqdm import tqdm
import altair as alt
import ollama
import ast
import pandas as pd
import time
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re
import warnings
# Silence every warning category for the rest of the session so cell output stays compact.
warnings.filterwarnings(action='ignore')

# Shared rich console used by later cells for styled printing.
console = Console()

## Load Datasets

You have the option of either using the premade dataset available in Google Drive (in which case you will need to change the path to the file) or running the code below to remake the dataset from scratch.

**Be warned, this file is quite large because of the size of the novels, so you may want to use a subset of the novels to test this code.**

### Google Drive Dataset

You can download this dataset here: [https://drive.google.com/file/d/1LkaRtYph_lWtMPRyzZpECuzEMD3WPx26/view?usp=sharing](https://drive.google.com/file/d/1LkaRtYph_lWtMPRyzZpECuzEMD3WPx26/view?usp=sharing). It is very large, so make sure you don't push it to GitHub.


In [2]:
# Location of the premade dataset CSV. Edit this single constant to point at
# wherever the downloaded file was saved.
COMBINED_NOVELS_CSV_PATH = "combined_novels_nyt_with_text.csv"

# Read the dataset into memory.
combined_novels_nyt_df = pd.read_csv(COMBINED_NOVELS_CSV_PATH)

In [3]:
# Keep only the novels that have English text and a genre other than "na",
# restricted to the columns used in the rest of the notebook.
#
# Rows and columns are selected in a single .loc call and the result is copied,
# so combined_novels_nyt_genre_df is an independent DataFrame. The column
# assignments below (and in later cells) then write to it directly instead of
# to a slice of combined_novels_nyt_df.
combined_novels_nyt_genre_df = combined_novels_nyt_df.loc[
    combined_novels_nyt_df['cleaned_pg_eng_text'].notna() & (combined_novels_nyt_df['genre'] != "na"),
    ['top_500_rank', 'title', 'author', 'pub_year', 'orig_lang', 'genre',
     'author_birth', 'author_death', 'author_gender', 'author_primary_lang',
     'author_nationality', 'author_field_of_activity', 'author_occupation',
     'oclc_holdings', 'oclc_eholdings', 'oclc_total_editions',
     'oclc_holdings_rank', 'oclc_editions_rank', 'gr_avg_rating',
     'gr_num_ratings', 'gr_num_reviews', 'gr_avg_rating_rank',
     'gr_num_ratings_rank', 'oclc_owi', 'author_viaf', 'gr_url', 'wiki_url',
     'pg_eng_url', 'pg_orig_url', 'year', 'week', 'rank', 'title_id',
     'nyt_title', 'pub_year_date', 'pub_date', 'pg_eng_text_len',
     'pg_orig_text_len', 'pg_eng_token_len', 'pg_orig_token_len',
     'cleaned_pg_eng_text', 'cleaned_pg_orig_text'],
].copy()

# Character counts of the cleaned English and original-language texts.
# NOTE(review): 'cleaned_orig_eng_text_len' holds the length of
# 'cleaned_pg_orig_text'; the column name is kept as-is in case other code
# refers to it.
combined_novels_nyt_genre_df['cleaned_pg_eng_text_len'] = combined_novels_nyt_genre_df['cleaned_pg_eng_text'].str.len()
combined_novels_nyt_genre_df['cleaned_orig_eng_text_len'] = combined_novels_nyt_genre_df['cleaned_pg_orig_text'].str.len()

In [5]:
def clean_text(text):
    """
    Strip image references, eBook boilerplate phrases, escaped character
    sequences, redundant whitespace, and all-caps lines from a text.

    Parameters
    ----------
    text : str or bytes
        The raw text to clean. Bytes are decoded as UTF-8, ignoring
        undecodable bytes.

    Returns
    -------
    str
        The cleaned text.
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8', errors='ignore')

    # Patterns deleted outright, applied in this order, each with its regex flags.
    deletions = (
        # Image references with a size suffix, e.g. "p003.jpg (307K)"
        (r'\b\w+\.jpg\b\s*\(\d+K\)', 0),
        # Bare image references, e.g. "bookcover.jpg"
        (r'\b\w+\.jpg\b', 0),
        # The frequently repeated "Full Size" label
        (r'\bFull Size\b', re.IGNORECASE),
        # Editor-note / eBook edition phrases
        (r'(Ebook Editor’s Note|Project Gutenberg edition)', re.IGNORECASE),
        # Escaped byte sequences written out as text, e.g. \xe2
        (r'\\x[0-9A-Fa-f]{2}', 0),
        # A backslash followed by a single ASCII letter
        (r'\\[a-zA-Z]', 0),
    )
    for pattern, flags in deletions:
        text = re.sub(pattern, '', text, flags=flags)

    # Collapse runs of newlines to a single newline, then trim both ends.
    text = re.sub(r'\n+', '\n', text)
    text = text.strip()

    # Any run of two or more whitespace characters becomes one space.
    text = re.sub(r'\s{2,}', ' ', text)

    # Blank out lines consisting only of capital letters and whitespace
    # (three or more characters), such as chapter headings.
    return re.sub(r'^[A-Z\s]{3,}$', '', text, flags=re.MULTILINE)

# Enable Series.progress_apply with a labelled progress bar.
tqdm.pandas(desc="Cleaning Text")

# For the English text and then the original-language text: replace missing
# values with an empty string, then store the cleaned version in a new column.
for raw_column, cleaned_column in (
    ('cleaned_pg_eng_text', 'full_cleaned_pg_eng_text'),
    ('cleaned_pg_orig_text', 'full_cleaned_pg_orig_text'),
):
    combined_novels_nyt_genre_df[raw_column] = combined_novels_nyt_genre_df[raw_column].fillna('')
    combined_novels_nyt_genre_df[cleaned_column] = combined_novels_nyt_genre_df[raw_column].progress_apply(clean_text)

Cleaning Text: 100%|██████████| 87/87 [00:11<00:00,  7.74it/s]
Cleaning Text: 100%|██████████| 87/87 [00:02<00:00, 31.01it/s]


In [6]:
# Number of novels per genre in the filtered dataset.
combined_novels_nyt_genre_df['genre'].value_counts()

history       24
romance       12
action        10
bildung       10
fantasy        7
political      7
scifi          5
mystery        4
allegories     3
autobio        3
horror         2
Name: genre, dtype: int64

In [7]:
# TF-IDF per novel: English stop words removed, and terms that occur in more
# than 70% of the documents ignored.
vectorizer = TfidfVectorizer(stop_words="english", min_df=1, max_df=0.7)

# Learn the vocabulary and weight every novel's cleaned English text
tfidf_matrix = vectorizer.fit_transform(
    combined_novels_nyt_genre_df['full_cleaned_pg_eng_text'].fillna('')
)

# Dense documents-by-terms table, one column per vocabulary term
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Label each row with the title of its novel
tfidf_df['title'] = combined_novels_nyt_genre_df['title'].to_numpy()

# Reshape to long format: one (title, term, score) row per table cell
melted_tfidf_df = pd.melt(tfidf_df, id_vars=['title'], var_name='term', value_name='score')

# Highest-scoring (title, term) pairs first
sorted_tfidf_no_stopwords_min_max_df = melted_tfidf_df.sort_values('score', ascending=False)

# Show the ten highest-scoring pairs
sorted_tfidf_no_stopwords_min_max_df.head(10)

Unnamed: 0,title,term,score
15942353,The Jungle,jurgis,0.949413
1660887,This Side of Paradise,amory,0.938376
1295317,"Through the Looking-Glass, and What Alice Foun...",alice,0.919699
1295257,Alice's Adventures in Wonderland,alice,0.911474
22207592,Of Human Bondage,philip,0.895389
10050016,White Fang,fang,0.878559
31973533,The Secret Agent,verloc,0.878165
6298634,The Pilgrim's Progress,chr,0.867819
15925817,The Red & the Black,julien,0.857932
1253008,Kidnapped: The Adventures of David Balfour,alan,0.834603


In [8]:

# Build one document per genre by joining the cleaned English texts of its
# novels with single spaces.
grouped_texts = (
    combined_novels_nyt_genre_df
    .groupby('genre')['full_cleaned_pg_eng_text']
    .apply(' '.join)
    .reset_index()
)

# Same vectorizer settings as the per-novel analysis
vectorizer = TfidfVectorizer(stop_words="english", min_df=1, max_df=0.7)

# Learn the vocabulary and weight each genre-level document
tfidf_matrix = vectorizer.fit_transform(
    grouped_texts['full_cleaned_pg_eng_text'].fillna('')
)

# Dense genres-by-terms table, one column per vocabulary term
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Label each row with its genre
tfidf_df['genre'] = grouped_texts['genre'].to_numpy()

# Reshape to long format: one (genre, term, score) row per table cell
melted_tfidf_df = pd.melt(tfidf_df, id_vars=['genre'], var_name='term', value_name='score')

# Highest-scoring (genre, term) pairs first
sorted_tfidf_no_stopwords_min_max_df = melted_tfidf_df.sort_values('score', ascending=False)

# Show the ten highest-scoring pairs
sorted_tfidf_no_stopwords_min_max_df.head(10)

Unnamed: 0,genre,term,score
3403765,autobio,swann,0.688831
1469898,allegories,grandet,0.628138
160439,fantasy,alice,0.611279
2712973,scifi,pencroft,0.59745
780566,horror,christine,0.594813
3043260,action,sancho,0.552047
2481151,autobio,odette,0.505621
2879536,action,quixote,0.493032
1962606,political,jurgis,0.480073
4045793,fantasy,wendy,0.450921


In [13]:
# The frame is already sorted by descending score, so the first ten rows of
# each genre group are that genre's ten highest-scoring terms.
top_terms_by_genre = sorted_tfidf_no_stopwords_min_max_df.groupby('genre').head(10)

# Report, genre by genre, how many novels it contains and its top terms
for genre, group in top_terms_by_genre.groupby('genre'):
    novels_in_genre = combined_novels_nyt_genre_df.loc[combined_novels_nyt_genre_df['genre'] == genre]
    console.print(
        f"In our dataset we have this many novels {len(novels_in_genre)} for genre: {genre}",
        style="bright_magenta",
    )
    console.print(f"Top 10 terms for genre: {genre}", style="bright_magenta")
    console.print(group.loc[:, ['term', 'score']])
    console.print("\n")

In [11]:
# Binary classification: can TF-IDF features separate action from romance novels?

# Keep only the action and romance novels
filtered_df = combined_novels_nyt_genre_df[combined_novels_nyt_genre_df['genre'].isin(['action', 'romance'])]

# Vectorize the text data.
# NOTE(review): the vectorizer is fitted on all filtered novels before the
# train/test split, so the vocabulary and IDF weights are learned from the
# test documents too.
vectorizer = TfidfVectorizer(stop_words="english", min_df=1, max_df=0.7)
X = vectorizer.fit_transform(filtered_df['full_cleaned_pg_eng_text'].fillna(''))

# Encode the genres as binary labels: action -> 1, romance -> 0
y = (filtered_df['genre'] == 'action').astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a logistic regression on the training split
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Print classification report.
# labels=[0, 1] pins the report rows to romance (0) and action (1). Without it,
# a test split that happens to contain only one genre makes the number of
# classes disagree with target_names and the call raises. That is a real risk
# with so few novels, or when running on a subset.
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['romance', 'action']))

# One coefficient per vocabulary term. Positive pushes a prediction towards
# action (1), negative towards romance (0).
feature_names = vectorizer.get_feature_names_out()
coefficients = log_reg.coef_.flatten()
coeff_df = pd.DataFrame({'term': feature_names, 'coefficient': coefficients})

# Sort by absolute value of coefficients to get the most distinctive terms
coeff_df['abs_coefficient'] = coeff_df['coefficient'].abs()
sorted_coeff_df = coeff_df.sort_values(by='abs_coefficient', ascending=False)

# Display the top 10 most distinctive terms
console.print("Top 10 most distinctive terms for action vs romance:")
console.print(sorted_coeff_df.head(10))

# Display the top 10 terms for action
console.print("\nTop 10 terms for action:")
console.print(sorted_coeff_df[sorted_coeff_df['coefficient'] > 0].head(10))

# Display the top 10 terms for romance
console.print("\nTop 10 terms for romance:")
console.print(sorted_coeff_df[sorted_coeff_df['coefficient'] < 0].head(10))

              precision    recall  f1-score   support

     romance       0.75      1.00      0.86         3
      action       1.00      0.50      0.67         2

    accuracy                           0.80         5
   macro avg       0.88      0.75      0.76         5
weighted avg       0.85      0.80      0.78         5



In [12]:


# Multi-class genre classification over every novel. Relies on
# combined_novels_nyt_genre_df already holding the 'full_cleaned_pg_eng_text'
# and 'genre' columns prepared in earlier cells.

# TF-IDF features from the cleaned English texts
vectorizer = TfidfVectorizer(stop_words="english", min_df=1, max_df=0.7)
X = vectorizer.fit_transform(
    combined_novels_nyt_genre_df.full_cleaned_pg_eng_text.fillna('')
)

# The genre strings themselves serve as class labels
y = combined_novels_nyt_genre_df.genre

# Hold out 20% of the novels for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a 100-tree random forest on the training split
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluate on the held-out novels
y_pred = rf_clf.predict(X_test)
console.print(classification_report(y_test, y_pred))

# Pair every vocabulary term with its feature importance from the forest
feature_names = vectorizer.get_feature_names_out()
importances = rf_clf.feature_importances_
importance_df = pd.DataFrame({'term': feature_names, 'importance': importances})

# Most important terms first
sorted_importance_df = importance_df.sort_values('importance', ascending=False)

# Show the ten most important terms
console.print("Top 10 most distinctive terms:")
console.print(sorted_importance_df.head(10))

In [15]:
# Load the periodical MARC records and report how many were read.
marc_records_df = pd.read_csv("merged_preidentified_periodicals.csv")
console.print(f"We have this many marc records {len(marc_records_df)}")

In [19]:
def _parse_llm_payload(message: str, key: str, default: str):
    """
    Extract one field from an LLM reply that should be a single JSON object.

    The reply is first parsed as real JSON (double quotes, null/true/false).
    If that fails it is parsed as a Python literal, which covers the
    single-quoted form shown in the system prompts. A surrounding Markdown
    code fence is removed before parsing.

    Parameters
    ----------
    message : str
        Raw text content of the model reply.
    key : str
        Field to read from the parsed object.
    default : str
        Value returned when the object does not contain ``key``.

    Returns
    -------
    object
        The value stored under ``key``, or ``default``.

    Raises
    ------
    ValueError, SyntaxError
        If the reply cannot be parsed, or parses to something other than a dict.
    """
    text = message.strip()
    fenced = re.match(r'^```(?:json)?\s*(.*?)\s*```$', text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1)
    try:
        payload = json.loads(text)
    except json.JSONDecodeError:
        payload = ast.literal_eval(text)
    # A list or bare string has no .get(); report it as a parse failure
    # rather than letting an AttributeError abort the whole record.
    if not isinstance(payload, dict):
        raise ValueError(f"Expected a JSON object, got {type(payload).__name__}")
    return payload.get(key, default)


def _ask_llama(model: str, system_prompt: str, user_prompt: str):
    """Send one system + user message pair to the Ollama chat API and return the raw response."""
    return ollama.chat(
        model=model,
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': user_prompt},
        ],
    )


def process_marc_record(row: pd.Series, model: str = 'llama3.2') -> pd.Series:
    """
    Function to process a MARC record row: ask the model what it knows about the periodical,
    generate a human-readable description, and classify the periodical.

    Parameters
    ----------
    row : pd.Series
        A row of a dataframe with MARC record fields. Must contain 'periodical_name' and
        'publication_type'. The row is not modified; a copy is returned.
    model : str, optional
        Name of the Ollama model used for all three requests. Defaults to 'llama3.2'.

    Returns
    -------
    pd.Series
        A copy of the row with 'knowledge', 'human_readable_description' and 'classification'
        added. A reply that cannot be parsed is recorded as "No valid JSON returned." for that
        field. On any other error the description and classification are set to None.
    """
    # DataFrame.apply does not support functions that mutate the row it passes in.
    row = row.copy()
    time.sleep(1)  # Avoid hitting API rate limits
    try:
        console.print(f"Processing periodical {row['periodical_name']}")

        knowledge_check_content = (
            f"Based on this periodical name: {row['periodical_name']}, what do you know about this publication? "
        )
        console.print(f"Knowledge check content: {knowledge_check_content}", style="bold green")

        # Step 1: knowledge check based on the periodical name alone
        knowledge_check_response = _ask_llama(
            model,
            "You are a librarian with knowledge of periodicals. Return your answer strictly in JSON format like this: {'knowledge': 'Your knowledge here'}. Do not include any other text or explanations outside the JSON format.",
            knowledge_check_content,
        )
        console.print(f"Knowledge check response: {knowledge_check_response}", style="bold green")
        knowledge_message = knowledge_check_response['message']['content']
        try:
            row['knowledge'] = _parse_llm_payload(knowledge_message, 'knowledge', 'No knowledge available')
            console.print(f"Knowledge: {row['knowledge']}", style="bold green")
        except (ValueError, SyntaxError):
            console.print("Error parsing JSON response for knowledge check.", style="bold red")
            row['knowledge'] = "No valid JSON returned."

        # Compile MARC record fields into a structured text for prompting.
        # 'publication_type' is left out because it is the label printed as the
        # "actual classification" below.
        # NOTE(review): 'knowledge' was just added to the row, so the model's own
        # answer is included in this text. Confirm that is intended.
        marc_fields = '\n'.join(
            f"{field}: {value}"
            for field, value in row.items()
            if pd.notna(value) and field != "publication_type"
        )

        # Step 2: human-readable description of the record
        description_content = (
            f"Here is combined data from multiple MARC records and HathiTrust data:\n{marc_fields}\nPlease create a brief human-readable description "
            f"including title, author, publication date, and subjects."
        )
        console.print(f"Description content: {description_content}", style="bold green")

        description_response = _ask_llama(
            model,
            "You are a librarian skilled in converting metadata from MARC files into user-friendly summaries. Please return your answer in strict JSON format like this: {'description': 'Your description here'}. Do not include any other text or explanations outside the JSON format.",
            description_content,
        )
        console.print(f"Description response: {description_response}", style="bold green")
        description_message = description_response['message']['content']
        try:
            row['human_readable_description'] = _parse_llm_payload(
                description_message, 'description', 'No description available'
            )
            console.print(f"Description: {row['human_readable_description']}", style="bold green")
        except (ValueError, SyntaxError):
            console.print("Error parsing JSON response for description.", style="bold red")
            row['human_readable_description'] = "No valid JSON returned."

        # Step 3: genre classification based on the generated description
        classification_content = (
            f"Based on this description, classify the periodical into a genre: Information Bulletin, Radical Periodical, News & Politics Magazine.\n"
            f"Description: {row['human_readable_description']}"
        )

        classification_response = _ask_llama(
            model,
            "You are a classification expert who assigns genres to periodicals. "
            "Return your answer strictly in JSON format like this: {'classification': 'Your classification here'}. "
            "Do not include any other text or explanations outside the JSON format.",
            classification_content,
        )
        classification_message = classification_response['message']['content']
        try:
            row['classification'] = _parse_llm_payload(classification_message, 'classification', 'Unclassified')
            console.print(f"Classification: {row['classification']}", style="bold green")
            console.print(f"Actual classification: {row['publication_type']}", style="bright_magenta")
            console.print("\n")
        except (ValueError, SyntaxError):
            console.print("Error parsing JSON response for classification.", style="bold red")
            row['classification'] = "No valid JSON returned."

    except Exception as e:
        console.print(f"Unexpected error: {e}", style="bold red")
        # Make sure every returned row carries the same three result fields.
        if 'knowledge' not in row:
            row['knowledge'] = None
        row['human_readable_description'] = None
        row['classification'] = None

    return row


# Run the three-step LLM pipeline on the first ten MARC records, one row at a time
marc_df = marc_records_df.iloc[:10].apply(process_marc_record, axis=1)