# Step 1: Install Required Libraries

In [None]:
pip install pandas scikit-learn nltk

# Step 2: Load and Explore the Dataset

In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('books.csv')

# Explore the dataset: preview the first rows, then summarize columns and dtypes.
print(df.head())
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Step 3: Clean the Dataset

Handle missing values and clean the text data.

In [None]:
# Drop rows with a missing title or description, then lowercase the
# descriptions. The cleaned frame is built as a single chained expression and
# rebound to `df` instead of mutating the existing object in place.
# Further cleaning (e.g. removing punctuation) could be added to this chain.
df = (
    df
    .dropna(subset=['title', 'description'])
    .assign(description=lambda frame: frame['description'].str.lower())
)

# Step 4: Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer that filters scikit-learn's built-in English stop
# words, then learn the vocabulary and encode every description in one pass.
vectorizer = TfidfVectorizer(stop_words='english')
descriptions = df['description']
X = vectorizer.fit_transform(descriptions)

# Step 5: K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

# Choose the number of clusters (genres)
num_clusters = 5  # You can adjust this based on your dataset

# n_init is set explicitly: scikit-learn changed its default from 10 to 'auto',
# so pinning it keeps the clustering consistent across library versions and
# avoids the FutureWarning emitted by the transitional releases.
# random_state fixes the centroid initialization for reproducibility.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

# Fit the model and store each book's assigned cluster label
kmeans.fit(X)
df['cluster'] = kmeans.labels_

# Step 6: Evaluate using Silhouette Coefficient

In [None]:
from sklearn.metrics import silhouette_score

# Mean silhouette coefficient over all samples, computed from the TF-IDF
# matrix and the labels assigned by the fitted K-Means model.
cluster_labels = kmeans.labels_
silhouette_avg = silhouette_score(X, cluster_labels)
print(f'Silhouette Coefficient: {silhouette_avg}')

In [None]:
# Tally how many books were assigned to each cluster and print the result
cluster_sizes = df['cluster'].value_counts()
print(cluster_sizes)

# Additional Step: Visualize Clusters (Optional)
To visualize the clusters, use PCA to project the TF-IDF vectors onto two dimensions and color each point by its assigned cluster.

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Dimensionality reduction to 2D for plotting.
# random_state is fixed because PCA's automatic solver selection may pick the
# randomized SVD solver for large inputs, which would otherwise let the
# projection vary from run to run.
# NOTE: X.toarray() materializes the full dense TF-IDF matrix, which can use a
# lot of memory for a large corpus or vocabulary.
pca = PCA(n_components=2, random_state=42)
X_embedded = pca.fit_transform(X.toarray())

# Plotting: one point per book, colored by its K-Means cluster label
fig, ax = plt.subplots()
points = ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=df['cluster'], cmap='viridis')
ax.set_title('Book Clusters')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('books.csv')

# Explore the dataset: preview the first rows, then summarize columns and dtypes.
print(df.head())
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Drop rows with a missing title or description, then lowercase the
# descriptions. The cleaned frame is built as a single chained expression and
# rebound to `df` instead of mutating the existing object in place.
df = (
    df
    .dropna(subset=['title', 'description'])
    .assign(description=lambda frame: frame['description'].str.lower())
)

# Define a custom list of stop words (optional)
custom_stop_words = ["book", "novel", "story", "read", "readers"] # Add more as needed

# Combine custom stop words with default English stop words.
# ENGLISH_STOP_WORDS.union(...) returns a frozenset, but TfidfVectorizer's
# stop_words parameter is documented as 'english', a list, or None, and recent
# scikit-learn releases reject other types. Convert the combined set to a list
# (sorted only to make the order deterministic).
from sklearn.feature_extraction import text
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(custom_stop_words))

# Vectorize the descriptions, using the stop words
vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(df['description'])

# Choose the number of clusters (genres)
num_clusters = 5  # You can adjust this based on your dataset

# n_init is set explicitly: scikit-learn changed its default from 10 to 'auto',
# so pinning it keeps the clustering consistent across library versions and
# avoids the FutureWarning emitted by the transitional releases.
# random_state fixes the centroid initialization for reproducibility.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

# Fit the model and store each book's assigned cluster label
kmeans.fit(X)
df['cluster'] = kmeans.labels_

# Mean silhouette coefficient over all samples, computed from the TF-IDF
# matrix and the labels assigned by the fitted K-Means model.
cluster_labels = kmeans.labels_
silhouette_avg = silhouette_score(X, cluster_labels)
print(f'Silhouette Coefficient: {silhouette_avg}')

# Tally how many books were assigned to each cluster and print the result
cluster_sizes = df['cluster'].value_counts()
print(cluster_sizes)

# Dimensionality reduction for visualization.
# random_state is fixed because PCA's automatic solver selection may pick the
# randomized SVD solver for large inputs, which would otherwise let the
# projection vary from run to run.
# NOTE: X.toarray() materializes the full dense TF-IDF matrix, which can use a
# lot of memory for a large corpus or vocabulary.
pca = PCA(n_components=2, random_state=42)
X_embedded = pca.fit_transform(X.toarray())

# Plotting the clusters: one point per book, colored by its K-Means label
fig, ax = plt.subplots()
points = ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=df['cluster'], cmap='viridis')
ax.set_title('Book Clusters')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()