# Demo 18

In [None]:
import pandas as pd
import sklearn
import numpy as np

import matplotlib.pyplot as plt

## Hyperparameter Tuning

### Dataset

We will use data from [Victorian Era Authorship Attribution Data Set](https://archive.ics.uci.edu/ml/datasets/Victorian+Era+Authorship+Attribution#)

**Instructions:**

- Open up a terminal
- `cd` into `data/`
- make a new directory for victorian author id
- wget https://archive.ics.uci.edu/ml/machine-learning-databases/00454/dataset.zip
- unzip the dataset

In [None]:
df = pd.read_csv("data/victorian_author_id/dataset/Gungor_2018_VictorianAuthorAttribution_data-train.csv")
df.shape

In [None]:
df.keys()

In [None]:
len(df['author'].unique())

In [None]:
df['author'].value_counts()

It's a good idea to check whether there are duplicates

In [None]:
df['text'].drop_duplicates().shape

Let's just look at the 5 most frequent authors

In [None]:
top4_authors = df['author'].value_counts()[:5].index
top4_authors

In [None]:
df[df['author'].map(lambda x: x in top4_authors)]

In [None]:
# Keep only the rows written by the most frequent authors selected above.
# Series.isin does the membership test in one vectorized pass instead of
# calling a Python lambda once per row.
df = df[df['author'].isin(top4_authors)]
df.shape

#### Partitioning dataset

Let's make train, dev, and test splits that are 80:10:10

First shuffle the dataset

In [None]:
df.sample(frac=1)

In [None]:
# Shuffle every row. The fixed random_state makes the train/dev/test split below
# (and therefore every accuracy reported later) reproducible across kernel restarts.
df = df.sample(frac=1, random_state=0)
df.head(5)

Now split our data into appropriate partitions for training and testing

In [None]:
train_max_idx = int(df.shape[0] * .8)
dev_max_idx = int((df.shape[0] * .1) + train_max_idx)


train_max_idx, dev_max_idx

In [None]:
train_df = df.iloc[:train_max_idx]
dev_df = df.iloc[train_max_idx:dev_max_idx]
test_df = df.iloc[dev_max_idx:]

train_df.shape, dev_df.shape, test_df.shape

### Train a model

In [None]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [None]:
%%time

vectorizer.fit(train_df['text'])

In [None]:
X_train = vectorizer.transform(train_df['text'])
X_dev = vectorizer.transform(dev_df['text'])
# X_test = vectorizer.transform(test_df['text'])

In [None]:
%%time 

nb_model.fit(X_train, train_df['author'])

In [None]:
nb_model.score(X_train, train_df['author'])

In [None]:
nb_model.score(X_dev, dev_df['author'])

#### Let's modify the value for smoothing

By default, Laplace (add-one) smoothing is used. Let's change the value and see what happens.

In [None]:
nb_model = MultinomialNB(alpha=10)
nb_model

In [None]:
%%time 

nb_model.fit(X_train, train_df['author'])

In [None]:
nb_model.score(X_train, train_df['author'])

In [None]:
nb_model.score(X_dev, dev_df['author'])

**Question:** Was add-10 smoothing better than add-one?

Let's look at add-two smoothing

In [None]:
nb_model = MultinomialNB(alpha=2)
nb_model.fit(X_train, train_df['author'])
nb_model.score(X_train, train_df['author']), nb_model.score(X_dev, dev_df['author'])

Let's look at no smoothing

In [None]:
nb_model = MultinomialNB(alpha=0)
nb_model.fit(X_train, train_df['author'])
nb_model.score(X_train, train_df['author']), nb_model.score(X_dev, dev_df['author'])

Let's loop through lots of values and see differences in accuracy

In [None]:
from tqdm import tqdm

# Sweep the smoothing strength over 0, 0.5, ..., 14.5 and record how accuracy on
# the training and dev partitions responds to each setting.
smoothing_values = np.arange(0, 15, 0.5)
train_accuracy, dev_accuracy = [], []

for smoothing_value in tqdm(smoothing_values):
    # fit() returns the estimator itself, so construction and training can be chained.
    nb_model = MultinomialNB(alpha=smoothing_value).fit(X_train, train_df['author'])
    train_accuracy += [nb_model.score(X_train, train_df['author'])]
    dev_accuracy += [nb_model.score(X_dev, dev_df['author'])]

# One row per alpha so the two accuracy curves can be tabulated and plotted.
results_df = pd.DataFrame(dict(alpha=smoothing_values,
                               train_accuracy=train_accuracy,
                               dev_accuracy=dev_accuracy))

In [None]:
results_df

In [None]:
results_df.plot(kind='line', x='alpha')

**Question:** From the above, which value of smoothing seemed to work the best?

In [None]:
results_df['dev_accuracy'].argmax()

So now we will use that value for the model we apply to our held-out test data

In [None]:
# idxmax() returns the index *label* of the row with the highest dev accuracy,
# which is what .loc expects. argmax() returns a position and only lines up with
# .loc while results_df keeps its default 0..n-1 index.
best_alpha = results_df.loc[results_df['dev_accuracy'].idxmax(), 'alpha']
best_alpha

In [None]:
nb_model = MultinomialNB(alpha=best_alpha)
nb_model.fit(X_train, train_df['author'])

X_test = vectorizer.transform(test_df['text'])

nb_model.score(X_test, test_df['author'])

(back to slides)

## Metrics beyond accuracy

In [None]:
import nltk

# NLTK's movie-review corpus. A review is labelled positive (True) when its
# file id starts with "pos", and negative (False) otherwise.
moview_reviews = nltk.corpus.movie_reviews
review_files = [(file_id, file_id.startswith("pos")) for file_id in moview_reviews.fileids()]

# Name the columns at construction time instead of renaming 0/1 afterwards.
df = pd.DataFrame(review_files, columns=["file_name", "gold-label"])


def read_mov_review(f_name):
    """Return the full text of the movie review stored under file id `f_name`."""
    # Close the corpus stream as soon as it has been read; the bare
    # open(...).read() left one unclosed handle per review.
    with moview_reviews.open(f_name) as review_file:
        return review_file.read()


df['review_text'] = df['file_name'].apply(read_mov_review)

In [None]:
df[df['gold-label'] == False].sample(1000 - 150).index

In [None]:
df.drop(index= df[df['gold-label'] == False].sample(1000 - 150).index)

In [None]:
# Drop a random 850 (= 1000 - 150) negative reviews to create a deliberately
# imbalanced dataset. The fixed seed keeps the same reviews across runs.
tmp_df = df.drop(index=df[~df['gold-label']].sample(1000 - 150, random_state=0).index)

In [None]:
# Shuffle before slicing into partitions; seeded so the split is reproducible.
tmp_df = tmp_df.sample(frac=1, random_state=0)

In [None]:
#tmp_df['drugName'].value_counts(normalize=True)
tmp_df['gold-label'].value_counts(normalize=True)

In [None]:
# Positional cut points for an 80:10:10 train/dev/test split of tmp_df.
train_max_idx = int(tmp_df.shape[0] * .8)
dev_max_idx = int((tmp_df.shape[0] * .1) + train_max_idx)


# NOTE: this bare expression is not the last line of the cell, so it displays nothing.
train_max_idx, dev_max_idx

# Slice by position: the first 80% of rows, the next 10%, and whatever remains.
train_df = tmp_df.iloc[:train_max_idx]
dev_df = tmp_df.iloc[train_max_idx:dev_max_idx]
test_df = tmp_df.iloc[dev_max_idx:]

# Displayed as the cell output: the (rows, columns) shape of each partition.
train_df.shape, dev_df.shape, test_df.shape

In [None]:
vectorizer = CountVectorizer(min_df=10)
vectorizer.fit(train_df['review_text'])

X_train = vectorizer.transform(train_df['review_text'])
X_dev = vectorizer.transform(dev_df['review_text'])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# max_iter is an iteration count, so pass an int. 1e3 is the float 1000.0, which
# recent scikit-learn releases reject during parameter validation.
clf = LogisticRegression(max_iter=1000, C=100)
#clf = MLPClassifier(hidden_layer_sizes=10)
clf.fit(X_train, train_df['gold-label'])

# (train accuracy, dev accuracy)
clf.score(X_train, train_df['gold-label']), clf.score(X_dev, dev_df['gold-label'])

**Question:** What might be misleading about the dev accuracy?

<details>
<summary>Hint</summary>
    Think about value_counts from above
</details>

In [None]:
dev_df['gold-label'].value_counts(normalize=True)

In [None]:
clf.predict(X_dev)

In [None]:
pd.Series(clf.predict(X_dev)).value_counts()

In [None]:
dev_df = dev_df.assign(prediction = clf.predict(X_dev))

In [None]:
dev_df

In [None]:
# scikit-learn metrics take (y_true, y_pred): gold labels first, predictions second.
# With this order, rows are the gold labels and columns are the predicted labels.
sklearn.metrics.confusion_matrix(dev_df['gold-label'], dev_df['prediction'])

In [None]:
# (y_true, y_pred) order matters here: with the arguments swapped this call
# would report recall instead of precision.
sklearn.metrics.precision_score(dev_df['gold-label'], dev_df['prediction'])

In [None]:
# (y_true, y_pred) order matters here: with the arguments swapped this call
# would report precision instead of recall.
sklearn.metrics.recall_score(dev_df['gold-label'], dev_df['prediction'])

In [None]:
# Gold labels first, predictions second, matching scikit-learn's (y_true, y_pred)
# convention. Binary F1 is symmetric, so the value is the same either way.
sklearn.metrics.f1_score(dev_df['gold-label'], dev_df['prediction'])

(back to slides)


## K-Means walkthrough

Example comes from https://stackoverflow.com/questions/65449241/plotting-the-kmeans-cluster-centers-for-every-iteration-in-python.

I'd recommend going through this [blog post](https://towardsdatascience.com/k-means-clustering-with-scikit-learn-6b47a369a83c)

## Dataset - Obits from HW02

Now let's look at using k-means to cluster documents

Load in data. This takes a little while.

In [None]:
df = pd.read_csv("data/tfidf_hw02.csv.gz", compression="gzip")
df.shape

In [None]:
df.index

In [None]:
df.head(5)

In [None]:
df.index = df['subject']
df

In [None]:
df = df.drop(columns=['subject'])
df

Let's store the dataframe in a new numpy array called X

In [None]:
X = df.to_numpy()
X.shape

## Clustering

Different clustering methods implemented in sklearn:
https://scikit-learn.org/stable/modules/clustering.html

In [None]:
from sklearn.cluster import KMeans

In [None]:
# K-means starts from random centers; fixing random_state keeps the cluster ids
# (and the per-cluster listings further down) stable from run to run.
kmeans_model = KMeans(n_clusters=10, random_state=0)

### Train Kmeans model

**Question:** What function do we think we can use to train the model?

<details>
<summary>Hint</summary>
    What function did we use yesterday to train the Naive Bayes and Logistic Regression classifiers
</details>

<details>
<summary>Solution</summary>
    .fit()
</details>

In [None]:
# skip below

In [None]:
%%time

kmeans_model.fit(X)

#### Properties of the kmeans_model 

##### Labels (cluster ID) for each example

In [None]:
kmeans_model.labels_

In [None]:
kmeans_model.labels_.shape

We can see how many documents were assigned to each of the 10 clusters

In [None]:
pd.Series(kmeans_model.labels_).value_counts()

##### Center for each cluster

In [None]:
kmeans_model.cluster_centers_

In [None]:
kmeans_model.cluster_centers_.shape

**Question** What do these numbers represent?

<details>
<summary>Solution</summary>
    The first number is the number of clusters; the second is the dimensionality of each cluster center. It is 35408 because that is the size of our vocabulary.
</details>

#### Determining the cluster for new examples


**Question:** Which function do we think we can use to have the model assign clusters to new examples?

<details>
<summary>Hint</summary>
    What function did we use yesterday to test the Naive Bayes and Logistic Regression classifiers
</details>

<details>
<summary>Solution</summary>
    .predict()
</details>


In [None]:
kmeans_model.predict(X)

In [None]:
# Predict from X, the array the model was fit on, not from df. This cell adds a
# 'cluster_id' column to df, so predicting from df would fail on a re-run because
# df would then have one more column than the model was trained with.
df['cluster_id'] = kmeans_model.predict(X)

In [None]:
# Use the feature matrix X: by this point df also holds the 'cluster_id' column,
# which is not one of the features the model was fit on.
kmeans_model.transform(X)

##### What about transform?


Read the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans.transform) and explain what transform does

In [None]:
kmeans_model.transform(X)

In [None]:
kmeans_model.transform(X).shape

In [None]:
distances = kmeans_model.transform(X)

In [None]:
distances.shape

Let's store the distances in a dataframe

In [None]:
distances_df = pd.DataFrame(distances)
distances_df.index = df.index
distances_df

### Analyze the clusters

In [None]:
kmeans_model.n_clusters

#### Print out all people in each cluster

In [None]:
for cluster_id in range(kmeans_model.n_clusters):
    print(cluster_id, df[df['cluster_id'] == cluster_id].index)
    print()

#### Print out all people in each cluster sorted by distance to cluster's center

In [None]:
cluster_id

In [None]:
names = df[df['cluster_id'] == cluster_id].index
names

In [None]:
distances_df[distances_df.index.isin(names)]

In [None]:
distances_df[distances_df.index.isin(names)].sort_values(by=cluster_id)

Let's put it together

In [None]:
# For every cluster, print the (up to) 10 members that lie closest to its center.
for cluster_id in range(kmeans_model.n_clusters):
    # Index labels of the rows assigned to this cluster.
    names = df[df['cluster_id'] == cluster_id].index
    # Column `cluster_id` of distances_df comes from kmeans_model.transform(X);
    # sorting it ascending puts the rows nearest to this cluster's center first.
    print(cluster_id, distances_df[distances_df.index.isin(names)].sort_values(by=cluster_id).index[:10])
    print()

Let's look at the lists above and let's see what we find

(stop here)

### AgglomerativeClustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

agg_model = AgglomerativeClustering(n_clusters=5, compute_distances=True)
agg_model

In [None]:
%%time 

agg_model.fit(X)

In [None]:
agg_model.labels_

In [None]:
agg_model.fit_predict(X)

In [None]:
pd.Series(agg_model.labels_).value_counts()

In [None]:
agg_model.children_.shape

#### Plotting

In [None]:
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    Builds a linkage matrix from ``model.children_``, ``model.distances_`` and
    the number of samples under each merge, then hands it to
    ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model : fitted estimator exposing ``children_``, ``distances_`` and ``labels_``.
    **kwargs : forwarded unchanged to ``dendrogram``.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # sizes[row] = number of original samples contained in the node created by
    # merge `row`. A child index below leaf_total is a single sample; any other
    # index refers to an earlier merge whose size has already been computed.
    sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        sizes[row] = sum(
            1 if node < leaf_total else sizes[node - leaf_total]
            for node in pair
        )

    # Linkage rows: the two merged children, the merge distance, the node size.
    linkage = np.column_stack([merges, model.distances_, sizes]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage, **kwargs)

In [None]:
plt.title('Hierarchical Clustering Dendrogram')
# plot the top three levels of the dendrogram
plot_dendrogram(agg_model, truncate_mode='level', p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

In [None]:
plt.title('Hierarchical Clustering Dendrogram')
# plot the top five levels of the dendrogram (p=5)
plot_dendrogram(agg_model, truncate_mode='level', p=5)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

(next week)


## Matrix Factorization / Dimensionality Reduction

### SVD

In [None]:
%%time 

from sklearn.decomposition import TruncatedSVD 

k = 10
# Seeded for reproducibility, matching the NMF cell below.
svd = TruncatedSVD(n_components=k, random_state=0)
# Factorize the feature matrix X rather than df: df now also holds the
# 'cluster_id' column added in the k-means section, which is not a feature.
U = svd.fit_transform(X)
S = svd.singular_values_
V = svd.components_

In [None]:
U

In [None]:
U.shape

In [None]:
S

In [None]:
S.shape

In [None]:
V

In [None]:
V.shape

### NMF

In [None]:
%%time 

from sklearn.decomposition import NMF
nmf = NMF(n_components=k, init='nndsvd', random_state=0)
# Factorize the feature matrix X rather than df: df now also holds the
# 'cluster_id' column added in the k-means section, which is not a feature.
W = nmf.fit_transform(X)
H = nmf.components_

In [None]:
W

In [None]:
W.shape

In [None]:
H

In [None]:
H.shape