In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, mixture
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
import numpy as np
import matplotlib.pyplot as plt

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

vectorizer = TfidfVectorizer(max_features=1000)
vectors = vectorizer.fit_transform(newsgroups_train.data)

vectors_test = vectorizer.transform(newsgroups_test.data)

labels = newsgroups_train.target
true_k = np.unique(labels).shape[0]

# print(labels, true_k)
# print(newsgroups_test.target_names)

# K-mean
print("K-Mean")
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
km.fit(vectors)

# order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# terms = vectorizer.get_feature_names()
# for i in range(true_k):
#     print("Cluster %d:" % i, end='')
#     for ind in order_centroids[i, :10]:
#         print(' %s' % terms[ind], end='')
#         print()

# print(km.cluster_centers_)
pre_labels = km.predict(vectors_test)

# How to decide label with majority rule

from collections import Counter
## for k clusters
lbls = [[] for i in range(true_k)]
i = 0
all_labels =len(pre_labels)

while i < all_labels:
    cluster = pre_labels[i]
    label = newsgroups_train.target[i]
    lbls[cluster].append(label)
    i += 1

label_dict = {}
for cluster in range(true_k):
    label, count = Counter(lbls[cluster]).most_common()[0]
    label_dict[cluster] = label    
print (label_dict)

# print("Weighted F-1 score", f1_score(labels, km.labels_, average='weighted'))
print("Weighted F-1 score", f1_score(newsgroups_test.target, pre_labels, average='weighted'))

# Gaussian Mixture Models
print("GMM")
gmix = mixture.GMM(n_components=true_k, covariance_type='full')
gmix.fit(vectors.todense())
pre_labels = gmix.predict(vectors_test.todense())

## for k clusters
lbls = [[] for i in range(true_k)]
i = 0
all_labels =len(pre_labels)

while i < all_labels:
    cluster = pre_labels[i]
    label = newsgroups_train.target[i]
    lbls[cluster].append(label)
    i += 1

label_dict = {}
for cluster in range(true_k):
    label, count = Counter(lbls[cluster]).most_common()[0]
    label_dict[cluster] = label    
print (label_dict)

print("Weighted F-1 score", f1_score(newsgroups_test.target, pre_labels, average='weighted'))

K-Mean
{0: 12, 1: 8, 2: 2, 3: 3, 4: 10, 5: 6, 6: 2, 7: 4, 8: 8, 9: 1, 10: 18, 11: 10, 12: 6, 13: 10, 14: 13, 15: 9, 16: 15, 17: 12, 18: 15, 19: 3}
Weighted F-1 score 0.0340343406213
GMM


In [5]:
from sklearn.datasets import load_boston
from sklearn.linear_model import Lasso, Ridge
from sklearn.cross_validation import train_test_split
import numpy as np

boston = load_boston()

# divide dataset randomly
X_train, X_test, Y_train, Y_test = train_test_split(boston.data, boston.target, test_size=0.3, random_state=5)

# Create linear regression object with a ridge coefficient 0.5
ridge = Ridge()
ridge_a = Ridge(alpha=0.5)

# Train the model using the training set
ridge.fit(boston.data,boston.target)
ridge_a.fit(X_train,Y_train)

# Compute RMSE on training data
pred_test = ridge.predict(X_test)
pred_test_a = ridge_a.predict(X_test)

print ("Ridge")
print ("Default parameters: ")
print ("Fit a model with full training data set, and calculate MSE on test set:", np.mean((Y_test - pred_test) ** 2))
print ("With alpha=0.5: ");
print ("Fit a model with validation set, and calculate MSE on test set:", np.mean((Y_test - pred_test_a) ** 2))

# Create linear regression object with a ridge coefficient 0.5
lasso = Lasso()
lasso_a = Lasso(alpha=0.5)

# Train the model using the training set
lasso.fit(boston.data,boston.target)
lasso_a.fit(X_train,Y_train)

# Compute RMSE on training data
pred_test_l = lasso.predict(X_test)
pred_test_la = lasso_a.predict(X_test)

print ("Lasso")
print ("Default parameters: ")
print ("Fit a model with full training data set, and calculate MSE on test set:", np.mean((Y_test - pred_test_l) ** 2))
print ("With alpha=0.5: ");
print ("Fit a model with validation set, and calculate MSE on test set:", np.mean((Y_test - pred_test_la) ** 2))

Ridge
Default parameters: 
Fit a model with full training data set, and calculate MSE on test set: 27.8042428025
With alpha=0.5: 
Fit a model with validation set, and calculate MSE on test set: 31.2054589186
Lasso
Default parameters: 
Fit a model with full training data set, and calculate MSE on test set: 34.6523813556
With alpha=0.5: 
Fit a model with validation set, and calculate MSE on test set: 33.6083554076
