In [162]:
# general pyspark
from pyspark import SparkContext
from pyspark import SparkConf

# conf = SparkConf().setMaster("local").setAppName("svd_cluster.py")
# sc = SparkContext(conf = conf)

#import mllib
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.feature import Word2Vec
from pyspark.mllib.feature import StandardScalerModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix, BlockMatrix
from pyspark.mllib.linalg import Matrices
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel

# python imports
from math import sqrt
import numpy as np
from numpy import array
import os, csv, sys, time
from random import randint
from itertools import izip, izip_longest

sc = SparkContext.getOrCreate()

In [5]:
# def get_input(fn):

fn = "cl_tiny.csv"
cur_dir = os.path.abspath(os.curdir)
input_file_path = os.path.normpath(os.path.join(cur_dir, "..", "data", fn))
print(input_file_path)

os.path.isfile(input_file_path)

C:\Users\shane\programming\cs657_mining_massive_datasets\craigslist_clustering\data\cl_tiny.csv


True

In [19]:

"""
docConcentration or alpha, Concentration parameter (commonly named “alpha”) for the 
prior placed on documents’ distributions over topics (“theta”)

A high alpha-value will lead to documents being more similar in terms of what topics they contain     
The effect is based on topic distribution assumption
if symmetric distribution - high alpha means that each doc will contain a mix of most topics
if symmetric distribution - low alpha means that docs will contain a few topics
if asymmetric distribution - vice versa
--------------------

topicConcentration or beta – Concentration parameter (commonly named “beta” or “eta”)
for the prior placed on topics’ distributions over terms. (default: -1.0)

A high beta-value will lead to topics being more similar in terms of what words they contain.    
if symmetric distribution - high beta means that each doc will contain a mix of most words
if symmetric distribution - low alpha means that docs will contain a few words
if asymmetric distribution - vice versa
 
"""
cluster_model_params = {
   
    "lda":{
        "k":[x for x in range(2, 20, 2)],
        "max_iters": [x for x in range(15, 30, 10)], #default 20  
        "doc_con":[float(x/100.0) for x in range(1, 10)], # default -1.0
        "topic_con": [float(x/100.0) for x in range(1, 10)] # default -1.0
    }
    "bimeans":{
        "k":[x for x in range(2, 20, 2)],
        "max_iters": [x for x in range(15, 30, 10)], #default 20
        "minDivisibleClusterSize": [float(x/100.0) for x in range(1, 10)], #percent
    }
    "kmeans":{
        "k":[x for x in range(2, 20, 2)],
        "max_iters": [x for x in range(15, 30, 10)], #default 20
        "initializationMode": ["random", "k-means||")] #default k-means        
    }
    "gaus":{
        "k":[x for x in range(2, 20, 2)],
        "max_iters": [x for x in range(90, 150, 10)], #default 100
    }
    "pic":{
        "k":[x for x in range(2, 20, 2)],
        "max_iters": [x for x in range(90, 150, 10)], #default 100
    }
        
}
cluster_models = ["lda", "bimeans", "kmeans", "gaus", "pic"]

# processed_rdd = input.map(lambda x: str(x.decode('utf-8', 'ignore'))).map(lambda x: x.split(","))
# processed_rdd.take(2)

In [None]:
# [(postTitle, postingURL, postLocation, time, lat, long, address, dateRetrieved, post_date, ad), ...]
# tiny input has 30 reviews
raw_ads = sc.textFile(input_file_path)
# set = input.take(3)
# [ad0, ad1, ..]
ads_rdd = raw_ads.map(lambda x: str(x.decode('utf-8', 'ignore')))

In [59]:
type(ads_rdd.collect())

list

In [129]:
hashingTF = HashingTF()
tf = hashingTF.transform(ads_rdd)
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)
# spark.mllib's IDF implementation provides an option for ignoring terms
# which occur in less than a minimum number of documents.
# In such cases, the IDF for these terms is set to 0.
# This feature can be used by passing the minDocFreq value to the IDF constructor.
idfIgnore = IDF(minDocFreq=1).fit(tf)
tfidf_rdd = idfIgnore.transform(tf)
matrix_rdd = RowMatrix(tfidf_rdd)
#     left singular vectors
#     type = RowMatrix
#     svd_u = svd.U
#     array of DenseVectors, m_documents x n_topics
#     [[topic_i, ...], ...]
#     return svd_u.rows.collect()
svd_i = matrix_rdd.computeSVD(3, computeU=True)
rdd = svd_i.U.rows
clusters = KMeans.train(rdd, 5)
# WSSSE = rdd.map(lambda point: error(point)).reduce(lambda x, y: x + y)
# print("Within Set Sum of Squared Error = " + str(WSSSE))

In [132]:
# predicts = rdd.map(lambda x: (x, clusters.predict(x)))

[(DenseVector([0.1343, 0.0476, 0.021]), 4),
 (DenseVector([0.2498, 0.0151, 0.3793]), 0),
 (DenseVector([0.0404, 0.0287, 0.0503]), 3)]

SyntaxError: invalid syntax (<ipython-input-100-b66117a46539>, line 4)

In [None]:
run_kmeans_gs(ads_rdd)

In [148]:
def run_kmeans_gs(rdd, verbose=True):
    predictions = 
    ks = [x for x in range(2, 10, 2)]
    svd_topics = [x for x in range(2, 10, 2)]
    for topic in svd_topics:
        for k in ks:
            if verbose: print("k:{}, topic:{}".format(k, topic))
            # calculate tfidf scores
            tfidf_rdd = get_tfidf(rdd)
            
            # transform bag of words to svd
            # the svd object has U, Sigma, V
            # 
            svd = get_svd(tfidf_rdd, topic)
            
            # run kmeans with left singular vectors 
            predictions_rdd = kmeans(svd, k)
            if predictions is None:
                # list of vectors, where each vector is a document and its topic scores
                # [DenseVector[dim_i, dim_i+1 ...], ...]
                predictions = predictions_rdd.collect()
                print(type(predictions))
            else:
                predictions.append(predictions_rdd.collect())
    kmeans_predictions_fn="predictions_k{}_topics{}.npz"
    save_cluster_predictions(np.array(predictions), model="km", fn=kmeans_predictions_fn)
    
# k_means
# Build the model (cluster the data)
# kmeans(rdd, k, maxIterations, runs, InitializationMode, seed, initializationSteps, epsilon, initialModel)
def kmeans(svd, k=2, n_iters=10, save_model=False, verbose=True):
    model="kmeans"
    if verbose: print("in kmeans")
        
    # left singular vectors, U
    # array of DenseVectors, m_documents x n_topics
    # [ doc_i, doc_i+1, ...]
    # [[topic_j_score, topic_j+1_score ...], ...]
    rdd = svd.U.rows
    
    # Build the model (cluster the data)
    model = KMeans.train(rdd, k, maxIterations=n_iters)

    # Evaluate clustering
    cost = model.computeCost(rdd)
    save_cluster_metrics("bimeans", cost, k=k, clust_size=cluster_size)
    
    if save_model:
        save_cluster_model(model, fn)
    

    # returns an rdd of [(topic_values, cluster_id), ...]
    return rdd.map(lambda x: (x, model.predict(x)))


In [178]:
run_bimeans_gs(ads_rdd)

k:2, topic:2, cluster_size:[0.01]
in get_tfidf
in get_svd
<pyspark.mllib.linalg.distributed.SingularValueDecomposition object at 0x0000000007F79908>
in kmeans
saving cluster metrics to csv


Traceback (most recent call last):
  File "C:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\cloudpickle.py", line 148, in dump
    return Pickler.dump(self, obj)
  File "C:\Users\shane\Anaconda3\envs\py27\lib\pickle.py", line 224, in dump
    self.save(obj)
  File "C:\Users\shane\Anaconda3\envs\py27\lib\pickle.py", line 286, in save
    f(self, obj) # Call unbound method with explicit self
  File "C:\Users\shane\Anaconda3\envs\py27\lib\pickle.py", line 568, in save_tuple
    save(element)
  File "C:\Users\shane\Anaconda3\envs\py27\lib\pickle.py", line 286, in save
    f(self, obj) # Call unbound method with explicit self
  File "C:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\cloudpickle.py", line 255, in save_function
    self.save_function_tuple(obj)
  File "C:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\cloudpickle.py", line 292, in save_function_tuple
    save((code, closure, base_globals))
  File "C:\Users\shane\Anaconda3\envs\py27\lib\pickle.py", line 286, in sa

PicklingError: Could not serialize object: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.

In [177]:
def run_bimeans_gs(rdd, verbose=True):
    model = "bimeans"
    predictions = None
    ks = [x for x in range(2, 3, 2)]
    minDivisibleClusterSize = [float(x/100.0) for x in range(1, 50, 50)], #percent
    svd_topics = [x for x in range(2, 3, 2)]
    for topic in svd_topics:
        for k in ks:
            for c_size in minDivisibleClusterSize:
                if verbose: print("k:{}, topic:{}, cluster_size:{}".format(k, topic, c_size))
                # calculate tfidf scores
                tfidf_rdd = get_tfidf(rdd)

                # transform bag of words to svd
                svd = get_svd(tfidf_rdd, topic)
                if verbose: print(svd)

                # run kmeans with left singular vectors 
                predictions_rdd = bimeans(svd, k, c_size)
                # first run
                if predictions is None:
                    #
                    predictions = predictions_rdd.collect()
                    print(type(predictions))
                else:
                    predictions.append(predictions_rdd.collect())
    bimeans_predictions_fn="predictions_{}_topics{}_k{}_csize{}.npz".format(model, topic, k, c_size)
    save_cluster_predictions(np.array(predictions), model="bimeans", fn=kmeans_predictions_fn)

def bimeans(svd, k, cluster_size, verbose=True, save_model=False):
    model = "bimeans"
    if verbose: print("in kmeans")
        
    # left singular vectors, U
    # array of DenseVectors, m_documents x n_topics
    # [ doc_i, doc_i+1, ...]
    # [[topic_j_score, topic_j+1_score ...], ...]
    rdd = svd.U.rows
    
    # Build the model (cluster the data)
#     model = BisectingKMeans.train(rdd, k=k, minDivisibleClusterSize=cluster_size)
    model = BisectingKMeans.train(rdd)

    # Evaluate clustering
    cost = model.computeCost(rdd)
    save_cluster_metrics("bimeans", cost, k=k, clust_size=cluster_size)
    
    if save_model:
        save_cluster_model(model, fn)
        
    # returns an rdd of [(topic_values, cluster_id), ...]
    return rdd.map(lambda x: (x, model.predict(x)))

In [None]:
run_lds_gs(ads_rdd)


In [176]:
def run_lda_gs(rdd, verbose=True):

    model = "lda"
    predictions = None
    ks = [x for x in range(2, 3, 2)]
    doc_concepts = [float(x/100.0) for x in range(1, 10)], # default -1.0
    topic_concepts =[float(x/100.0) for x in range(1, 10)] # default -1.0
    for k in ks:
        for d_concept in doc_concepts:
            for t_concept in topic_concepts:
                if verbose: print("k:{}, d_concept:{}, t_concept:{}".format(k, d_concept, t_concept))
                # calculate tfidf scores
                tfidf_rdd = get_tfidf(rdd)

                # transform bag of words to svd
                svd = get_svd(tfidf_rdd, topic)
                if verbose: print(svd)

                # run kmeans with left singular vectors 
                predictions_rdd = bimeans(svd, k, cluster_size)
                # first run
                if predictions is None:
                    #
                    predictions = predictions_rdd.collect()
                    print(type(predictions))
                else:
                    predictions.append(predictions_rdd.collect())
    bimeans_predictions_fn="predictions_{}_topics{}_k{}_csize{}.npz".format(model, topic, k, c_size)
    save_cluster_predictions(np.array(predictions), model="bimeans", fn=kmeans_predictions_fn)

def lda(rdd, k, save_model=False):
    model = "lda"
    if verbose: print("in {}", model)
        
    # Index documents with unique IDs
    corpus = rdd.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

    # Cluster the documents into k topics using LDA
    model = LDA.train(corpus, k=3)

    # Output topics. Each is a distribution over words (matching word count vectors)
    print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize())
          + " words):")
    topics = model.topicsMatrix()
    for topic in range(3):
        print("Topic " + str(topic) + ":")
        for word in range(0, ldaModel.vocabSize()):
            print(" " + str(topics[word][topic]))

    # Evaluate clustering
    cost = model.computeCost(rdd)
    save_cluster_metrics("bimeans", cost, k=k, clust_size=cluster_size)
    
    if save_model:
        save_cluster_model(model, fn)
        
    # returns an rdd of [(topic_values, cluster_id), ...]
    return rdd.map(lambda x: (x, model.predict(x)))

In [None]:
run_gauss_gs(ads_rdd)

In [175]:
def run_gauss_gs(rdd, verbose=True):
    model = "bimeans"
    predictions = None
    ks = [x for x in range(2, 3, 2)]
    minDivisibleClusterSize = [float(x/100.0) for x in range(1, 50, 50)], #percent
    svd_topics = [x for x in range(2, 3, 2)]
    for topic in svd_topics:
        for k in ks:
            for c_size in minDivisibleClusterSize:
                if verbose: print("k:{}, topic:{}, cluster_size:{}".format(k, topic, c_size))
                # calculate tfidf scores
                tfidf_rdd = get_tfidf(rdd)

                # transform bag of words to svd
                svd = get_svd(tfidf_rdd, topic)
                if verbose: print(svd)

                # run kmeans with left singular vectors 
                predictions_rdd = bimeans(svd, k, c_size)
                # first run
                if predictions is None:
                    #
                    predictions = predictions_rdd.collect()
                    print(type(predictions))
                else:
                    predictions.append(predictions_rdd.collect())
    bimeans_predictions_fn="predictions_{}_topics{}_k{}_csize{}.npz".format(model, topic, k, c_size)
    save_cluster_predictions(np.array(predictions), model="bimeans", fn=kmeans_predictions_fn)

def gaussian(svd, k, cluster_size, verbose=True, save_model=False):
    model = "bimeans"
    if verbose: print("in kmeans")
        
    # left singular vectors, U
    # array of DenseVectors, m_documents x n_topics
    # [ doc_i, doc_i+1, ...]
    # [[topic_j_score, topic_j+1_score ...], ...]
    rdd = svd.U.rows
    
    # Build the model (cluster the data)
    model = BisectingKMeans.train(rdd, k=k, minDivisibleClusterSize=cluster_size)
    # Build the model (cluster the data)
    model = GaussianMixture.train(rdd, 2)
    
    
    # Evaluate clustering
    cost = model.computeCost(rdd)
    save_cluster_metrics("bimeans", cost, k=k, clust_size=cluster_size)
    
    if save_model:
        save_cluster_model(model, fn)
        
    # returns an rdd of [(topic_values, cluster_id), ...]
    return rdd.map(lambda x: (x, model.predict(x)))


def gaussian_clustering(rdd):


    # Save and load model
    gmm.save(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")
    sameModel = GaussianMixtureModel\
        .load(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")

    # output parameters of model
    for i in range(2):
        print("weight = ", gmm.weights[i], "mu = ", gmm.gaussians[i].mu,
              "sigma = ", gmm.gaussians[i].sigma.toArray())


In [134]:
def get_tfidf(rdd, verbose=True):
    if verbose: print("in get_tfidf")
    # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
    # First to compute the IDF vector and second to scale the term frequencies by IDF.
    hashingTF = HashingTF()
    tf = hashingTF.transform(rdd)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idfIgnore = IDF(minDocFreq=1).fit(tf)
    tfidf_rdd = idfIgnore.transform(tf)
    # rdd of SparseVectors [(doc_id_i: {word_id_j: tfidfscore_j, ...}), ... }]
    # or m docs x n counts
    return tfidf_rdd

def get_svd(tfidf_rdd, n_topics=3, verbose=True):
    if verbose: print("in get_svd")
    # distributed matrix
    matrix_rdd = RowMatrix(tfidf_rdd)
#     left singular vectors
#     type = RowMatrix
#     svd_u = svd.U
#     array of DenseVectors, m_documents x n_topics
#     [[topic_i, ...], ...]
#     return svd_u.rows.collect()
    svd = matrix_rdd.computeSVD(n_topics, computeU=True)
    return svd




In [173]:
def save_cluster_metrics(model, score, k=None, max_iters=None, clust_size=None, doc_concept=None, topic_concept=None):
    print('saving cluster metrics to csv')
    row = [model, score, k, max_iters, clust_size, doc_concept, topic_concept]
    with open(fn, 'a+') as f:
        writer = csv.writer(f)
        writer.writerow(row)

In [118]:
def save_svd_U(svd_U_rdd, fn="svd_u_results.npz"):
    np.savez(fn,np.array(svd_U_rdd))

#     sentence = "aa bb ab" * 10 + "a cd " * 10
#     localDoc = [sentence, sentence]
#     doc = sc.parallelize(localDoc).map(lambda line: line.split(" "))
#     model = Word2Vec().setVectorSize(10).setSeed(42).fit(doc)
# i think it is expecting a list of document lists [[word1, word2,...], ...]
def get_word2vec(rdd):
    word2vec = Word2Vec()
    model = word2vec.fit(ads_rdd)

def save_cluster_predictions(cluster_results, model="km", fn="cluster_results.pkl"):
    results_fn = "{}_{}".format(model, fn)
    np.savez(results_fn, cluster_results_rdd.collect())

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


# Save and load model
def save_cluster_model(clusters, fn="test_model"):
    clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")

def load_cluster_model(clusters, fn="test_model"):
    sameModel = KMeansModel.load(sc, fn)

# def get_model(n_topics, cluster_model="kmeans", cluster_params):
#     if cluster_model == "gaus":
#         gaussian_clustering()
#         pass
#     elif cluster_model == "kmeans":
#         pass
#     elif cluster_model == "bimeans":
#         pass
#     elif cluster_model == "lda":
#         pass
#     elif cluster_model == "pic":
#         pass
#     else:
#         print("a viable option wasnt chosen")
    
def grid_search(params, model):
    for i in param:
        rdd = get_svd()
        model = get_model()
        y_hats = make_preds()
        results_to_disk(rdd, y)

def gaussian_clustering(rdd):
    # Build the model (cluster the data)
    gmm = GaussianMixture.train(rdd, 2)

    # Save and load model
    gmm.save(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")
    sameModel = GaussianMixtureModel\
        .load(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")

    # output parameters of model
    for i in range(2):
        print("weight = ", gmm.weights[i], "mu = ", gmm.gaussians[i].mu,
              "sigma = ", gmm.gaussians[i].sigma.toArray())


def PIC_clustering(rdd, k, n_iters):
    # Load and parse the data
    pass
    data = sc.textFile("data/mllib/pic_data.txt")
    similarities = data.map(lambda line: tuple([float(x) for x in line.split(' ')]))

    # Cluster the data into two classes using PowerIterationClustering
#     model = PowerIterationClustering.train(similarities, 2, 10)
# # invalid syntax
# #     model.assignments().foreach(lambda x: print(str(x.id) + " -> " + str(x.cluster)))

#     # Save and load model
#     model.save(sc, "target/org/apache/spark/PythonPowerIterationClusteringExample/PICModel")
#     sameModel = Po[werIterationClusteringModel\
#             .load(sc, "target/org/apache/spark/PythonPowerIterationClusteringExample/PICModel")





In [33]:
"""
docConcentration or alpha, Concentration parameter (commonly named “alpha”) for the 
prior placed on documents’ distributions over topics (“theta”)

A high alpha-value will lead to documents being more similar in terms of what topics they contain     
The effect is based on topic distribution assumption
if symmetric distribution - high alpha means that each doc will contain a mix of most topics
if symmetric distribution - low alpha means that docs will contain a few topics
if asymmetric distribution - vice versa
--------------------

topicConcentration or beta – Concentration parameter (commonly named “beta” or “eta”)
for the prior placed on topics’ distributions over terms. (default: -1.0)

A high beta-value will lead to topics being more similar in terms of what words they contain.    
if symmetric distribution - high beta means that each doc will contain a mix of most words
if symmetric distribution - low alpha means that docs will contain a few words
if asymmetric distribution - vice versa
 
"""
cluster_model_params = {
   
    "lda":{
        "k":[x for x in range(2, 20, 2)],
        "max_iters": [x for x in range(15, 30, 10)], #default 20  
        "doc_con":[float(x/100.0) for x in range(1, 10)], # default -1.0
        "topic_con": [float(x/100.0) for x in range(1, 10)] # default -1.0
    }
    "bimeans":{
        "k":[x for x in range(2, 20, 2)],
        "max_iters": [x for x in range(15, 30, 10)], #default 20
        "minDivisibleClusterSize": [float(x/100.0) for x in range(1, 10)], #percent
    }
    "kmeans":{
        "k":[x for x in range(2, 20, 2)],
        "max_iters": [x for x in range(15, 30, 10)], #default 20
        "initializationMode": ["random", "k-means||")] #default k-means        
    }
    "gaus":{
        "k":[x for x in range(2, 20, 2)],
        "max_iters": [x for x in range(90, 150, 10)], #default 100
    }
    "pic":{
        "k":[x for x in range(2, 20, 2)],
        "max_iters": [x for x in range(90, 150, 10)], #default 100
    }
        
}
cluster_models = ["lda", "bimeans", "kmeans", "gaus", "pic"]

SyntaxError: invalid syntax (<ipython-input-33-3a597014084a>, line 29)

In [None]:
def run_bimeans_gs(rdd, verbose=True):
    predictions = None
    ks = [x for x in range(2, 20, 2)]
    minDivisibleClusterSize: [float(x/100.0) for x in range(1, 50,10)], #percent
    svd_topics = [x for x in range(2, 10, 2)]
    for topic in svd_topics:
        for k in ks:
            # calculate tfidf scores
            tfidf_rdd = get_tfidf(rdd)
            
            # transform bag of words to svd
            svd = get_svd(tfidf_rdd, topic)
            if verbose: print(svd)
            
            # run kmeans with left singular vectors 
            predictions_rdd = bimeans(svd, k)
            if predictions is None:
                predictions = predictions_rdd.collect()
                print(type(predictions))
            else:
                predictions.append(predictions_rdd.collect())
    kmeans_predictions_fn="predictions_k{}_topics{}.npz"
    save_cluster_predictions(np.array(predictions), model="km", fn=kmeans_predictions_fn)
    save_cluster_predictions(np.array(predictions), model="km", fn=kmeans_predictions_fn)

In [None]:
# def transpose_rdd()
# tfidf_rdd.flatMap(lambda x: x).take(3)
# flatMap by keeping the column position
# flat_rdd = tfidf_rdd.flatMap(lambda row: row.map(lambda col: (col, row.indexOf(col))))
# flat_rdd.take(3)
# .map(v => (v._2, v._1)) // key by column position
# .groupByKey.sortByKey   // regroup on column position, thus all elements from the first column will be in the first row
# .map(_._2)              // discard the key, keep only value
# df = rdd.toDF()
# # Grab data from first columns, since it will be transposed to new column headers
# new_header = [i[0] for i in dt.select("_1").rdd.map(tuple).collect()]

# # Remove first column from dataframe
# dt2 = dt.select([c for c in dt.columns if c not in ['_1']])

# # Convert DataFrame to RDD
# rdd = dt2.rdd.map(tuple)

# # Transpose Data
# rddT1 = rdd.zipWithIndex().flatMap(lambda (x,i): [(i,j,e) for (j,e) in enumerate(x)])
# rddT2 = rddT1.map(lambda (i,j,e): (j, (i,e))).groupByKey().sortByKey()
# rddT3 = rddT2.map(lambda (i, x): sorted(list(x), cmp=lambda (i1,e1),(i2,e2) : cmp(i1, i2)))
# rddT4 = rddT3.map(lambda x: map(lambda (i, y): y , x))

# # Convert back to DataFrame (along with header)
# df = rddT4.toDF(new_header)

# return df