# Dataframe assembler

This notebook will:

* Input original dataset
* Conduct feature engineering for the following columns:
    * Lat/long: Add clusters, potentially; also neighborhood/other vars
    * Features: Exploded? and k-means clustering into 20 clusters -- kapow!
    * Manager: Add a manager score
    * Description: replace with text analysis thing, add columns for exclamations and punctuation
* This will generate a dataframe with several 'features' columns (eg. 'features_description', 'features_manager' etc.)
* We will then combine these columns into a single column of features vectors:
https://scikit-learn.org/0.18/auto_examples/hetero_feature_union.html looks very helpful for doing this

* We then split the data using 20% testing, 80% cv with 5 folds of 16% to parameterize the model

    * First model= logistic regression using no engineered features
    * Second model= random forest with no engineered features

    * Third model= logistic regression with engineered features
    * Fourth model= random forest with engineered features


Cross-validation and model comparison is based on log-loss.
    

In [1]:
# Initiate spark

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd

from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session named "Assemble_Data" with the
# executor/driver settings used throughout this notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Assemble_Data")
    .config("spark.executor.memory", '4g')
    .config('spark.executor.cores', '1')
    .config('spark.cores.max', '1')
    .config("spark.driver.memory", '1g')
    .getOrCreate()
)

# Handles derived from the session: the SparkContext and a SQLContext on top of it.
sc = spark.sparkContext
sqlCtx = SQLContext(sc)

In [2]:
# Import data: read the raw JSON with pandas, then hand the frame to Spark.
train_json_path = "data/train.json"
train_data_pd = pd.read_json(train_json_path)
train_data_df = sqlCtx.createDataFrame(data=train_data_pd)

# Feature Engineering

## Lat/long work:

## 'Features' work:

I have commented each line below; for more thorough explanations, see `rentalPrice_jonas.ipynb`.

In [23]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer, StopWordsRemover
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.clustering import BisectingKMeans
import pyspark.sql.types as typ

def add_features_columns(df, num_k=20, seed=None):
    """Add 0/1 ``feature_cluster_<id>`` columns derived from the 'features' array.

    Each listing's 'features' array is joined into one lower-cased,
    comma-separated string, split into individual feature phrases, and every
    phrase is embedded with a Tokenizer -> StopWordsRemover -> HashingTF -> IDF
    pipeline and clustered with BisectingKMeans. A listing gets a 1 in
    ``feature_cluster_<id>`` when at least one of its phrases fell into that
    cluster, and 0 otherwise.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Must contain 'listing_id' and an array-of-strings column 'features'.
    num_k : int, optional
        Number of clusters requested from BisectingKMeans. Defaults to 20, the
        value determined in rentalPrice_jonas.ipynb.
    seed : int or None, optional
        Seed forwarded to BisectingKMeans. ``None`` (default) leaves the
        estimator's own default in place.

    Returns
    -------
    pyspark.sql.DataFrame
        ``df`` with 'features' replaced by the joined lower-cased string, plus
        one ``feature_cluster_<id>`` column per cluster that occurs.
    """
    # Collapse the array of feature strings into one lower-cased string using
    # built-in column functions instead of a Python UDF.
    features_str = F.lower(F.concat_ws(",", df["features"]))
    # Listings without any feature get the placeholder phrase "missing features".
    df = df.withColumn(
        "features",
        F.when(features_str.isNull() | (features_str == ''), 'missing features')
         .otherwise(features_str))

    # One row per (listing, feature phrase): split on "," and on " * ".
    # The pattern is a raw string so "\*" is not an invalid Python escape.
    phrases_df = df.select(
        "listing_id",
        F.explode(F.split(df["features"], r',| \* ')).alias("text"))

    # Text pipeline: tokenize, drop stop words, hash into sparse term-frequency
    # vectors and re-weight by inverse document frequency.
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=2000)
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf])
    pipelined_df = pipeline.fit(phrases_df).transform(phrases_df)

    # Only pass `seed` when the caller supplied one, so the default call
    # constructs the estimator exactly as before.
    km_kwargs = {"k": num_k}
    if seed is not None:
        km_kwargs["seed"] = seed
    model = BisectingKMeans(**km_kwargs).fit(pipelined_df)

    # 'listing_id' travels through the pipeline with every phrase, so each
    # cluster assignment is already attached to its listing. This avoids
    # re-joining on two independently generated monotonically_increasing_id()
    # columns, whose values are not guaranteed to line up across DataFrames.
    clustered_df = model.transform(pipelined_df).select(
        "listing_id",
        F.col("prediction").alias("clusters"),
        F.lit(1).alias("constant_val"))  # constant needed by the pivot aggregate

    # One column per cluster id, holding 1 where the listing has a phrase in
    # that cluster and null otherwise.
    df_piv = (clustered_df
              .groupBy("listing_id")
              .pivot("clusters")
              .agg(F.first("constant_val")))

    # Attach the cluster indicators to the original listings.
    return_df = df.join(df_piv, on="listing_id", how="left")

    # Every pivot column except the key is a cluster indicator.
    cluster_col = [c for c in df_piv.columns if c != "listing_id"]
    # Clusters a listing does not touch become 0 instead of null.
    return_df = return_df.fillna(0, subset=cluster_col)
    # Rename "<id>" to "feature_cluster_<id>".
    for cluster in cluster_col:
        return_df = return_df.withColumnRenamed(cluster, "feature_cluster_" + cluster)

    return return_df

Testing the function

In [4]:
# List the column names of the raw training DataFrame.
train_data_df.columns

['bathrooms',
 'bedrooms',
 'building_id',
 'created',
 'description',
 'display_address',
 'features',
 'latitude',
 'listing_id',
 'longitude',
 'manager_id',
 'photos',
 'price',
 'street_address',
 'interest_level']

In [24]:
# Run the 'features' clustering step on the raw training DataFrame.
new_df = add_features_columns(train_data_df)

In [25]:
# Inspect the column names produced by add_features_columns.
new_df.columns

['listing_id',
 'bathrooms',
 'bedrooms',
 'building_id',
 'created',
 'description',
 'display_address',
 'features',
 'latitude',
 'longitude',
 'manager_id',
 'photos',
 'price',
 'street_address',
 'interest_level',
 'feature_cluster_0',
 'feature_cluster_1',
 'feature_cluster_2',
 'feature_cluster_3',
 'feature_cluster_4',
 'feature_cluster_5',
 'feature_cluster_6',
 'feature_cluster_7',
 'feature_cluster_8',
 'feature_cluster_9',
 'feature_cluster_10',
 'feature_cluster_11',
 'feature_cluster_12',
 'feature_cluster_13',
 'feature_cluster_14',
 'feature_cluster_15',
 'feature_cluster_16',
 'feature_cluster_17',
 'feature_cluster_18',
 'feature_cluster_19']

## Manager work:

In [7]:
from pyspark.ml.feature import StringIndexer

def add_manager_skill(df):
    """Attach per-manager listing counts and a mean-interest 'manager_skill' score.

    Adds 'manager_idx' (StringIndexer index of 'manager_id'), 'count' (number
    of rows per manager) and 'manager_skill' (mean numeric interest level per
    manager). The 'interest_level' column is overwritten with its numeric
    encoding: 'low' -> 0, 'medium' -> 1, anything else -> 2.
    """
    # Index the manager ids.
    indexer = StringIndexer(inputCol="manager_id", outputCol="manager_idx")
    indexed = indexer.fit(df).transform(df)

    # Number of listings per manager, joined back onto every listing.
    listings_per_manager = indexed.groupBy("manager_idx").count()
    with_counts = indexed.join(listings_per_manager, on="manager_idx", how="left")

    # Encode the interest level as a number.
    interest = F.col("interest_level")
    numeric_interest = (F.when(interest == 'low', 0)
                         .when(interest == 'medium', 1)
                         .otherwise(2))
    scored = with_counts.withColumn("interest_level", numeric_interest)

    # Mean encoded interest per manager is the manager's "skill".
    skill_per_manager = (scored
                         .groupBy("manager_idx")
                         .agg(F.mean("interest_level").alias("manager_skill")))

    return scored.join(skill_per_manager, on="manager_idx", how="left")

In [8]:
# List the column names of the raw training DataFrame.
train_data_df.columns

['bathrooms',
 'bedrooms',
 'building_id',
 'created',
 'description',
 'display_address',
 'features',
 'latitude',
 'listing_id',
 'longitude',
 'manager_id',
 'photos',
 'price',
 'street_address',
 'interest_level']

In [26]:
# Apply the manager-skill step to the output of the 'features' step.
new_df2 = add_manager_skill(new_df)

In [27]:
# Show the first three rows of new_df (the output of add_features_columns).
# NOTE(review): this displays new_df, not the new_df2 computed in the previous
# cell -- confirm whether new_df2 was the intended frame to inspect.
new_df.show(3)

+----------+---------+--------+--------------------+-------------------+--------------------+----------------+--------------------+--------+---------+--------------------+--------------------+-----+--------------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|listing_id|bathrooms|bedrooms|         building_id|            created|         description| display_address|            features|latitude|longitude|          manager_id|              photos|price|      street_address|interest_level|feature_cluster_0|feature_cluster_1|feature_cluster_2|feature_cluster_3|feature_cluster_4|feature_cluster_5|feature_cluster_6|feature_cluster_7|feature_cluster_8

In [22]:
def get_magager_skill(train_df, test_df):
    """Copy the per-manager 'manager_skill' learned on ``train_df`` onto ``test_df``.

    Parameters
    ----------
    train_df : pyspark.sql.DataFrame
        Must contain 'manager_id' and 'manager_skill'.
    test_df : pyspark.sql.DataFrame
        Must contain 'manager_id'. An existing 'manager_skill' column is
        dropped and recomputed.

    Returns
    -------
    pyspark.sql.DataFrame
        ``test_df`` with a 'manager_skill' column. Managers that do not occur
        in ``train_df`` receive the mean 'manager_skill' of ``train_df``.
    """
    # Drop the column manager_skill if it already exists, so the join below
    # does not produce a duplicate column.
    if "manager_skill" in test_df.columns:
        test_df = test_df.drop("manager_skill")

    # Average manager skill over the training rows; used for unseen managers.
    avg_skill = train_df.select(F.mean(train_df['manager_skill'])).collect()[0][0]

    # One (manager_id, manager_skill) row per unique manager.
    skill_by_manager = (train_df
                        .select("manager_id", "manager_skill")
                        .dropDuplicates(["manager_id"]))

    test_df = test_df.join(skill_by_manager, on="manager_id", how="left")

    # Fill ONLY the manager_skill column. Without `subset`, na.fill would also
    # overwrite nulls in every other numeric column with the average skill.
    # The mean is None when train_df has no non-null skill; nothing to fill then.
    if avg_skill is not None:
        test_df = test_df.na.fill(avg_skill, subset=["manager_skill"])

    return test_df


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
get_manager_skill = get_magager_skill

## Description work:

In [51]:
from pyspark.sql.functions import isnan
from pyspark.sql.functions import when, lit, col
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.linalg import DenseVector
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

# Make the replace function
def replace(column, value):
    """Return a column expression equal to ``column`` where it differs from
    ``value`` and to the literal string "none" everywhere else."""
    differs_from_value = column != value
    placeholder = lit("none")
    return when(differs_from_value, column).otherwise(placeholder)

# Make the full function
def add_description_columns(df):
    """Add bag-of-words columns for 'description' and an integer 'label'.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Must contain 'description' and 'interest_level'.

    Returns
    -------
    pyspark.sql.DataFrame
        ``df`` with 'description' cleaned (null or whitespace-only values
        become the string "none") and four added columns: 'words' (tokens),
        'filtered' (tokens without stop words), 'word_features' (term counts
        over a vocabulary of at most 1000 words seen in at least 5
        descriptions) and 'label' (integer interest level).
    """
    # Clean blanks: keep descriptions that contain at least one non-whitespace
    # character and replace everything else (including nulls) with "none".
    # This covers blank strings of any length, not just a few fixed widths.
    description = col("description")
    cleaned = df.withColumn(
        "description",
        when(description.rlike(r"\S"), description).otherwise(lit("none")))

    # Regular-expression tokenizer: "\\W" matches any non-word character
    # (anything other than letters, digits and underscore), so the text is
    # split on punctuation and whitespace.
    regexTokenizer = RegexTokenizer(inputCol="description", outputCol="words", pattern="\\W")

    # Stop words. The original list repeated "the" and also listed "A";
    # StopWordsRemover is case-insensitive by default, so the de-duplicated
    # list removes exactly the same tokens.
    add_stopwords = ["a", "the", "it", "of", "is", "and", "this", "in", "for"]
    stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

    # Bag-of-words counts.
    countVectors = CountVectorizer(inputCol="filtered", outputCol="word_features", vocabSize=1000, minDF=5)

    pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors])

    # Fit the pipeline to the documents and transform them.
    dataset = pipeline.fit(cleaned).transform(cleaned)

    # Build the integer label. When 'interest_level' is still the raw string
    # ('low' / 'medium' / 'high') a plain integer cast would yield null for
    # every row, so map the names explicitly (same encoding as
    # add_manager_skill) and fall back to the cast for anything else.
    interest = col("interest_level")
    if dict(dataset.dtypes).get("interest_level") == "string":
        label = (when(interest == "low", 0)
                 .when(interest == "medium", 1)
                 .when(interest == "high", 2)
                 .otherwise(interest.cast(IntegerType())))
    else:
        label = interest.cast(IntegerType())
    dataset = dataset.withColumn("label", label)

    return dataset

In [52]:
# Apply the description step on top of the manager-skill output.
new_data_3 = add_description_columns(new_df2)

+---------+--------+--------------------+-------------------+--------------------+-------------------+--------------------+--------+----------+---------+--------------------+--------------------+-----+--------------------+--------------+--------------------+--------------------+--------------------+-----+
|bathrooms|bedrooms|         building_id|            created|         description|    display_address|            features|latitude|listing_id|longitude|          manager_id|              photos|price|      street_address|interest_level|               words|            filtered|       word_features|label|
+---------+--------+--------------------+-------------------+--------------------+-------------------+--------------------+--------+----------+---------+--------------------+--------------------+-----+--------------------+--------------+--------------------+--------------------+--------------------+-----+
|      1.5|       3|53a5b119ba8f7b61d...|2016-06-24 07:54:24|A Brand New 3 Bed.

# Feature Union

In [None]:
# sklearn's Pipeline is imported under an alias: a plain
# `from sklearn.pipeline import Pipeline` would rebind the name `Pipeline`,
# which add_description_columns expects to be the pyspark.ml Pipeline, and
# break that function if it is called again after this cell has run.
from sklearn.pipeline import Pipeline as SkPipeline, FeatureUnion

# NOTE: FeatureUnion expects a list of (name, transformer) pairs whose
# transformers expose fit/transform. add_manager_skill and
# add_description_columns are plain functions that map one Spark DataFrame to
# another, so this object cannot be fitted as-is.
pipeline = SkPipeline([
    ('feats', FeatureUnion([
        ('manager', add_manager_skill),
        ('description', add_description_columns)
    ])),
])

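# This is not working: FeatureUnion needs sklearn transformers (objects with fit/transform), not plain functions that operate on Spark DataFrames.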

In [53]:
# Mark the assembled DataFrame for caching (the same name is rebound to the result).
new_data_3 = new_data_3.cache()


In [83]:
# Show the first row to check the assembled columns.
new_data_3.show(1)

+-----------+----------+---------+--------+--------------------+-------------------+--------------------+---------------+--------------------+--------+---------+--------------------+--------------------+-----+------------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----+------------------+--------------------+--------------------+--------------------+-----+
|manager_idx|listing_id|bathrooms|bedrooms|         building_id|            created|         description|display_address|            features|latitude|longitude|          manager_id|              photos|price|    street_address|interest_level|feature_cluster_0|feature_cluster_1|feature_clust

In [None]:
# Hand-picked positional column indices -- presumably positions of the numeric
# columns in new_data_3; verify against its schema.
# NOTE(review): columns_num is reassigned in a later cell before it is first
# used, so this value never takes effect.
columns_num = [0, 2, 3, 4, 9, 10, 11, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 40]

In [89]:
# Columns that are cast to float and packed into the feature vector:
# the base listing columns, the 20 feature-cluster indicators, and the
# manager skill score.
# NOTE(review): 'building_id' and 'manager_id' hold id strings, and the
# assembled vectors further down show nan in exactly those two positions --
# consider dropping them from this list.
features_to_combine = (
    ['manager_idx', 'bathrooms', 'bedrooms', 'building_id',
     'latitude', 'longitude', 'manager_id', 'price']
    + ['feature_cluster_{}'.format(k) for k in range(20)]
    + ['manager_skill']
)


In [94]:
# Cast every column in features_to_combine to float (keeping its name) and
# keep the word-count vector and the label alongside them.
# The casts are unpacked directly into select(): the previous form wrapped the
# unpacking in an extra pair of parentheses, `(*(...))`, which current Python
# parsers reject as a syntax error.
numeric_columns = [col(c).cast("float").alias(c) for c in features_to_combine]
new_data_3b = new_data_3.select(*numeric_columns, "word_features", "label")
new_data_3b

DataFrame[manager_idx: float, bathrooms: float, bedrooms: float, building_id: float, latitude: float, longitude: float, manager_id: float, price: float, feature_cluster_0: float, feature_cluster_1: float, feature_cluster_2: float, feature_cluster_3: float, feature_cluster_4: float, feature_cluster_5: float, feature_cluster_6: float, feature_cluster_7: float, feature_cluster_8: float, feature_cluster_9: float, feature_cluster_10: float, feature_cluster_11: float, feature_cluster_12: float, feature_cluster_13: float, feature_cluster_14: float, feature_cluster_15: float, feature_cluster_16: float, feature_cluster_17: float, feature_cluster_18: float, feature_cluster_19: float, manager_skill: float, word_features: vector, label: int]

In [95]:
# Convert the selected DataFrame to an RDD of Row objects.
new_data_3_rdd = new_data_3b.rdd

In [103]:
# Positions 0..28 of each row: the 29 numeric columns that go into the feature vector.
columns_num = list(range(29))

In [104]:
# Pair each row's label with a dense vector of its numeric feature columns.
# The label is read by name rather than by the hard-coded position 30, so the
# mapping does not silently pick up the wrong column if the selected column
# list changes.
input_rdd = new_data_3_rdd.map(
    lambda row: (row["label"], DenseVector([row[i] for i in columns_num]))
)

In [105]:
# Peek at the first two (label, feature vector) pairs.
input_rdd.take(2)

# TODO: word_features still needs to be converted to a dense vector, flattened,
# and combined with this RDD.
# TODO: the feature vectors contain missing values (nan) that must be handled.

[(0,
  DenseVector([299.0, 1.0, 3.0, nan, 40.7399, -73.9864, nan, 5595.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5556])),
 (0,
  DenseVector([299.0, 1.0, 2.0, nan, 40.7399, -73.9864, nan, 3995.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5556]))]