In [None]:
# Linear Regression on a categorical variable using one-hot and dummy codes
import hashlib

import pandas
import pandas as pd
from sklearn import linear_model

# Toy dataset: three apartment listings in each of three cities, built from
# (city, rent) records rather than from parallel column lists.
df = pd.DataFrame(
    [
        ('SF', 3999), ('SF', 4000), ('SF', 4001),
        ('NYC', 3499), ('NYC', 3500), ('NYC', 3501),
        ('Seattle', 2499), ('Seattle', 2500), ('Seattle', 2501),
    ],
    columns=['City', 'Rent'],
)
# Overall mean rent across all nine listings.
print(df.loc[:, 'Rent'].mean())

In [None]:
# Convert the categorical column of the DataFrame to one-hot encoding and fit
# a linear regression model on all three indicator columns.
one_hot_df = pd.get_dummies(df, prefix=['city'])
print(one_hot_df)

# LinearRegression lives in sklearn's `linear_model` module, which is the name
# this notebook imports (`from sklearn import linear_model`).
model = linear_model.LinearRegression()
model.fit(one_hot_df[['city_NYC', 'city_SF', 'city_Seattle']], one_hot_df['Rent'])
print(model.coef_)
print(model.intercept_)

# Train a linear regression model on dummy coding.
# `drop_first=True` drops the first category's indicator (city_NYC here), so
# only city_SF and city_Seattle remain as features.
dummy_df = pd.get_dummies(df, prefix=['city'], drop_first=True)
print(dummy_df)

# The same estimator object is re-fitted; fit() replaces the previous
# coefficients and intercept.
model.fit(dummy_df[['city_SF', 'city_Seattle']], dummy_df['Rent'])
print(model.coef_)
print(model.intercept_)

In [None]:
# Effect coding (less commonly used): same indicator columns as dummy coding,
# but the rows of the dropped category are set to -1 instead of 0.
# Cast the indicator columns to float first so they can hold -1.0 regardless
# of the dtype get_dummies produced (astype returns a new frame, so dummy_df
# is left untouched).
effect_df = dummy_df.astype({'city_SF': 'float64', 'city_Seattle': 'float64'})
# Rows 3-5 are the NYC listings, i.e. the category dropped by drop_first.
# `.loc` slices by label and includes both endpoints, which is what the
# removed `.ix` accessor did on this default integer index.
effect_df.loc[3:5, ['city_SF', 'city_Seattle']] = -1.0
print(effect_df)

model.fit(effect_df[['city_SF', 'city_Seattle']], effect_df['Rent'])
print(model.coef_)
print(model.intercept_)

In [None]:
# Dealing with large categorical variables

In [None]:
# FEATURE HASHING
def hash_features(word_list, m, hash_func=None):
    """Map a bag of tokens to a length-``m`` count vector (the hashing trick).

    Args:
        word_list: Iterable of tokens to hash.
        m: Number of buckets, i.e. the length of the returned list.
        hash_func: Optional callable mapping a token to an int. When omitted,
            an 8-byte BLAKE2b digest of the token's UTF-8 text is used, so
            bucket assignments are the same in every interpreter run (the
            built-in ``hash`` is salted per process for strings).

    Returns:
        A list of ``m`` ints; entry ``i`` is the number of tokens whose hash
        value is congruent to ``i`` modulo ``m``.
    """
    if hash_func is None:
        def hash_func(word):
            digest = hashlib.blake2b(str(word).encode('utf-8'), digest_size=8).digest()
            return int.from_bytes(digest, 'big')

    output = [0] * m
    for word in word_list:
        # Colliding tokens simply accumulate in the same bucket.
        output[hash_func(word) % m] += 1
    return output

In [None]:
# SIGNED FEATURE HASHING
def hash_features(word_list, m, hash_func=None, sign_hash=None):
    """Map a bag of tokens to a length-``m`` signed count vector.

    Each token picks a bucket with ``hash_func`` and a sign with
    ``sign_hash``; the bucket is decremented when the sign hash is even and
    incremented otherwise.

    NOTE: this definition shares its name with the unsigned ``hash_features``
    defined in an earlier cell, so running this cell rebinds that name.

    Args:
        word_list: Iterable of tokens to hash.
        m: Number of buckets, i.e. the length of the returned list.
        hash_func: Optional callable mapping a token to an int (bucket hash).
        sign_hash: Optional callable mapping a token to an int (sign hash).
            When either callable is omitted, an 8-byte BLAKE2b digest of the
            token's UTF-8 text is used; the sign hash is personalised with
            ``b'sign'`` so it differs from the bucket hash.

    Returns:
        A list of ``m`` ints holding the signed counts per bucket.
    """
    def _digest_hash(person):
        # Build a deterministic token -> int hash; `person` selects an
        # independent BLAKE2b variant.
        def _hash(word):
            data = str(word).encode('utf-8')
            digest = hashlib.blake2b(data, digest_size=8, person=person).digest()
            return int.from_bytes(digest, 'big')
        return _hash

    if hash_func is None:
        hash_func = _digest_hash(b'')
    if sign_hash is None:
        sign_hash = _digest_hash(b'sign')

    output = [0] * m
    for word in word_list:
        index = hash_func(word) % m
        sign_bit = sign_hash(word) % 2
        if sign_bit == 0:
            output[index] -= 1
        else:
            output[index] += 1
    return output

In [None]:
# Feature Hashing on Yelp Reviews Dataset
import pandas as pd
import json


# Load reviews: parse up to the first 10,000 lines, one JSON document per line.
# The `with` block closes the file even if a line fails to parse, and zipping
# against range(10000) stops cleanly when the file is shorter than that
# instead of handing an empty string to json.loads.
with open('yelp_academic_dataset_review.json', encoding='utf-8') as f:
    js = [json.loads(line) for _, line in zip(range(10000), f)]
review_df = pd.DataFrame(js)

In [None]:
# Define m as equal to the unique number of business_ids
m = len(review_df.business_id.unique())
print(m)

from sklearn.feature_extraction import FeatureHasher
h = FeatureHasher(n_features=m, input_type='string')
# With input_type='string' every sample must itself be an iterable of feature
# strings. A bare string would be iterated character by character (and is
# rejected outright by recent scikit-learn releases), so wrap each id in a
# one-element list.
f = h.transform([[business_id] for business_id in review_df['business_id']])
print(review_df['business_id'].unique().tolist()[0:5])
# toarray() materialises the dense (n_reviews x m) matrix just for display.
print(f.toarray())

# Storage Size of our features
from sys import getsizeof
print("Our pandas Series, in bytes: ", getsizeof(review_df['business_id']))
# transform() returns a SciPy sparse matrix; getsizeof() on it would only
# measure the small wrapper object, so add up the underlying buffers instead.
print("Our Hashed numpy array, in bytes: ", f.data.nbytes + f.indices.nbytes + f.indptr.nbytes)

In [None]:
# BIN COUNTING
# Dataset used in Kaggle competition hosted by Avazu
import pandas as pd
# Read the click-log sample; note this rebinds `df` from the earlier cells.
df = pd.read_csv('data/train_subset.csv')
# Number of distinct device ids (a missing value, if any, counts as one).
print(df['device_id'].nunique(dropna=False))

# For each category, we calculate:
# Theta = [counts, p(click), p(no click), p(click)/p(no click)]

def click_counting(x, bin_column):
    """Count clicked and non-clicked rows for each value of ``bin_column``.

    Args:
        x: DataFrame containing ``bin_column`` and a numeric ``click`` column.
            Rows with ``click > 0`` are counted as clicks, rows with
            ``click < 1`` as non-clicks.
        bin_column: Name of the categorical column to aggregate on.

    Returns:
        DataFrame indexed by category value with int64 columns ``clicks``,
        ``no_clicks`` and ``total_clicks`` (the sum of the other two).
    """
    clicks = pd.Series(x[x['click'] > 0][bin_column].value_counts(), name='clicks')
    no_clicks = pd.Series(x[x['click'] < 1][bin_column].value_counts(), name='no_clicks')
    # Categories seen on only one side are NaN after alignment. Fill them with
    # a numeric 0 (not the string '0') so the columns stay numeric, then store
    # the counts as integers.
    counts = pd.DataFrame([clicks, no_clicks]).T.fillna(0).astype('int64')
    counts['total_clicks'] = counts['clicks'] + counts['no_clicks']
    return counts

def bin_counting(counts):
    """Add per-category click statistics to ``counts`` and return them.

    The input frame is modified in place: three columns are appended.

    * ``N+``     -- clicks / total_clicks
    * ``N-``     -- no_clicks / total_clicks
    * ``log_N+`` -- N+ / N- (a plain ratio; no logarithm is applied)

    Args:
        counts: DataFrame with ``clicks``, ``no_clicks`` and ``total_clicks``
            columns whose values can be cast to int64.

    Returns:
        Tuple ``(counts, bin_counts)``: the same (now extended) frame, and a
        new frame holding only the three derived columns.
    """
    total = counts['total_clicks'].astype('int64')
    counts['N+'] = counts['clicks'].astype('int64') / total
    counts['N-'] = counts['no_clicks'].astype('int64') / total
    counts['log_N+'] = counts['N+'] / counts['N-']
    # Keep just the derived statistics for the second return value.
    derived_columns = ['N+', 'N-', 'log_N+']
    return counts, counts[derived_columns]

# Bin-count the device_id column of the click log.
bin_column = 'device_id'
device_clicks = click_counting(
    df.filter(items=[bin_column, 'click']),
    bin_column,
)
device_all, device_bin_counts = bin_counting(device_clicks)
# One row per distinct category value.
print(device_bin_counts.shape[0])

# Show the four categories with the largest totals.
all_devices_sample = (
    device_all
    .sort_values(by='total_clicks', ascending=False)
    .iloc[:4]
)
print(all_devices_sample)