# CategoricalGrouping Notebook - AAAG

In [18]:
# Basics Importation 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

# For Detailed Stats Output
import statsmodels.api as sm
# The linear regression models 
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LassoCV, ElasticNetCV
# Instantiating the linear regression models
ols = LinearRegression()
ridge = Ridge()
lasso  = Lasso()
lassocv = LassoCV()
elasticnet = ElasticNet()
elasticnetcv = ElasticNetCV()
# The tree model
from sklearn import tree
# Instantiating the tree model (regression type)
regressor = tree.DecisionTreeRegressor()
# The ensemble model for random forest and bagging
from sklearn import ensemble
# Instantiating the ensemble models
randomForest = ensemble.RandomForestRegressor()
bagging = ensemble.BaggingRegressor()
# Instantiating the boost models
gbm = ensemble.GradientBoostingRegressor()
abr = ensemble.AdaBoostRegressor()
# xg boost
import xgboost as xgb
# lg boost
import lightgbm as lgb
# K mean clustering
from sklearn.cluster import KMeans
kmeans = KMeans()
# The model selection for cross validation, k fold splits, train_test_split, grid search etc. 
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# Some automatic feature selection functions (recursive finding, best feature selection etc.)
import sklearn.feature_selection as fs
# Importing the different error evaluation/metrics
from sklearn import metrics
from sklearn.metrics import silhouette_score
# Importing PCA
from sklearn.decomposition import PCA
# Making it so that we can see all columns of the dataframe
pd.set_option('display.max_columns', None)

# Import data
standardized_no_outlier_undummified_df_total=pd.read_csv('standardized_no_outlier_undummified_df_train.csv',index_col='Id')
standardized_with_outlier_undummified_df_test=pd.read_csv('standardized_with_outlier_undummified_df_test.csv',index_col='Id')
# Specify df to work with:
df = standardized_no_outlier_undummified_df_total
df_test = standardized_with_outlier_undummified_df_test
df_test.shape
# df_test must have : undummfieid: (1459, 70), dummified: (1459,902)

(1459, 70)

In [2]:
nominal_var_processed=['MSZoning','Street','Alley','LotShape','LandContour','LotConfig','LandSlope',\
             'Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl',\
             'Exterior1st','Exterior2nd','MasVnrType','Foundation',\
             'BsmtFinType1','Heating','CentralAir',\
             'Electrical','Functional','GarageType','GarageFinish',\
             'PavedDrive','Fence','MiscFeature','SaleType','SaleCondition','MSSubClass',\
             'WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MoSold']
ordinal_var_processed=['OverallQual','OverallCond','BsmtFullBath','BsmtHalfBath','FullBath',\
             'HalfBath','BedroomAbvGr','TotRmsAbvGrd','Fireplaces','GarageCars',\
             'YrSold','HeatingQC','KitchenQual','ExterQual','ExterCond','BsmtQual',\
             'BsmtCond','BsmtExposure','FireplaceQu','GarageQual','GarageCond']
cont_var_processed=['LotFrontage','LotArea','YearBuilt','YearRemodAdd','MasVnrArea','TotalBsmtSF',\
          '1stFlrSF','2ndFlrSF','GrLivArea','GarageYrBlt','GarageArea']
cont_var_for_tuning=ordinal_var_processed+cont_var_processed

In [3]:
x=df.drop('SalePrice',axis='columns')
y=df.SalePrice
x_test=df_test.drop('GrLivArea',axis='columns')
y_test=df_test.GrLivArea

In [4]:
def dummify_column(dataframe,column_name):
    '''
    ### NOTE!!! ### vector operation not working yet!!!! ###
    - dataframe takes the entire dataframe you are working on
    - column_name takes a list of strings, where the strings are the column names
    '''
    for feature in column_name:
        dummified_feature = pd.get_dummies(dataframe.loc[:,feature], prefix=feature, prefix_sep='__',drop_first=True)
        dummified_df = pd.concat([dataframe.drop(feature,axis=1),dummified_feature],axis=1,sort='False')
    return dummified_df

In [5]:
# TAKES A LONG TIME TO RUN!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# TAKES A LONG TIME TO RUN!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# K mean clustering performed on categorical variables
# perform for train (non-test) set
nominal_var_cluster_num=[]
nominal_var_cluster_num_test=[]
for cat_var in nominal_var_processed:
    # dummifying the nominal categorical variable
    dummified_column=pd.get_dummies(x.loc[:,cat_var],prefix=cat_var, prefix_sep='__')
    df_for_cluster=pd.concat([y,dummified_column],axis=1,sort='False')
    print(cat_var)
    # Finding the optimal number of clusters and storing into nominal_var_cluster_num
    kmax = df_for_cluster.shape[1]
    KNumberChoice=range(2, kmax)
    # dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2
    # Doing mulitple trials for each category
    cluster_count=[]
    for trial_num in range(0,5):
#         print(trial_num)
        sil = []
        for k in range(2, kmax):
            kmeans = KMeans(n_clusters = k,random_state=trial_num,init='k-means++').fit(df_for_cluster)
            labels = kmeans.labels_
            sil.append(silhouette_score(df_for_cluster, labels, metric = 'euclidean'))
        if np.argmax(sil)<0.2: # defining threshold for 1 cluster
            cluster_count.append(1)
        else:
            cluster_count.append(KNumberChoice[np.argmax(sil)])
    nominal_var_cluster_num.append(max(set(cluster_count), key=cluster_count.count))

    # perform for the test set #######################################       
    dummified_column_test=pd.get_dummies(x_test.loc[:,cat_var],prefix=cat_var, prefix_sep='__')
    df_for_cluster_test=pd.concat([y_test,dummified_column_test],axis=1,sort='False')
    kmax_test = df_for_cluster_test.shape[1]
    KNumberChoice_test=range(2, kmax_test)
    cluster_count_test=[]
    for trial_num in range(0,5):
#         print(trial_num)
        sil = []
        for k in range(2, kmax_test):
            kmeans_test = KMeans(n_clusters = k,random_state=trial_num,init='k-means++').fit(df_for_cluster_test)
            labels_test = kmeans_test.labels_
            sil.append(silhouette_score(df_for_cluster_test, labels_test, metric = 'euclidean'))
        if np.argmax(sil)<0.2: # defining threshold for 1 cluster
            cluster_count_test.append(1)
        else:
            cluster_count_test.append(KNumberChoice_test[np.argmax(sil)])
    nominal_var_cluster_num_test.append(max(set(cluster_count_test), key=cluster_count_test.count))


MSZoning
Street
Alley
LotShape
LandContour
LotConfig
LandSlope
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
MasVnrType
Foundation
BsmtFinType1
Heating
CentralAir
Electrical
Functional
GarageType
GarageFinish
PavedDrive
Fence
MiscFeature
SaleType
SaleCondition
MSSubClass
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
MoSold


In [6]:
# With the known cluster number for each categorical variable, perform the clustering
# perform for train (non-test) set
for ind,cat_var in enumerate(nominal_var_processed):
    dummified_column=pd.get_dummies(x.loc[:,cat_var],prefix=cat_var, prefix_sep='__')
    df_for_cluster=pd.concat([y,dummified_column],axis=1,sort='False')
    kmeans=KMeans(n_clusters=nominal_var_cluster_num[ind]).fit(df_for_cluster)
    x.loc[:,cat_var]=kmeans.labels_
    
    # perform for the test set###################
    dummified_column_test=pd.get_dummies(x_test.loc[:,cat_var],prefix=cat_var, prefix_sep='__')
    df_for_cluster_test=pd.concat([y_test,dummified_column_test],axis=1,sort='False')
    kmeans_test=KMeans(n_clusters=nominal_var_cluster_num_test[ind]).fit(df_for_cluster_test)
    x_test.loc[:,cat_var]=kmeans_test.labels_
    
clustered_df=pd.concat([y,x],axis=1,sort='False')    
clustered_df_test=pd.concat([y_test,x_test],axis=1,sort='False')    


In [7]:
# Removing the columns with pure 0's
undummified_clustered_df=clustered_df.loc[:,(clustered_df != 0).any(axis=0)]

# Removing from the test set columns there were removed from the training set!
# undummified_clustered_df_test=clustered_df_test.loc[:,(clustered_df_test != 0).any(axis=0)]
dropped_col=set(clustered_df.columns)-set(clustered_df.columns[(clustered_df != 0).any(axis=0)])
cat_cols_to_keep = set(nominal_var_processed) - set(dropped_col)

In [8]:
# dummifying the clustered_df
undummified_clustered_df_copy=undummified_clustered_df.copy()
for feature in cat_cols_to_keep:
    undummified_clustered_df_copy=dummify_column(undummified_clustered_df_copy,[feature])
dummified_clustered_df=undummified_clustered_df_copy
print(dummified_clustered_df.isnull().sum().sum())
print(dummified_clustered_df.shape)
# dummifying the clustered_df_test
undummified_clustered_df_test_copy=clustered_df_test.copy()
for feature in cat_cols_to_keep:
    undummified_clustered_df_test_copy=dummify_column(undummified_clustered_df_test_copy,[feature])
dummified_clustered_df_test=undummified_clustered_df_test_copy
print(dummified_clustered_df_test.isnull().sum().sum())
print(dummified_clustered_df_test.shape)

0
(1215, 181)
0
(1459, 196)


In [9]:
dummified_clustered_df.columns[~dummified_clustered_df.columns.isin(cont_var_for_tuning)]

Index(['SalePrice', 'RoofStyle__1', 'RoofStyle__2', 'RoofStyle__3',
       'RoofStyle__4', 'GarageFinish__1', 'GarageFinish__2', 'GarageFinish__3',
       'SaleType__1', 'SaleType__2',
       ...
       'MSSubClass__5', 'MSSubClass__6', 'MSSubClass__7', 'MSSubClass__8',
       'MSSubClass__9', 'MSSubClass__10', 'MSSubClass__11', 'MSSubClass__12',
       'MSSubClass__13', 'MSSubClass__14'],
      dtype='object', length=149)

In [10]:
####################### ONLY FOR NON-TEST !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
####################### ONLY FOR NON-TEST !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# Removing sub-categories with low counts and their associated observations (if less than 2% of total observation)
boolmatrix=[]
non_cont_columns=dummified_clustered_df.columns[~dummified_clustered_df.columns.isin(cont_var_for_tuning)]
for cat_feature in non_cont_columns:
    if dummified_clustered_df.loc[:,cat_feature].sum()<0.020*dummified_clustered_df.shape[0]:
        print('yes')
        boolvec=(dummified_clustered_df.loc[:,cat_feature]==1)
        boolmatrix.append(list(boolvec))
   

yes
yes
yes
yes


In [11]:
# creating boolean vector to takeout observations that have categorical low count observations
if boolmatrix!=[]:
    reduction_bool_vec=np.any(np.array(boolmatrix).transpose(),axis=1)
    temp_df=dummified_clustered_df[~reduction_bool_vec]
    purged_dummified_clustered_df=temp_df.loc[:,(temp_df != 0).any(axis=0)]
else:
    purged_dummified_clustered_df=dummified_clustered_df
print(purged_dummified_clustered_df.shape)

(1138, 177)


In [12]:
purged_dummified_clustered_df.to_csv('purged_dummified_clustered_df.csv')
dummified_clustered_df_test.to_csv('dummified_clustered_df_test.csv')

In [13]:
############# Reducing Continuous/Ordinal Only ################
cont_df=purged_dummified_clustered_df.loc[:,cont_var_for_tuning]
pca = PCA(n_components = 0.90)
pca.fit(cont_df)
reduced_cont_df = pd.DataFrame(pca.transform(cont_df))
# Performing for the test set
cont_df_test=dummified_clustered_df_test.loc[:,cont_var_for_tuning]
pca_test = PCA(n_components = 0.90)
pca_test.fit(cont_df_test)
reduced_cont_df_test = pd.DataFrame(pca_test.transform(cont_df_test))

In [14]:
categorical_purged_dummified_clustered_df=purged_dummified_clustered_df.drop(cont_var_for_tuning,axis=1)
categorical_dummified_clustered_df_test=dummified_clustered_df_test.drop(cont_var_for_tuning,axis=1)

PCA_purged_dummified_clustered_df=pd.concat([categorical_purged_dummified_clustered_df.reset_index(),reduced_cont_df],axis=1)
PCA_dummified_clustered_df_test=pd.concat([categorical_dummified_clustered_df_test.reset_index(),reduced_cont_df_test],axis=1)

In [15]:
PCA_purged_dummified_clustered_df.set_index('Id',inplace=True)
PCA_dummified_clustered_df_test.set_index('Id',inplace=True)
PCA_purged_dummified_clustered_df.to_csv('PCA_purged_dummified_clustered_df.csv')
PCA_dummified_clustered_df_test.to_csv('PCA_dummified_clustered_df_test.csv')

# My Attempt to improve on k mean clustering 
# A Working progress

In [16]:
# from sklearn.cluster import KMeans
# from sklearn import metrics
# from scipy.spatial.distance import cdist
# import numpy as np
# import matplotlib.pyplot as plt

# # k means determine k
# distortions = []
# kmax = df_for_cluster.shape[1]
# K = range(1,kmax)
# for k in K:
#     kmeanModel = KMeans(n_clusters=k).fit(df_for_cluster)
#     kmeanModel.fit(df_for_cluster)
#     distortions.append(sum(np.min(cdist(df_for_cluster, kmeanModel.cluster_centers_, 'euclidean')+0*(k), axis=1)) / df_for_cluster.shape[0])

# # Plot the elbow
# plt.plot(K, distortions, 'bx-')
# plt.xlabel('k')
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.show()

In [17]:
# from sklearn.cluster import KMeans
# from sklearn import metrics
# from scipy.spatial.distance import cdist
# import numpy as np
# import matplotlib.pyplot as plt

# # k means determine k
# K = range(1,df_for_cluster.shape[1])
# penalty_term=np.linspace(0.005,0.01,101)
# distortion_matrix = np.zeros([len(K),len(penalty_term)])
# for i,k in enumerate(K):
#     for j,pen_weight in enumerate(penalty_term):
#         #print(i,j)
#         kmeanModel = KMeans(n_clusters=k).fit(df_for_cluster)
#         kmeanModel.fit(df_for_cluster)
#         distortion_matrix[i,j]=sum(np.min(cdist(df_for_cluster, kmeanModel.cluster_centers_, 'euclidean')+pen_weight*(k), axis=1)) / df_for_cluster.shape[0]
# # The minimum index
# np.unravel_index(np.argmin(distortion_matrix, axis=None), distortion_matrix.shape)
# # Plot the elbow
# plt.pcolor(distortion_matrix)
# plt.colorbar();
# plt.xlabel('Penalization');
# plt.ylabel('k clusters');