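###  Clustering the Cervical-Cancer Behavior-Risk Data (sobar-72) with "KMeans"

Raw Data Resource:
https://archive.ics.uci.edu/ml/machine-learning-databases/00537/sobar-72.csv

##### Dataframe is composed of 
- twenty columns, renamed in the notebook to:
   1. sexualRisk
   2. eating
   3. Hygine
   4. aggregation
   5. commitment
   6. consistency
   7. spontaneity
   8. Person
   9. fulfill
   10. vulnerability
   11. severity
   12. strength
   13. willingness
   14. emotionality
   15. appreciation
   16. instrumental
   17. knowledge
   18. abilities
   19. desires
   20. cervix
   
   
- column 1, "sexualRisk", is dropped right after loading; the remaining columns are divided
  into 'Scalar' and 'Categorical' features further down in the notebook.
   
    
##### Purpose of Project:
Making an Estimator to predict the **KMeans cluster** that a new record belongs to.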


In [None]:
# importing required modules

import statistics as stat
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA 

from sklearn.ensemble import IsolationForest

from sklearn.utils import resample
#from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale 

from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score 

In [None]:
# reading data 

df_raw = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00537/sobar-72.csv', header=0) # delimiter=';'
df_raw.head()

In [None]:
# fixing columns header


df_raw.columns = ['sexualRisk', 'eating', 'Hygine',
       'aggregation', 'commitment', 'consistency',
       'spontaneity', 'Person', 'fulfill',
       'vulnerability', 'severity',
       'strength', 'willingness',
       'emotionality', 'appreciation',
       'instrumental', 'knowledge',
       'abilities', 'desires', 'cervix']
df_raw.head(3)

In [None]:
# deleting unnecessary columns (eg. ID)

# Columns listed here are removed from the raw frame; every other column is
# kept, in its original order.
del_columns = [ 'sexualRisk' ]

keep_columns = [ col for col in df_raw.columns if col not in del_columns ]

df_raw = df_raw.loc[ : , keep_columns ]
df_raw.head(3)

In [None]:
# reindexing: renumber the rows 0..n-1.
# reset_index(drop=True) renumbers the existing rows; reindex(range(n)) would
# instead look the labels 0..n-1 up and produce all-NaN rows for any missing label.
df_raw = df_raw.reset_index( drop=True )

### Inspecting Columns
##### columns of dtype='object'  may contain:
 - 'whitespaces' in strings  
 - 'missing-values' which are usually shown by question-mark, '?'

In [None]:
# define a function called  find_obj_cols  which takes a dataframe
# and returns list of columns with  dtype=='object'

def find_obj_cols(df):
    """Return the labels of the columns of ``df`` whose dtype is 'object'."""
    dtypes = df.dtypes
    return dtypes.index[ dtypes=='object' ]

# object columns 
obj_cols = find_obj_cols( df_raw ); print('obj_cols',obj_cols)

In [None]:
# help
obj_cols_index = [ ]

# (auto)
for i in obj_cols_index:
    c = obj_cols[i]; print(' ', c, ' ', sorted(df_raw.loc[:,c].unique()))

In [None]:
# (input) white_cols  &  qmark_cols  &  str_cols
white_cols = [ ]; print('white_cols',white_cols)
qmark_cols = [ ]; print('qmark_cols',qmark_cols)

# (input) integers wrongfully as object  &  floats wrongfully as objects
int_o   = [ ]; print('int_o',int_o)
float_o = [ ]; print('float_o',float_o)

# (auto) str_cols: the object columns that were not declared above as
# numbers stored as objects
numeric_objects = int_o+float_o
str_cols = [ col for col in obj_cols if col not in numeric_objects ]
print('str_cols',str_cols)

In [None]:
# (auto) define a function called  find_int_cols  which takes a dataframe
# and returns list of columns with  dtype=='int64'
def find_int_cols(df):
    """Return the labels of the columns of ``df`` whose dtype is 'int64'."""
    dtypes = df.dtypes
    return dtypes.index[ dtypes=='int64' ]

# (auto) integer-columns mixed with float-columns 
int_i   = find_int_cols( df_raw );  print('int_i',int_i)
float_i = []; print('float_i',float_i)  # always empty

In [None]:
# (auto) define a function called  find_float_cols  which takes a dataframe
# and returns list of columns with  dtype=='float64'
def find_float_cols(df):
    """Return the labels of the columns of ``df`` whose dtype is 'float64'."""
    dtypes = df.dtypes
    return dtypes.index[ dtypes=='float64' ]

# (auto) float-columns mixed with integer-columns
float_cols_dirty = find_float_cols( df_raw ); print('float_cols_dirty',float_cols_dirty)

In [None]:
# help: positions (within float_cols_dirty) of the columns to inspect
float_cols_dirty_index = list( np.arange(7,9) )

# (auto) print the sorted unique values of every requested column.
# Positions past the end of float_cols_dirty are skipped, so the cell also
# runs when the frame has fewer float columns than requested (or none at all).
for i in float_cols_dirty_index:
    if i >= len(float_cols_dirty):
        continue
    c = float_cols_dirty[i]; print(' ', c, ' ', np.round( sorted(df_raw.loc[:,c].unique()),2))

In [1]:
# (input) index of int-columns wrongfully in float_cols_dirty
int_index = [  ]

# (auto) split float_cols_dirty into the columns picked above (really integers)
# and all the remaining ones (really floats)
int_f   = [ float_cols_dirty[i] for i in int_index ]
float_f = [ col for col in float_cols_dirty if col not in int_f ]

print('int_f',int_f); print('\nfloat_f',float_f)

NameError: name 'float_cols_dirty' is not defined

In [None]:
# int_cols & float_cols (auto)

int_cols   = list(int_o)   + list(int_i)   + list(int_f);   print('int_cols',int_cols)
float_cols = list(float_o) + list(float_i) + list(float_f); print('\nfloat_cols',float_cols)

### Fixing *'whitespaces'* in *dtype='object'* columns

In [None]:
# (auto)
df_dirty = df_raw.copy()

# define a function called fix_whitespaces
# to remove whitespaces from begining/end of strings

def fix_whitespaces( df, cols ):
    """Strip leading/trailing whitespace from the strings in the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    cols : iterable
        Labels of the columns to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns stripped.
    """
    for c in cols:
        # Element-wise map: works with any row index (no 0..n-1 label lookup)
        # and lets non-string cells (e.g. missing values) pass through untouched.
        df[c] = df[c].map( lambda v: v.strip() if isinstance(v, str) else v )
    return df

# removing
df_no_white = fix_whitespaces( df_dirty, white_cols )
    
# Checking  
for c in obj_cols:
    print('\n', c, '\n', sorted(df_no_white.loc[:,c].unique()))

### Fixing '?' in *dtype='object'* columns
 - in numeric columns,  '?' => "median" of numbers
 - in string columns, '?' => "mode" of strings

In [None]:
# (auto)
df_dirty = df_no_white.copy() 
 
# define a function called fix_qmarks
# which returns corrected df[ft]

def fix_qmarks( df, ft, str_cols, num_cols ):
    """Return a copy of column ``ft`` in which every '?' cell is replaced.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is not modified.
    ft : label
        Name of the column to fix.
    str_cols : collection
        Labels of the string columns. A column listed here is filled with the
        mode of its known values; any other column is treated as numeric and
        filled with the median of its known values.
    num_cols : collection
        Not used; kept so that existing calls keep working.

    Returns
    -------
    pandas.Series
        The fixed column, cast to 'str' for string columns and to 'float64'
        otherwise.

    Raises
    ------
    statistics.StatisticsError
        If the column has no known (non-'?') value to derive the filler from.
    """
    x = df[ft].copy()

    typ = 'str' if ft in str_cols else 'num'

    # Boolean mask instead of a loop over 0..n-1, so any row index works.
    is_qmark = ( x=='?' )
    known    = x[ ~is_qmark ]

    if typ=='str':
        ave   = stat.mode( known.astype('str').tolist() )
        fixed = x.astype('str').mask( is_qmark, ave )
    else:
        ave   = stat.median( known.astype('float64').tolist() )
        # '?' cells are blanked first so that the column can be cast to float
        fixed = x.where( ~is_qmark ).astype('float64').mask( is_qmark, ave )

    print(ft,typ)

    return fixed

# fixing columns
for ft in qmark_cols: 
    df_dirty.loc[:,ft] = fix_qmarks( df_dirty, ft, str_cols, int_cols+float_cols )
     

df_no_qmark = df_dirty.copy()
print( df_no_qmark.head() )

### Fixing *'blank cells'* in all columns (auto)

In [None]:
# finding blank-cols
df_dirty = df_no_qmark.copy() 

# Boolean function
# True: feature has blank_cells
def has_blank(df,ft):
    """Tell whether column ``ft`` of ``df`` contains at least one blank (missing) cell."""
    return df[ft].isna().any()

# blank_cols: every column of df_dirty that has at least one blank cell
blank_cols = [ col for col in df_dirty.columns if has_blank( df_dirty, col ) ]

print('blank_cols',blank_cols)

In [None]:
# fixing blank-cols

# define a function called fix_blank
# which returns corrected df[ft]

def fix_blank( df, ft, str_cols, int_cols, float_cols ):
    
    x = df[ft].copy()
    n = df.shape[0]
    
    if ft in str_cols:   
        typ='str'
    elif ft in int_cols: 
        typ = 'int'
    elif ft in float_cols: 
        typ = 'float'
    else:   
        print('\nWarning!\n ', ft, ' is neither in  str_cols  nor in  int_cols  nor in  float_cols')
        return x
        
    val=[]
    for r in range(0,n):
        if (  pd.notnull(x[r])==True  ): 
            val.append( x[r] )
    
    if (  typ=='str'  ):
        val = np.array(val).astype('str')
        ave = stat.mode(val) 
    elif (  typ=='int'  ):
        val = np.array(val).astype('int')
        ave = stat.median(val)
    elif (  typ=='float'  ):
        val = np.array(val).astype('float')
        ave = stat.median(val)
    else:
        print("typ is neither  'str'  nor  'int'  nor  'float'")
        return df[ft]
    
    for r in range(0,n):
        if (  pd.isnull(x[r])==True  ):  
            x[r]=ave
    
    if   (  typ=='str'  ):  
        return x.astype('str')
    elif (  typ=='int'  ):  
        return x.astype('int64')                 
    else:
        return x.astype('float64')

# fixing columns
for ft in blank_cols: 
    df_dirty.loc[:,ft]=fix_blank( df_dirty, ft, str_cols, int_cols, float_cols )

df_no_blank = df_dirty.copy()
df_no_blank.head(3)

### df_no_missing

In [None]:
# df_no_missing
df_no_missing = df_no_blank.copy()

#####   
### fixing dtype of columns

In [None]:
df = df_no_missing.copy()

for c in int_cols:   df[c]=df[c].astype('int64')
for c in float_cols: df[c]=df[c].astype('float64')
for c in str_cols:   df[c]=df[c].astype('str')
    
df_no_missing = df.copy()

print('int_cols',int_cols); print('\nfloat_cols',float_cols); print('\nstr_cols',str_cols)

### Dividing columns into 'Scalar' and 'Categorical'
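Columns of dtype 'object' are 'Categorical' and columns of dtype 'float' are 'Scalar'.
Integer columns with 4 or fewer unique values are 'Categorical'.
Integer columns with 5 or more unique values are inspected by hand: the ordinal ones become 'Scalar', the non-ordinal ones 'Categorical'.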

In [None]:
# Guide
'''                                         *
   dtype            n_uniques               *
                n<=4         n>=5           *              ordinal     non-ordinal
                                            *                       
   object    categorical   categorical      *       XXX:   scalar      categorical
   float       scalar        scalar         *
    int      categorical      XXX           *
                                            *    
'''  

In [None]:
# sc & cat
df = df_no_missing

cat, sc, xxx = [], [], []
for c in df.columns: 
    if   df[c].dtype=='object' : cat.append(c)
    elif df[c].dtype=='float64': sc.append(c)
    elif len(df[c].unique())<=4: cat.append(c)
    else                       : xxx.append(c)  
        
print('cat',cat); print('\nsc',sc); print('\nxxx',xxx)      

In [None]:
# help
indexes = [0,1,2] 
for i in indexes:
    c = xxx[i] 
    print( ' ', i,'  #', len(df_no_missing[c].unique()), ' ', sorted(df_no_missing[c].unique()) )

In [None]:
# (input) positions, within  xxx,  of the columns that are NOT ordinal
xxx_non_ordinal_index = [ ]

# (auto) split  xxx  by position into non-ordinal and ordinal columns
positions = range(0,len(xxx))
non_ordinal_columns = [ xxx[i] for i in positions if i in xxx_non_ordinal_index ]
ordinal_columns     = [ xxx[i] for i in positions if i not in xxx_non_ordinal_index ]

In [None]:
# (auto)
scalar_columns     = list(sc)  + list(ordinal_columns);     print(' scalar_columns',scalar_columns)
categorical_columns= list(cat) + list(non_ordinal_columns); print('\ncategorical_columns',categorical_columns) 

In [None]:
# sorting order of columns:  scalar first, categorical second
df_no_missing = df_no_missing.loc[ :, scalar_columns+categorical_columns ]

# Categorical columns are stored as strings.
# astype() builds new columns of the new dtype; assigning strings through
# .loc[:,c] would try to write them into the existing (e.g. integer) column.
df_no_missing = df_no_missing.astype( { c: str for c in categorical_columns } )

df_no_missing.head(2)

### Data Cleaning
Typically there is some noise (dirt) in the data which should be cleaned.

##### There are two approaches to clean the data:
1. Statistical Methods, including:
 - Inter Quartile Range (IQR): to detect general univariate outliers
 - z-values: to detect extreme univariate outliers
 - mahalanobis distance test: to detect multivariate outliers


2. Machine-Learning Methods, including:
 - Isolation Forest 
 - Elliptic Envelope
 - Local Outlier Factor
 - One Class SVM 
 

In [None]:
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#                                                         #
#                  Visualizing Outliers                   #
#                        by Boxplot                       # 
#                                                         #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

In [None]:
# define a function called "plot_boxplot"

def plot_boxplot(df,ft):
    """Show a boxplot of column ``ft`` of ``df``, drawn without grid lines."""
    axes = df.boxplot(column=[ft])
    axes.grid(False)
    plt.show()
    
for c in scalar_columns:
    plot_boxplot( df_no_missing, c )

In [None]:
# help
print( scalar_columns )

In [None]:
# defining columns with outliers
outlier_cols = [ 'eating',   'consistency', 'spontaneity', 'strength' ]

In [None]:
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#                                                         #
#                Fixing "Extreme" Outliers                # 
#                     using "z-values"                    #
#                                                         #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

In [None]:
# Option (1) Winsorizing Extreme Outliers

# df_dirty
df_dirty = df_no_missing.copy()

# define a function called winsorize
# which converts extreme outliers to threshhold
# and returns list of winsorized indexes
# z = (x-M) / SD
# +/- 3    

def winsorize(df,ft):
    """Clip the extreme outliers (|z| > 3) of column ``ft``.

    Values with a z-value above +3 are replaced by the largest value of the
    column that lies below  mean + 3*SD;  values with a z-value below -3 are
    replaced by the smallest value that lies above  mean - 3*SD.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is not modified.
    ft : label
        Name of the (numeric) column to winsorize.

    Returns
    -------
    list
        ``[x, ls]``:  ``x`` is the winsorized copy of the column and ``ls``
        holds the row labels of the values that were extreme outliers.
    """
    x  = df[ft].copy()
    M  = x.mean()
    SD = x.std()
    z  = (x-M) / SD

    upper_bound = +3
    lower_bound = -3

    # Boolean masks keep everything aligned on the row labels, so the function
    # works with any index. A constant column (SD = 0) gives no finite z-value,
    # hence no outlier, and is returned unchanged.
    too_high = z > upper_bound
    too_low  = z < lower_bound

    ls = df.index[ too_high | too_low ]

    # the replacement values are looked up only when they are needed
    if too_high.any():
        MAX = x[ x < M + 3*SD ].max()
    if too_low.any():
        MIN = x[ x > M - 3*SD ].min()

    if too_high.any():
        x[too_high] = MAX
    if too_low.any():
        x[too_low]  = MIN

    return [x, ls]


# winsorize every column listed in  outlier_cols  and collect the labels
# of the rows in which a value was clipped

df_no_extreme = df_dirty.copy()

index_list = []
for c in outlier_cols:
    c_clean, ls = winsorize( df_dirty, c )
    index_list.extend( ls )
    df_no_extreme[c] = c_clean

# index_list holds row labels, so the rows are selected with .loc;
# set() keeps a row that is extreme in several columns from appearing twice.
df_extreme = df_no_missing.loc[ sorted(set(index_list)), : ]
print( len(df_no_extreme)/len(df_raw) *100 )
df_no_extreme.head(3)

In [None]:
# Option (2) Removing Extreme Outliers
'''
# df_dirty
df_dirty = df_no_missing.copy()

# define a function called outliers
# which returns a list of index of outliers
# z = (x-M) / SD
# +/- 3    

def outliers(df,ft):
    x  = df[ft]
    M  = x.mean()
    SD = x.std()
    z  = (x-M) / SD
    
    upper_bound = +3
    lower_bound = -3
    
    ls = df.index[  (z > upper_bound) 
                  | (z < lower_bound) ]
    
    return ls



# create a function to store the output indices 
# from multiple columns    
    
index_list = []
for feature in outlier_cols:  
    index_list.extend( outliers(df_dirty,feature) )

# define a function called "remove_extreme_outliers_by_zValues"
# which returns a dataframe without extreme outliers 

def remove_extreme_outliers_by_zValues(df, ls):
    ls = sorted(set(ls))
    df = df.drop(ls)
    return df 

df_extreme    = df_dirty.iloc[ sorted(index_list), : ]
df_no_extreme = remove_extreme_outliers_by_zValues( df_dirty, index_list )
len(df_no_extreme)/len(df_raw) *100

'''  

In [None]:
# visualizing extreme outliers
# The rows are stacked clean-first, outliers-last; only the scalar columns are projected.
df_dirty = pd.concat([df_no_extreme[scalar_columns],df_extreme[scalar_columns]],axis=0)

# one colour per stacked row:  cyan = clean data,  red = extreme outlier
y = ['cyan'] * df_no_extreme.shape[0] + ['red'] * df_extreme.shape[0]

# PCA (fitted once); every principal component is then standardised
x = df_dirty
pca = PCA()

x_pca = scale( pca.fit_transform( x ) )
x_pc1 = x_pca[:,0]
x_pc2 = x_pca[:,1]

# PCA Scatter Plot
fig, ax = plt.subplots( figsize=(10,6) )

scatter = ax.scatter(
      x_pc1
    , x_pc2
    , c    = y
    , s    = 300
    , edgecolors = 'k'
    , alpha      = 0.55
    )

ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('PCA Plot\nClean-Data  VS  Extreme-Outliers')

plt.show()

In [None]:
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#                                                         #
#            Removing "Multivariate" Outliers             # 
#              using "Mahalanobis Distance"               #
#                                                         #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

In [None]:
# df_dirty
#df_dirty = df_no_missing.copy()
df_dirty = df_no_extreme.copy()

# define a function called "MahalanobisDist"
# which returns the Mahalanobis Distance (MD) for each record

def MahalanobisDist(df, verbose=False):
    """Return the Mahalanobis Distance (MD) of every record from the mean record.

    Parameters
    ----------
    df : 2-D array-like (numpy array or DataFrame)
        Numeric data, one record per row and one variable per column.
    verbose : bool
        When true, the intermediate matrices are printed.

    Returns
    -------
    list or None
        One distance per record, in row order. ``None`` is returned, after an
        error message has been printed, when the covariance matrix or its
        inverse is not positive definite.
    """
    # A plain float array makes row access positional for DataFrames as well.
    data = np.asarray(df, dtype=float)

    covariance_matrix = np.cov(data, rowvar=False)
    if not is_pos_def(covariance_matrix):
        print("Error: Covariance Matrix is not positive definite!")
        return None

    inv_covariance_matrix = np.linalg.inv(covariance_matrix)
    if not is_pos_def(inv_covariance_matrix):
        print("Error: Inverse of Covariance Matrix is not positive definite!")
        return None

    # The mean vector is broadcast over the rows instead of being copied per row.
    vars_mean = data.mean(axis=0)
    diff = data - vars_mean

    # md[i] = sqrt( diff[i] . inv_cov . diff[i] ),  computed for all rows at once
    md = list(np.sqrt(np.einsum('ij,jk,ik->i', diff, inv_covariance_matrix, diff)))

    if verbose:
        print("Covariance Matrix:\n {}\n".format(covariance_matrix))
        print("Inverse of Covariance Matrix:\n {}\n".format(inv_covariance_matrix))
        print("Variables Mean Vector:\n {}\n".format(vars_mean))
        print("Variables - Variables Mean Vector:\n {}\n".format(diff))
        print("Mahalanobis Distance:\n {}\n".format(md))
    return md

def is_pos_def(A):
    """Return True when the matrix ``A`` is symmetric and positive definite."""
    if not np.allclose(A, A.T):
        return False
    try:
        # the Cholesky factorisation exists only for positive definite matrices
        np.linalg.cholesky(A)
    except np.linalg.LinAlgError:
        return False
    return True

# finding mahalanobis distance for each record
md = MahalanobisDist(df_dirty[scalar_columns].to_numpy(), verbose=0)


# defining a function called "multivar_outliers" 
# which returns list of indexes of multivariate outliers 

def multivar_outliers(df,md):
    """Return the row labels of ``df`` whose distance is a multivariate outlier.

    A record is an outlier when its distance exceeds  mean(md) + 3*std(md).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supplies the row labels.
    md : sequence of float
        One distance per row of ``df``, in row order.

    Returns
    -------
    pandas.Index
        Labels of the outlying rows.

    Raises
    ------
    ValueError
        If ``md`` is None, i.e. no distances could be computed.
    """
    if md is None:
        raise ValueError('md is None: the distances could not be computed')

    # explicit array, so that the comparison below is element-wise by construction
    distances = np.asarray(md, dtype=float)

    M  = np.mean(distances)
    SD = np.std(distances)
    threshhold = M + 3.0*SD
    ls = df.index[  (distances > threshhold)  ]
    return ls
 
index_list = multivar_outliers( df_dirty[scalar_columns], md )


# define a function called "remove"
# which removes multivariate outliers

def remove_multivar_outliers(df, ls):
    """Return ``df`` without the rows whose labels are listed in ``ls``.

    Labels that occur more than once in ``ls`` are dropped only once.
    """
    unique_labels = sorted(set(ls))
    return df.drop(unique_labels)

df_no_multivar = remove_multivar_outliers( df_dirty, index_list ) 
df_multivar    = df_dirty.drop( df_no_multivar.index ) 
len(df_no_multivar)/len(df_raw) *100

In [None]:
# visualizing multivariate outliers
# The rows are stacked clean-first, outliers-last; only the scalar columns are projected.
df_dirty = pd.concat([df_no_multivar[scalar_columns],df_multivar[scalar_columns]],axis=0)

# one colour per stacked row:  cyan = clean data,  red = multivariate outlier
y = ['cyan'] * df_no_multivar.shape[0] + ['red'] * df_multivar.shape[0]

# PCA (fitted once); every principal component is then standardised
x = df_dirty
pca = PCA()

x_pca = scale( pca.fit_transform( x ) )
x_pc1 = x_pca[:,0]
x_pc2 = x_pca[:,1]

# PCA Scatter Plot
# Clean VS Multivar
fig, ax = plt.subplots( figsize=(10,6) )

scatter = ax.scatter(
      x_pc1
    , x_pc2
    , c    = y
    , s    = 300
    , edgecolors = 'k'
    , alpha      = 0.55
    )

ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('PCA Plot\nClean-Data  VS  Multivariate-Outliers')

plt.show()

In [None]:
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#                                                         #
#       Cleaning scalar data using Isolation-Forest       # 
#                                                         #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

In [None]:
#df_dirty = df_no_missing.copy()
#df_dirty = df_no_extreme.copy()
df_dirty = df_no_multivar.copy()


# define a function called Clean_by_IForest
# to clean 'scalar columns' using 'Isolation Forest'

def remove_outliers_by_IForest( df, sc_cols, c, random_state=0 ):
    """Return the rows of ``df`` that an Isolation Forest labels as inliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is not modified.
    sc_cols : list
        Labels of the scalar columns the forest is fitted on.
    c : float
        Value passed to the forest as its ``contamination`` parameter.
    random_state : int or None
        Seed of the forest. The fixed default makes the set of removed rows
        the same on every run; pass None for a different forest on every call.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (all columns) predicted as +1 by the forest.
    """
    sc_data = df[ sc_cols ].values
    IForest = IsolationForest( contamination=c, random_state=random_state )
    IForest.fit( sc_data )
    flag_clean = IForest.predict( sc_data ) == +1
    return df.loc[ flag_clean, : ]

# rows kept by the Isolation Forest, and the rows it rejected
df_no_noise = remove_outliers_by_IForest( df_dirty, scalar_columns, 0.01 )   # 1% contamination considered
df_noise = df_dirty.drop( df_no_noise.index )   

# percentage of the raw records that is still kept
len(df_no_noise)/len(df_raw) *100

In [None]:
# visualizing noise
# The rows are stacked clean-first, noise-last; only the scalar columns are projected.
df_dirty = pd.concat([df_no_noise[scalar_columns],df_noise[scalar_columns]],axis=0)

# one colour per stacked row:  cyan = clean data,  red = noise
y = ['cyan'] * df_no_noise.shape[0] + ['red'] * df_noise.shape[0]

# PCA (fitted once); every principal component is then standardised
x = df_dirty
pca = PCA()

x_pca = scale( pca.fit_transform( x ) )
x_pc1 = x_pca[:,0]
x_pc2 = x_pca[:,1]

# PCA Scatter Plot
# Clean VS Noise
fig, ax = plt.subplots( figsize=(10,6) )

scatter = ax.scatter(
      x_pc1
    , x_pc2
    , c    = y
    , s    = 300
    , edgecolors = 'k'
    , alpha      = 0.55
    )

ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('PCA Plot\nClean-Data  VS  Noise')

plt.show()

In [None]:
# Checking Cleaned data at each stage
 
# Outliers    
Outlier_Dict = { 
      'Method'       : [  'z-values'
                        , 'Mahalanobis_Distance'
                        , 'Isolation Forest']
    
    , 'Outlier_Type' : [  'Extreme' 
                        , 'Multivariate'        
                        , 'Noise'           ]
    
    , 'Clean'        : [  len(df_no_extreme)
                        , len(df_no_multivar)
                        , len(df_no_noise) ]
    
    , 'Outlier'      : [  len(df_extreme)
                        , len(df_multivar)
                        , len(df_noise) ]
    
    , 'Clean/No_Missing (%)' : [  len(df_no_extreme)/len(df_no_missing)  *100
                                , len(df_no_multivar)/len(df_no_missing) *100
                                , len(df_no_noise)/len(df_no_missing)    *100 ] }

Outlier = pd.DataFrame( Outlier_Dict ) 
Outlier

### df_clean

In [None]:
df_clean = df_no_noise.copy()
df_clean.head( 3 )

In [None]:
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#                                                         #
#             Inspecting the categorical data             # 
#                                                         #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

In [None]:
# define a function callled plot_bars
# to show frequency of each item in each category

def plot_bars(df, ft):
    """Draw one bar chart per column, showing how often each of its values occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns.
    ft : list
        Labels of the columns to plot; one figure is shown per label.
    """
    import random

    # the palette does not depend on the column, so it is built only once
    all_colors = list( plt.cm.colors.cnames.keys() )

    for c in ft:
        # Prepare Data
        S = df.groupby( c ).size().reset_index( name='counts' )
        S = S.sort_values('counts',ascending=False)
        S[c]=S[c].astype('str')
        n = len( S[c].unique() )
        # A private generator with the same seed picks the same colours as
        # before, without resetting the state of the global random module.
        colors = random.Random(1000).choices( all_colors, k=n )

        # Plot Bars
        plt.figure(figsize=(10,2), dpi= 80)
        plt.bar( S[c], S['counts'], color=colors, width=.5 )
        for i, val in enumerate( S['counts'].values ):
            plt.text(  i, val, float(val)
                     , horizontalalignment='center', verticalalignment='bottom'
                     , fontdict={'fontweight':500, 'size':16}
                    )

        # Decoration
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.title( c, fontsize=18)
        plt.ylabel( 'counts', fontsize=16 )
        plt.show()
            
plot_bars( df_clean, categorical_columns+ordinal_columns )

In [None]:
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#                                                         #
#                    Downsampling Data                    #
#                                                         #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

In [None]:
# choosing a column to use its sub-categories to downsample
chosen_column = 'cervix'

In [None]:
# define a function callled count_cats
# to show frequency of each item in each category

def count_cats(df, ft):
    """Print, for every column in ``ft``, its sorted unique values, its length
    and the number of rows holding each unique value."""
    for c in ft:
        S = df[c]
        levels = sorted(S.unique())
        print( '\n', c, levels, len(S) )
        for x in levels:
            n_rows = len( S[S==x] )
            print( '  ', x, ' ', n_rows)
            
count_cats( df_clean, [chosen_column] )
print('\n"'+chosen_column+'" has',len(df_clean[chosen_column].unique()),'sub-cats')

In [None]:
# Using The Estimated Feature to Downsample

c = df_clean[ chosen_column ]
d = sorted( c.unique() )

df_0 = df_clean[ c==d[0] ]  # 143 recordS
df_1 = df_clean[ c==d[1] ]  # 123 recordS 
#df_2 = df_clean[ c==d[2] ]  # 146 recordS 
#df_3 = df_clean[ c==d[3] ]  # 131 recordS   

print(  len(df_0)
      , len(df_1)
      #, len(df_2)
      #, len(df_3)  
     ) 

In [None]:
# down_sampling  (resize)
# NOTE(review): n_samples is set to the full size of each class and
# replace=False, so every row is kept and the rows are only shuffled; no class
# is actually reduced here. Lower n_samples (e.g. to the size of the smallest
# class) if the classes are meant to be balanced -- confirm the intent.
df_0_downsampled = resample( df_0, replace=False, n_samples=len(df_0), random_state=0 )
df_1_downsampled = resample( df_1, replace=False, n_samples=len(df_1), random_state=0 )
#df_2_downsampled = resample( df_2, replace=False, n_samples=len(df_2), random_state=0 )
#df_3_downsampled = resample( df_3, replace=False, n_samples=len(df_3), random_state=0 ) 

# size of every resampled class
print(  len(df_0_downsampled)
      , len(df_1_downsampled)
      #, len(df_2_downsampled)
      #, len(df_3_downsampled)  
     ) 

In [None]:
# merging down_sampled datasets
df_sample = pd.concat( [  df_0_downsampled     
                        , df_1_downsampled     
                        #, df_2_downsampled     
                        #, df_3_downsampled  
                       ] )
len( df_sample )  # 71

In [None]:
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#                                                         #
#                   Formatting the Data                   #
#                                                         #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

In [None]:
# step 1:  defining x & y

x = df_sample.copy() 

In [None]:
# step 2: One-Hot Encoding of categorical data on x

# every categorical column is one-hot encoded
encode_columns = [ ft for ft in categorical_columns ]

x_encoded = pd.get_dummies( x, columns=encode_columns )
x_encoded.head(3)

In [None]:
# step 3: training and testing sets  

In [None]:
# step 4: scaling x
x_scaled = scale( x_encoded ) 

# KMeans Clusters
#####   

In [None]:
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#                                                         #
#              Building Preliminary Clusters              # 
#                                                         #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

In [None]:
# number of clusters
nClsr = 3

# clusterers
#   init='random'  : random starting centroids (an array holding a guess also works)
#   n_init=200     : number of restarts
#   random_state=0 : fixed seed, so that the clusters are the same on every run
kmean   =          KMeans( n_clusters=nClsr,  init='random',  n_init=200,  random_state=0  )
mbkmean = MiniBatchKMeans( n_clusters=nClsr,  init='random',  n_init=200,  random_state=0  )

clsr = kmean


clsr.fit( x_scaled )

clsr_centroids = clsr.cluster_centers_
clsr_ss        = silhouette_score( x_scaled, clsr.labels_ )  # between -1 and +1; the larger the better
clsr_ss

In [None]:
# visualizing
y_predict = clsr.predict( x_scaled )

pca = PCA()

# the first two principal components, each standardised
x_pca        = pca.fit_transform( x_scaled )
x_pca_scaled = scale( x_pca[:, :2] )

# pc1 pc2
x_pc1_scaled = x_pca_scaled[:,0]
x_pc2_scaled = x_pca_scaled[:,1]

# PCA Scatter Plot
fig, ax = plt.subplots( figsize=(10,6) )

scatter = ax.scatter(
      x_pc1_scaled
    , x_pc2_scaled
    , cmap = 'rainbow_r'
    , c    = y_predict
    , s    = 300
    , edgecolors = 'k'
    , alpha      = 0.55
    )

ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('PCA Plot')

legend = ax.legend( scatter.legend_elements()[0],
                    scatter.legend_elements()[1],
                    loc='best')


### Plotting Center
# The centroids are projected with the PCA fitted on the data above and are
# standardised with the mean / standard deviation of the data's own components,
# so they land in exactly the coordinate system of the plotted points.
# (Fitting a second PCA on "centroids + data" gives slightly different axes.)
pc_mean = x_pca[:, :2].mean( axis=0 )
pc_std  = x_pca[:, :2].std( axis=0 )

cc_pca_scaled = ( pca.transform( clsr_centroids )[:, :2] - pc_mean ) / pc_std

cc_pc1_scaled = cc_pca_scaled[:,0]
cc_pc2_scaled = cc_pca_scaled[:,1]

# single fixed colour, hence no colour map
scatter = ax.scatter(
      cc_pc1_scaled
    , cc_pc2_scaled
    , c    = 'k'
    , s    = 350
    , edgecolors = 'k'
    , alpha      = 1.00
    , label      = 'centeroid'
    )

plt.show()

In [None]:
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#                                                         #
#               Preparing to make predictions             #
#                                                         #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

In [None]:
# Acceptable Range
print('\n Acceptable Range ... \n')

# Range of scalar_columns
Range = pd.concat([df_clean[scalar_columns].min(),df_clean[scalar_columns].max()],axis=1)
Range.columns = ['Min','Max']
print( np.round( Range, 2 ), '\n' )

# Range of categorical_columns
for col in categorical_columns:
    items = sorted( df_clean[ col ].unique() )
    print( col, '   ', items )

#####   
## The Estimator
##### Input data. Note the range.

In [None]:
# predicted item
df_no_missing.loc[0,:]

In [None]:
# input data                #  Range   
eating         =   8        #  8   15
Hygine         =   3        #  3   15
aggregation    =   5        #  2   10
commitment     =  10        #  6   15
consistency    =   5        #  4   10
spontaneity    =   7        #  5   10
Person         =   3        #  1    5
fulfill        =   9        #  3   15
vulnerability  =   9        #  3   15
severity       =   6        #  2   10
strength       =  10        #  4   15
willingness    =   9        #  3   15
emotionality   =   9        #  3   15
appreciation   =   6        #  2   10
instrumental   =   9        #  3   15
knowledge      =   9        #  3   15
abilities      =   9        #  3   15
desires        =   9        #  3   15 

cervix         =  '0'       # '0'  '1'

#####   
## The Estimation
#####  ( auto )

In [None]:
# help
x.columns 

In [None]:
# s  written in the same order as  x
s = [  eating, Hygine, aggregation, commitment, consistency,
       spontaneity, Person, fulfill, vulnerability, severity,
       strength, willingness, emotionality, appreciation,
       instrumental, knowledge, abilities, desires, cervix ]
 
# define a function called predicted_class
# which returns a string of the predicted class
def predicted_class( s, x, encode_columns ):
    """Return the cluster that the fitted clusterer ``clsr`` assigns to record ``s``.

    Parameters
    ----------
    s : list
        Values of the record, in the same order as the columns of ``x``.
    x : pandas.DataFrame
        The data the record is encoded and scaled together with.
    encode_columns : list
        Labels of the columns to one-hot encode.

    Returns
    -------
    The first element of ``clsr.predict(...)`` for the record.

    Notes
    -----
    ``clsr`` is read from the enclosing (notebook) scope.
    NOTE(review): the result is only meaningful when the encoded columns match
    the columns ``clsr`` was fitted on -- confirm against the fitting cell.
    """
    # two copies of the record are put on top of x ...
    ss = pd.DataFrame( [s.copy(),s.copy()], columns=x.columns )
    sx = pd.concat( [ss,x], axis=0 )
    # ... and encoded together with x, so the record gets the same dummy columns
    sx_encoded = pd.get_dummies(
          sx
        , columns = encode_columns
        )
    # the first copy is dropped: the scaling is computed on "record + x"
    sx_scaled = scale(  sx_encoded.iloc[1:,:]  )
    # row 0 is the record, row 1 is the first row of x
    sx_scaled = sx_scaled[[0,1]]
    # only the prediction for the record (row 0) is kept
    Estimation = clsr.predict( sx_scaled )[0] 
    
    return Estimation    

pred = predicted_class( s, x, categorical_columns )
print('Estimation:  Cluster', pred ) 

#####   
## The Visualization
#####  ( auto )

In [None]:
# visualizing
y_predict = clsr.predict( x_scaled ) 

pca = PCA() 

x_pca        = pca.fit_transform( x_scaled )
x_pca_scaled = scale( 
    np.column_stack(( 
          x_pca[:,0]
        , x_pca[:,1] 
        )))
 
# pc1 pc2
x_pc1_scaled = x_pca_scaled[:,0] 
x_pc2_scaled = x_pca_scaled[:,1]  

# PCA Scatter Plot 
fig, ax = plt.subplots( figsize=(10,6) )    

scatter = ax.scatter( 
      x_pc1_scaled
    , x_pc2_scaled 
    , cmap = 'rainbow_r'
    , c    = y_predict
    , s    = 300
    , edgecolors = 'k'
    , alpha      = 0.55 
    )
 
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
# the builtin str() is used: the alias np.str no longer exists in recent NumPy
ax.set_title('PCA Plot\nEstimation = '+str(pred))

legend = ax.legend( scatter.legend_elements()[0],
                    scatter.legend_elements()[1],
                    loc='best' ) 


### Plotting Estimation
# The input record is encoded together with  x  (so that it gets the same dummy
# columns) and scaled on "record + x", as is done for the prediction.
ss = pd.DataFrame( [s.copy(),s.copy()], columns=x.columns )
sx = pd.concat( [ss,x], axis=0 )
sx_encoded = pd.get_dummies(
      sx
    , columns = encode_columns
    )
sx_scaled = scale(  sx_encoded.iloc[1:,:]  )

# The record (row 0) is projected with the PCA fitted on  x_scaled  above and
# standardised with the mean / standard deviation of the data's own components,
# so it lands in exactly the coordinate system of the plotted points.
# (Fitting a second PCA on "record + data" gives slightly different axes.)
pc_mean = x_pca[:, :2].mean( axis=0 )
pc_std  = x_pca[:, :2].std( axis=0 )

ss_pca_scaled = ( pca.transform( sx_scaled[[0]] )[:, :2] - pc_mean ) / pc_std

ss_pc1_scaled = ss_pca_scaled[0,0]
ss_pc2_scaled = ss_pca_scaled[0,1]

# single fixed colour, hence no colour map
scatter = ax.scatter(
      ss_pc1_scaled
    , ss_pc2_scaled
    , c    = 'white'
    , s    = 500
    , edgecolors = 'k'
    , alpha      = 0.99
    )
 
plt.show()