In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [None]:
!pip install pyclustering

In [None]:
from pyclustering.cluster.kmedians import kmedians
from pyclustering.cluster import cluster_visualizer_multidim

In [None]:
# Colab only: mount Google Drive under /content/drive/ so that the project
# folder used by the rest of the notebook is reachable.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Project root on the mounted Drive; datasets are read from and plots are
# written to sub-folders of this path.
folder_location = '/content/drive/MyDrive/ML_A3'

## Loading the data


In [None]:
# Read the full-population dataset from the project's Datasets folder.
population_csv_path = folder_location + '/Datasets/' + 'population.csv'
df = pd.read_csv(population_csv_path)

## Understanding the Data

In [None]:
# First look at the raw data: leading rows, schema and summary statistics.
print(df.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None".
df.info()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.describe())

## Step 1 : Preprocessing


### Replacing missing values ('?') by NaN

In [None]:
# Missing entries are encoded as a question mark. Both the bare marker and the
# variant with a leading space are mapped to NaN so neither spelling survives.
df = df.replace(to_replace = [' ?', '?'], value = np.nan)

In [None]:
# Sanity check: number of question-mark markers still present in each column
# (expected to be zero everywhere after the replacement).
print(df.isin([' ?', '?']).sum())

###  Check for Null values in columns

In [None]:
# Percentage of missing entries in every column.
null_counts = df.isna().sum().div(df.shape[0]).mul(100)

In [None]:
# Horizontal bars: the column names run along the y-axis and the percentages
# along the x-axis. The labels are set on the Axes explicitly so they land on
# the intended axis regardless of how the installed pandas version maps the
# xlabel/ylabel keywords for kind='barh'.
ax = null_counts.plot(kind = 'barh', title = 'Percentage of Null values in columns', figsize = (12, 12), fontsize = 8)
ax.set_xlabel('Percentage of null values')
ax.set_ylabel('Columns')
plt.savefig(folder_location + '/Plots/' + 'Percentage_Nulls.jpeg')

### Removing columns with 40% or more null values

In [None]:
# Columns in which at least 40% of the entries are missing are discarded.
null_columns = null_counts[null_counts >= 40].index.tolist()
preprocessed_df = df.drop(columns = null_columns)

In [None]:
# info() prints its own report and returns None, so it is not wrapped in print().
preprocessed_df.info()

## Step 2 : Feature Analysis

### Finding Numerical and Categorical Columns

In [None]:
# Numerical attributes as listed in Data_Description.csv.
given_numerical_columns = ['AAGE', 'AHRSPAY', 'CAPGAIN', 'CAPLOSS', 'DIVVAL', 'WKSWORK']
# Only the numerical attributes that survived the null-column filter are kept;
# every other remaining column is treated as categorical.
numerical_columns = [name for name in given_numerical_columns if name in preprocessed_df.columns]
categorical_columns = [name for name in preprocessed_df.columns if name not in numerical_columns]

In [None]:
# Show which columns ended up in each group.
print("Numerical Columns:", numerical_columns)
print("Categorical Columns:", categorical_columns)

### Segregating Numerical and Categorical Data

In [None]:
# Force object dtype on the categorical columns, then split the frame into its
# numerical and categorical parts.
preprocessed_df[categorical_columns] = preprocessed_df[categorical_columns].astype('object')
numerical_df = preprocessed_df[numerical_columns]
categorical_df = preprocessed_df[categorical_columns]

In [None]:
# info() prints its own report and returns None, so it is not wrapped in print().
numerical_df.info()
categorical_df.info()

### Plotting Numerical and Categorical Data Values

In [None]:
def plot_columns_values(df, type_folder, suffix = '') :
    """Save one value-frequency bar chart per column of ``df``.

    Every chart is written to
    ``folder_location + suffix + '/Plots/' + type_folder + '/<title>.jpeg'``,
    where ``<title>`` is the plot title built from the column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted one at a time.
    type_folder : str
        Sub-folder of ``Plots`` that receives the images.
    suffix : str, optional
        Path fragment inserted between ``folder_location`` and ``/Plots/``.
    """
    for column in df.columns :
        plot_name = 'Values vs Frequencies for Column = ' + column
        # One fresh figure per column: without an explicit ``ax`` a Series plot
        # reuses the current axes when one exists, which stacks the bars of
        # every column on top of the previous ones.
        fig, ax = plt.subplots(figsize = (20, 20))
        df[column].value_counts().plot(kind = 'bar', ax = ax, xlabel = 'Values', ylabel = 'Frequencies', title = plot_name, fontsize = 12)
        fig.savefig(folder_location + suffix + '/Plots/' + type_folder + '/' + plot_name + '.jpeg', bbox_inches = 'tight', dpi = 100)
        plt.show()
        # Release the figure so a frame with many columns does not keep all of
        # them in memory.
        plt.close(fig)

In [None]:
# Write the value-frequency charts for both groups into their own sub-folders.
plot_columns_values(numerical_df, 'Numerical Data')
plot_columns_values(categorical_df, 'Categorical Data')

### Dropping features having most of the data in only one value

In [None]:
def get_unbalanced_columns(df) :
    """Return the names of the columns of ``df`` dominated by a single value.

    A column is reported when either

    * its most frequent value covers at least 70% of the rows, or
    * its most frequent value occurs at least 1000 times and the second most
      frequent value occurs at most one third as often.

    Columns without any non-null value have no value counts and are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in the column order of ``df``.
    """
    unbalanced_columns = []
    single_column_limit = 0.7 * df.shape[0]
    for c in df.columns :
        # value_counts() already orders the counts from most to least frequent,
        # so vals[0] is the top count and vals[1] the runner-up.
        vals = df[c].value_counts().values
        if vals.shape[0] == 0 :
            continue
        if vals[0] >= single_column_limit :
            unbalanced_columns.append(c)
        elif vals.shape[0] > 1 and vals[0] >= 1000 and vals[1] <= vals[0] / 3 :
            unbalanced_columns.append(c)
    return unbalanced_columns

In [None]:
# Remove the columns that are dominated by a single value.
unbalanced_columns = get_unbalanced_columns(preprocessed_df)
filtered_df = preprocessed_df.drop(columns = unbalanced_columns)

In [None]:
# info() prints its own report and returns None, so it is not wrapped in print().
filtered_df.info()

## Step 3 : Imputation, Bucketization, One-Hot Encoding, Feature Transformation

### Calculating modes for each column

In [None]:
# Most frequent value(s) of every remaining column; used below for imputation.
column_modes = filtered_df.mode()

In [None]:
# Print the modes with pandas' row/column truncation switched off.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
	print(column_modes)

### Replacing missing Features with respective Modes

In [None]:
# Impute every missing value with the mode of its column.
# The first row of the mode frame holds one mode per column, and fillna() with
# that Series fills each column by name. Assigning the mode frame itself would
# align on the row index and leave NaN in every row the mode frame lacks.
null_columns = filtered_df.columns[filtered_df.isnull().any()]
filtered_df = filtered_df.fillna(column_modes.iloc[0])

In [None]:
# Remaining missing values per column after imputation.
print(filtered_df.isnull().sum())

### Selecting Numerical Columns


In [None]:
# Numerical columns that are still present after the unbalanced-column filter.
cur_numerical_columns = [name for name in filtered_df.columns if name in numerical_columns]

In [None]:
# Show the numerical columns that still need bucketizing.
print(cur_numerical_columns)

### Converting Numerical Values to Categorical

In [None]:
# Bucketize age and weeks worked into labelled ranges. assign() works on a
# copy, so filtered_df keeps its numeric columns.
converted_df = filtered_df.assign(
    AAGE = pd.cut(filtered_df['AAGE'], bins = [0, 12, 18, 60, 200], labels = ['Child', 'Teenager', 'Adult', 'Senior Adult'], include_lowest = True),
    WKSWORK = pd.cut(filtered_df['WKSWORK'], bins = [0, 5, 10, 15, 100], labels = ['Entry', 'Intermediate', 'Mid', 'Senior'], include_lowest = True),
)

In [None]:
# info() prints its own report and returns None, so it is not wrapped in print().
converted_df.info()

### One Hot Encoding

In [None]:
# One-hot encode the frame; the result is what PCA is fitted on below.
one_hot_df = pd.get_dummies(converted_df)

In [None]:
# Preview the encoded frame.
print(one_hot_df.head())

### Fit PCA

In [None]:
def perform_pca(one_hot_df) :
    """Fit a PCA with default settings on ``one_hot_df`` and project the data.

    Returns
    -------
    tuple
        ``(fitted PCA object, projected data as a DataFrame,
        explained variance of every component in percent)``.
    """
    model = PCA()
    projected = pd.DataFrame(model.fit_transform(one_hot_df))
    variance_percentages = model.explained_variance_ratio_ * 100
    return model, projected, variance_percentages

In [None]:
# Fit PCA on the one-hot encoded population data.
pca, pca_df, significance_ratios = perform_pca(one_hot_df)

In [None]:
# Show the projected data (one column per principal component).
print(pca_df)

### Analyze Cumulative Variance vs Number of Components

In [None]:
def plot_pca(ratios, suffix = ''):
    """Draw a bar chart of the variance percentage of each PCA component.

    The chart is saved as
    ``folder_location + suffix + '/Plots/PCA features vs Variance Percentage.jpeg'``.

    Parameters
    ----------
    ratios : sequence of float
        Variance percentage per component, in component order.
    suffix : str, optional
        Path fragment inserted between ``folder_location`` and ``/Plots/``.
    """
    component_positions = range(len(ratios))
    plt.bar(component_positions, ratios)
    plt.title("PCA features vs Variance%")
    plt.xlabel('PCA features')
    plt.ylabel('Variance Percentage')
    plt.xticks(rotation = 90)
    output_path = folder_location + suffix + '/Plots/' + 'PCA features vs Variance Percentage.jpeg'
    plt.savefig(output_path)

In [None]:
# Plot and save the per-component variance percentages for the population data.
plot_pca(significance_ratios)

### Filtering based on pca results

In [None]:
# Keep only the first three principal components for clustering.
final_df = pca_df.iloc[:,:3]

In [None]:
# Preview the reduced data.
print(final_df.head())

## K Median Clustering in range [10, 24]

In [None]:
def calculate_euclidean_distance(p1, p2) :
    """Return the Euclidean distance between two points of equal length.

    Parameters
    ----------
    p1, p2 : sequence of numbers
        Coordinates of the two points (list, tuple, array or pandas Series).

    Returns
    -------
    float
        Square root of the summed squared coordinate differences.

    Raises
    ------
    AssertionError
        If the two points do not have the same number of coordinates.
    """
    # Converting to arrays makes the element access positional, so a pandas
    # Series works no matter what its index labels are.
    a = np.asarray(p1, dtype = float)
    b = np.asarray(p2, dtype = float)
    assert(len(a) == len(b))
    return float(np.sqrt(np.sum((a - b) ** 2)))

In [None]:
def calculate_inertia(clusters, medians, df):
    """Sum the Euclidean distances of all points to the median of their cluster.

    Parameters
    ----------
    clusters : sequence of sequences of int
        ``clusters[i]`` holds the positional row indices of the points that
        belong to cluster ``i``.
    medians : sequence of points
        ``medians[i]`` is the median of ``clusters[i]``.
    df : pandas.DataFrame or 2-D array-like
        The clustered points, one row per point.

    Returns
    -------
    float
        Total point-to-median distance (0 when there are no clusters).
    """
    points = np.asarray(df, dtype = float)
    inertia = 0.0
    for i, members in enumerate(clusters) :
        median = np.asarray(medians[i], dtype = float)
        if len(members) == 0 :
            continue
        # All distances of one cluster are computed in a single array
        # operation instead of one DataFrame lookup per point.
        offsets = points[list(members)] - median
        inertia += float(np.sqrt((offsets ** 2).sum(axis = 1)).sum())
    return inertia

In [None]:
def perform_k_median(k_ranges, df) :
    """Run K-Medians once for every k in ``k_ranges`` and collect the inertias.

    Parameters
    ----------
    k_ranges : iterable of int
        Numbers of clusters to try.
    df : pandas.DataFrame
        Points to cluster, one row per point.

    Returns
    -------
    list of float
        Inertia (see ``calculate_inertia``) of each run, in the order of
        ``k_ranges``.
    """
    points = np.asarray(df, dtype = float)
    sample = points.tolist()
    inertias = []
    for k in k_ranges:
        # Start from k observations picked at random from the data. Uniform
        # draws from [0, 1) ignore where the data lies and place all starting
        # medians in the same unit cube.
        chosen = np.random.choice(points.shape[0], size = k, replace = points.shape[0] < k)
        initial_medians = points[chosen].tolist()
        kmedians_instance = kmedians(sample, initial_medians)
        kmedians_instance.process()
        clusters = kmedians_instance.get_clusters()
        medians = kmedians_instance.get_medians()
        inertias.append(calculate_inertia(clusters, medians, df))
    return inertias

In [None]:
# Evaluate K-Medians for every k from 10 to 24 (inclusive) on the reduced data.
k_ranges = range(10, 24 + 1)
inertias = perform_k_median(k_ranges, final_df)

In [None]:
# to save some time later
# inertias = [117830.61599847802, 99246.66016625229, 103096.05769717509, 101678.41175872329, 91394.05125657705, 91184.7933737051, 79660.64976055258, 87949.72452012134, 87106.01304486161, 71011.72840426826, 69168.96455782162, 73800.78983210154, 77320.13244083677, 73072.83249477061, 67930.15988328044]

In [None]:
# Inertia obtained for each k, in the order of k_ranges.
print(inertias)

### Plotting the inertia curve to look for an elbow

In [None]:
def plot_inertias(k_ranges, inertias, suffix = '') :
    """Plot inertia against the number of clusters and save the figure.

    The chart is saved as
    ``folder_location + suffix + '/Plots/K vs Inertia.jpeg'``.

    Parameters
    ----------
    k_ranges : sequence of int
        Cluster counts, used as x values and as x ticks.
    inertias : sequence of float
        Inertia for each entry of ``k_ranges``.
    suffix : str, optional
        Path fragment inserted between ``folder_location`` and ``/Plots/``.
    """
    plt.plot(k_ranges, inertias, '-o')
    plt.xticks(k_ranges)
    plt.title("K vs Inertia")
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Inertia')
    output_path = folder_location + suffix + '/Plots/' + 'K vs Inertia.jpeg'
    plt.savefig(output_path)

In [None]:
# Plot and save the k-versus-inertia curve for the population data.
plot_inertias(k_ranges, inertias)

### K-median clustering on chosen k

In [None]:
# Credits: https://pyclustering.github.io/docs/0.9.0/html/df/d68/classpyclustering_1_1cluster_1_1kmedians_1_1kmedians.html#:~:text=%23%20Visualize%20clustering%20results.
def perform_clustering(k, df):
    """Cluster ``df`` into ``k`` groups with K-Medians and visualise the result.

    Parameters
    ----------
    k : int
        Number of clusters.
    df : pandas.DataFrame
        Points to cluster, one row per point.

    Returns
    -------
    list of list of int
        For every cluster, the positional row indices of its members.
    """
    sample = df.values.tolist()
    # Start from k observations picked at random from the data. Uniform draws
    # from [0, 1) ignore where the data lies and place all starting medians in
    # the same unit cube.
    chosen = np.random.choice(len(sample), size = k, replace = len(sample) < k)
    initial_medians = [sample[i] for i in chosen]
    kmedians_instance = kmedians(sample, initial_medians)
    kmedians_instance.process()
    clusters = kmedians_instance.get_clusters()
    medians = kmedians_instance.get_medians()
    visualizer = cluster_visualizer_multidim()
    visualizer.append_clusters(clusters, sample)
    # The medians are drawn as one extra group of larger triangle markers.
    visualizer.append_cluster(medians, marker = '^', markersize = 10)
    visualizer.show()
    return clusters

In [None]:
# Cluster and visualise only the first 1000 rows, with k = 20.
k_median_clusters = perform_clustering(20, final_df.iloc[:1000,:])

# Performing the above operations on the above-50K population

In [None]:
# Path fragment appended to folder_location so the plots of this subset go to
# their own folder.
above_50k_location = '/Above 50k Population'

In [None]:
# Read the above-50K subset from the project's Datasets folder.
more_than_50k_csv_path = folder_location + '/Datasets/' + 'more_than_50k.csv'
df_50k = pd.read_csv(more_than_50k_csv_path)

In [None]:
# First look at the subset: leading rows, schema and summary statistics.
print(df_50k.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None".
df_50k.info()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df_50k.describe())

In [None]:
# Map the question-mark marker, with or without a leading space, to NaN.
df_50k = df_50k.replace(to_replace = [' ?', '?'], value = np.nan)

In [None]:
# Sanity check: number of question-mark markers still present in each column
# (expected to be zero everywhere after the replacement).
print(df_50k.isin([' ?', '?']).sum())

In [None]:
# Percentage of missing entries in every column of the subset.
null_counts_50k = df_50k.isna().sum().div(df_50k.shape[0]).mul(100)

In [None]:
# Horizontal bars: the column names run along the y-axis and the percentages
# along the x-axis. The labels are set on the Axes explicitly so they land on
# the intended axis regardless of how the installed pandas version maps the
# xlabel/ylabel keywords for kind='barh'.
ax_50k = null_counts_50k.plot(kind = 'barh', title = 'Percentage of Null values in columns', figsize = (12, 12), fontsize = 8)
ax_50k.set_xlabel('Percentage of null values')
ax_50k.set_ylabel('Columns')
plt.savefig(folder_location + above_50k_location + '/Plots/' + 'Percentage_Nulls.jpeg')

In [None]:
# Columns in which at least 40% of the entries are missing are discarded.
null_columns_50k = null_counts_50k[null_counts_50k >= 40].index.tolist()
preprocessed_df_50k = df_50k.drop(columns = null_columns_50k)

In [None]:
# info() prints its own report and returns None, so it is not wrapped in print().
preprocessed_df_50k.info()

In [None]:
# Numerical attributes that survived the null-column filter of the subset;
# every other remaining column is treated as categorical.
numerical_columns_50k = [name for name in given_numerical_columns if name in preprocessed_df_50k.columns]
categorical_columns_50k = [name for name in preprocessed_df_50k.columns if name not in numerical_columns_50k]

In [None]:
# Show which columns of the subset ended up in each group.
print("Numerical Columns:", numerical_columns_50k)
print("Categorical Columns:", categorical_columns_50k)

In [None]:
# Cast the categorical columns of the subset itself to object dtype. The values
# must come from preprocessed_df_50k: taking them from the full-population
# frame would write another dataset's rows into the subset.
preprocessed_df_50k[categorical_columns_50k] = preprocessed_df_50k[categorical_columns_50k].astype('object')
numerical_df_50k = preprocessed_df_50k[numerical_columns_50k]
categorical_df_50k = preprocessed_df_50k[categorical_columns_50k]

In [None]:
# info() prints its own report and returns None, so it is not wrapped in print().
numerical_df_50k.info()
categorical_df_50k.info()

In [None]:
# Write the value-frequency charts of the subset below its own plots folder.
plot_columns_values(numerical_df_50k, 'Numerical Data', above_50k_location)
plot_columns_values(categorical_df_50k, 'Categorical Data', above_50k_location)

In [None]:
# Remove the columns of the subset that are dominated by a single value.
unbalanced_columns_50k = get_unbalanced_columns(preprocessed_df_50k)
filtered_df_50k = preprocessed_df_50k.drop(columns = unbalanced_columns_50k)

In [None]:
# info() prints its own report and returns None, so it is not wrapped in print().
filtered_df_50k.info()

In [None]:
# Impute every missing value with the mode of its column, computed on this
# subset so that every column of filtered_df_50k is covered.
# The first row of the mode frame holds one mode per column, and fillna() with
# that Series fills each column by name. Assigning a mode frame directly would
# align on the row index and leave NaN in every row the mode frame lacks.
null_columns_50k = filtered_df_50k.columns[filtered_df_50k.isnull().any()]
column_modes_50k = filtered_df_50k.mode()
filtered_df_50k = filtered_df_50k.fillna(column_modes_50k.iloc[0])

In [None]:
# Remaining missing values per column after imputation.
print(filtered_df_50k.isnull().sum())

In [None]:
# Numerical columns of the subset still present after the unbalanced-column filter.
cur_numerical_columns_50k = [name for name in filtered_df_50k.columns if name in numerical_columns_50k]

In [None]:
# Show the numerical columns of the subset that still need bucketizing.
print(cur_numerical_columns_50k)

In [None]:
# Bucketize age into labelled ranges. assign() works on a copy, so
# filtered_df_50k keeps its numeric column.
converted_df_50k = filtered_df_50k.assign(
    AAGE = pd.cut(filtered_df_50k['AAGE'], bins = [0, 12, 18, 60, 200], labels = ['Child', 'Teenager', 'Adult', 'Senior Adult'], include_lowest = True),
)

In [None]:
# info() prints its own report and returns None, so it is not wrapped in print().
converted_df_50k.info()

In [None]:
# One-hot encode the subset; the result is what PCA is fitted on below.
one_hot_df_50k = pd.get_dummies(converted_df_50k)

In [None]:
# Preview the encoded subset.
print(one_hot_df_50k.head())

In [None]:
# Fit a separate PCA on the one-hot encoded subset.
pca_50k, pca_df_50k, significance_ratios_50k = perform_pca(one_hot_df_50k)

In [None]:
# Show the projected subset (one column per principal component).
print(pca_df_50k)

In [None]:
# Plot and save the per-component variance percentages below the subset's folder.
plot_pca(significance_ratios_50k, above_50k_location)

In [None]:
# Keep only the first three principal components for clustering.
final_df_50k = pca_df_50k.iloc[:,:3]

In [None]:
# Preview the reduced subset.
print(final_df_50k.head())

In [None]:
# Same sweep over k_ranges as for the full population, on the reduced subset.
inertias_50k = perform_k_median(k_ranges, final_df_50k)

In [None]:
# to save time later
# inertias_50k = [2232.2624452265068, 2036.7833628630062, 1713.434374458752, 1620.5378952143776, 1536.7771307837518, 1498.172682238264, 1671.9770938588897, 1648.6562163968747, 1310.3376223683824, 1545.56109540825, 1509.6517147092775, 1427.047253780836, 1407.7818914761235, 1068.8986607834693, 1236.8617624407639]

In [None]:
# Inertia obtained for each k on the subset, in the order of k_ranges.
print(inertias_50k)

In [None]:
# Plot and save the k-versus-inertia curve below the subset's folder.
plot_inertias(k_ranges, inertias_50k, above_50k_location)

In [None]:
# Cluster and visualise only the first 1000 rows of the subset, with k = 20.
k_median_clusters_50k = perform_clustering(20, final_df_50k.iloc[:1000,:])

## 6.2, 3, 4

In [None]:
# Map the full set of PCA scores back through the fitted population PCA and
# print the reconstructed values.
pca_df_inverse = pca.inverse_transform(pca_df)
print(pca_df_inverse)

In [None]:
# Map the full set of PCA scores of the subset back through its own fitted PCA
# and print the reconstructed values.
pca_df_inverse_50k = pca_50k.inverse_transform(pca_df_50k)
print(pca_df_inverse_50k)