In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re as re
from datetime import date
from pandas_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler

In [None]:
def category_crosstab(category, target, dataframe):
    """Return the distribution of ``target`` within each level of ``category``.

    The result has one row per category level and one column per target
    value; each row sums to 1. Rows are ordered by the share of target
    value ``0``, largest first.
    """
    shares = pd.crosstab(index=dataframe[target],
                         columns=dataframe[category],
                         normalize='columns')
    per_category = shares.T
    return per_category.sort_values(by=0, ascending=False)

def kmeans_clusterer(category, target, dataframe, validation_dataframe, k_clusters = 0, colors = 'rainbow'):
    """Group the levels of a categorical column with k-means and store the group id.

    Every level of ``category`` is summarised by three features: the sum of
    ``target`` over its rows, its row count, and the share of its rows with
    ``target == 1`` (taken from ``category_crosstab``). The features are
    standardised, k-means is run over a range of ``k`` to draw an elbow curve,
    and the labels of the final model are written to a new
    ``'<category> - Clustered'`` column of both ``dataframe`` and
    ``validation_dataframe`` (both are modified in place).

    Parameters
    ----------
    category : str
        Name of the categorical column whose levels are clustered.
    target : str
        Name of the target column. Crosstab columns ``0`` and ``1`` are read,
        so the target is expected to be coded 0/1.
    dataframe : pandas.DataFrame
        Data used to build the per-level features; receives the new column.
    validation_dataframe : pandas.DataFrame
        Receives the same new column. Levels that do not occur in
        ``dataframe`` get NaN.
    k_clusters : int, default 0
        Number of clusters. ``0`` selects the elbow found by ``KneeLocator``.
    colors : str, default 'rainbow'
        Colormap / palette name used in the plots.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted final model.

    Raises
    ------
    ValueError
        If ``category`` has fewer than two levels, or if ``k_clusters`` is 0
        and no elbow can be located.
    """
    import matplotlib.pyplot as plt
    from kneed import KneeLocator
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    count_column = f'{target}_count'

    # Work on a copy: the helper columns added below must not be written onto
    # a slice of the caller's frame.
    cluster_data = dataframe[[category, target]].copy()

    conditional_probability = category_crosstab(category, target, cluster_data)
    # Share of rows with target == 1 for the level each row belongs to.
    cluster_data['conditional_probability'] = cluster_data[category].map(conditional_probability[1])
    cluster_data[count_column] = cluster_data[target]

    clustering_data = (cluster_data
                       .groupby(category)
                       .agg({target: 'sum',
                             count_column: 'count',
                             'conditional_probability': 'first'})
                       .sort_values(by=count_column))

    scaled_features = StandardScaler().fit_transform(clustering_data)

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 1500,
        "random_state": 42,
    }

    # k runs from 1 up to (number of levels - 1), capped at 10.
    max_number_clusters = min(len(clustering_data), 11)
    candidate_ks = range(1, max_number_clusters)
    if len(candidate_ks) == 0:
        raise ValueError("'%s' needs at least two levels to be clustered" % category)

    # SSE (inertia) for each candidate k
    sse = []
    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    # Define the number of clusters: elbow point unless given explicitly
    if k_clusters == 0:
        number_cluster = KneeLocator(candidate_ks,
                                     sse,
                                     curve="convex",
                                     direction="decreasing").elbow
        if number_cluster is None:
            raise ValueError("no elbow found for '%s'; pass k_clusters explicitly" % category)
    else:
        number_cluster = k_clusters

    # Run k means
    kmeans = KMeans(n_clusters=number_cluster, **kmeans_kwargs)
    kmeans.fit(scaled_features)

    # assign kmeans labels to each category
    clustering_data['kmeans_cluster'] = kmeans.labels_

    plt.style.use('dark_background')
    fig, (elbow_ax, size_ax, target_ax) = plt.subplots(3, 1, figsize=(10, 20))
    fig.suptitle(category, fontsize=16)

    # first plot - inertia vs number of clusters
    elbow_ax.plot(candidate_ks, sse, marker="D")
    plt.sca(elbow_ax)
    plt.set_cmap(colors)  # also becomes the default colormap of the 3-D scatter below
    elbow_ax.set_title('Inertia vs Number of Clusters')
    elbow_ax.set_xticks(candidate_ks)
    elbow_ax.set_xlabel("Number of Clusters")
    elbow_ax.set_ylabel("SSE")
    # Mark the chosen k only when it lies on the curve (a manual k may not).
    if number_cluster in candidate_ks:
        elbow_ax.plot(number_cluster, sse[number_cluster - 1], 'ro')

    # second plot - number of individuals in class vs conditional probability
    sns.scatterplot(x=count_column,
                    y='conditional_probability',
                    hue='kmeans_cluster',
                    data=clustering_data,
                    palette=colors,
                    ax=size_ax)
    size_ax.set_title('Number of individuals in Category vs Conditional Probability')
    size_ax.set_xlabel('Number of individuals in Category')
    size_ax.set_ylabel('Conditional Probability')

    # third plot - number of individuals in class that hit target vs conditional probability
    sns.scatterplot(x=target,
                    y='conditional_probability',
                    hue='kmeans_cluster',
                    data=clustering_data,
                    palette=colors,
                    ax=target_ax)
    target_ax.set_title('Number of individuals in Category w/ Target vs Conditional Probability')
    target_ax.set_xlabel('Number of individuals in Category w/ Target')
    target_ax.set_ylabel('Conditional Probability')

    # 3d plotting
    fig_3d = plt.figure()
    fig_3d.suptitle(category, fontsize=16)
    ax_3d = plt.axes(projection='3d')
    ax_3d.scatter3D(clustering_data[target],
                    clustering_data[count_column],
                    clustering_data['conditional_probability'],
                    c=clustering_data['kmeans_cluster'])
    # x is the per-level target sum, y is the per-level row count.
    ax_3d.set_xlabel('People in Category w/target')
    ax_3d.set_ylabel('People in Category')
    ax_3d.set_zlabel('Conditional Probability')

    display(clustering_data.sort_values(by='kmeans_cluster'))

    # Add cluster as dimension
    new_category = category + ' - Clustered'
    cluster_of_level = clustering_data['kmeans_cluster']
    dataframe[new_category] = dataframe[category].map(cluster_of_level)
    validation_dataframe[new_category] = validation_dataframe[category].map(cluster_of_level)

    print('Process done')
    return kmeans

In [None]:
def simple_kmeans(clustering_data, max_k =10, k_clusters = 'else', visualize = True):
    """Standardise ``clustering_data``, run k-means and return the scaled frame with labels.

    Parameters
    ----------
    clustering_data : pandas.DataFrame
        Numeric features, one row per observation. A ``'kmeans_cluster'``
        column is added to (or overwritten on) this frame in place.
    max_k : int, default 10
        Exclusive upper bound of the k values tried for the elbow curve
        (``range(1, max_k)``).
    k_clusters : int or 'else', default 'else'
        Number of clusters; the sentinel ``'else'`` selects the elbow found
        by ``KneeLocator``.
    visualize : bool, default True
        Draw the inertia-vs-k curve.

    Returns
    -------
    pandas.DataFrame
        The standardised features (default integer index) plus a
        ``'kmeans_cluster'`` column.

    Raises
    ------
    ValueError
        If the elbow is requested but none can be located.
    """
    import matplotlib.pyplot as plt
    from kneed import KneeLocator
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    # A previous call leaves its labels on ``clustering_data``; keep them out
    # of the feature matrix so a re-run does not cluster on its own output.
    features = clustering_data.drop(columns='kmeans_cluster', errors='ignore')

    scaled_features = StandardScaler().fit_transform(features)
    scaled_df = pd.DataFrame(scaled_features, columns=features.columns)

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 1500,
        "random_state": 42,
    }

    # SSE (inertia) for each candidate k
    candidate_ks = range(1, max_k)
    sse = []
    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    # Define the number of clusters automatically (elbow) or manually.
    if k_clusters == 'else':
        number_cluster = KneeLocator(candidate_ks,
                                     sse,
                                     curve="convex",
                                     direction="decreasing").elbow
        if number_cluster is None:
            raise ValueError("no elbow found; pass k_clusters explicitly")
    else:
        number_cluster = k_clusters

    if visualize:
        # plot SSE (inertia) vs number of clusters
        plt.style.use('fivethirtyeight')
        fig, ax = plt.subplots(1, 1, figsize=(10, 20))
        ax.plot(candidate_ks, sse)
        ax.set_title('Inertia vs Number of Clusters')
        ax.set_xticks(candidate_ks)
        ax.set_xlabel("Number of Clusters")
        ax.set_ylabel("SSE")
        # Mark the chosen k only when it lies on the curve (a manual k may not).
        if number_cluster in candidate_ks:
            ax.plot(number_cluster, sse[number_cluster - 1],
                    color='green', marker='X', markersize=25)

    # Run k means with the chosen number of clusters
    kmeans = KMeans(n_clusters=number_cluster, **kmeans_kwargs)
    kmeans.fit(scaled_features)

    # assign kmeans labels to each row, on both the raw and the scaled frame
    clustering_data['kmeans_cluster'] = kmeans.labels_
    scaled_df['kmeans_cluster'] = kmeans.labels_

    display(clustering_data.sort_values(by='kmeans_cluster'))

    return scaled_df

def boxplot_cluster_compparisson(dataframe):
    """Draw one box plot per variable, split by the cluster label.

    The last column of ``dataframe`` is used as the grouping label; every
    other column gets its own row in the figure.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Variables followed by the cluster label as the last column.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the stacked box plots.
    """
    variables = dataframe.columns[:-1]
    number_variables = len(variables)
    cluster_labels = dataframe.iloc[:, -1]

    # Fixed width and 5 inches per row: a square whose side grows by 20 inches
    # per variable exceeds matplotlib's maximum image size for wide frames.
    # squeeze=False keeps ``axes`` 2-D even when there is a single variable.
    fig, axes = plt.subplots(number_variables, 1,
                             figsize=(20, 5 * number_variables),
                             squeeze=False)

    for ax, variable in zip(axes[:, 0], variables):
        sns.boxplot(x=cluster_labels, y=dataframe[variable], ax=ax)
    return fig

def kmeans_analysis(clustering_data, max_k =10, k_clusters = 'else', visualize = True):
    """Cluster ``clustering_data`` and plot each variable by cluster.

    All keyword arguments are forwarded to ``simple_kmeans``; the scaled,
    labelled frame it returns is passed to ``boxplot_cluster_compparisson``.

    Returns
    -------
    tuple
        ``(figure, scaled_dataframe)``.
    """
    sc_df = simple_kmeans(clustering_data, max_k=max_k, k_clusters=k_clusters, visualize=visualize)
    graphs = boxplot_cluster_compparisson(sc_df)
    return graphs, sc_df

In [None]:
def plot_cumulative_significance_PCA(pca, plotTitle):
    """Plot the eigenvalues and the (cumulative) explained-variance ratio of a fitted PCA.

    Left panel: ``pca.explained_variance_`` per component, titled ``plotTitle``.
    Right panel: ``pca.explained_variance_ratio_`` and its running sum.
    """
    _, (eigen_ax, ratio_ax) = plt.subplots(1, 2, figsize=(15, 5))
    point_style = dict(marker=".", markersize=12)

    # curves
    eigen_ax.plot(pca.explained_variance_, **point_style)
    ratio_ax.plot(pca.explained_variance_ratio_, label="Proportion", **point_style)
    ratio_ax.plot(np.cumsum(pca.explained_variance_ratio_), linestyle="--",
                  label="Cumulative", **point_style)
    ratio_ax.legend()

    # Components are stored 0-based but labelled 1-based; show every second one.
    tick_positions = range(0, pca.n_components_, 2)
    tick_labels = range(1, pca.n_components_ + 1, 2)
    panels = ((eigen_ax, plotTitle, "Eigenvalue"),
              (ratio_ax, "Variance Explained", "Proportion"))
    for axis, title, y_label in panels:
        axis.set_title(title, fontsize=14)
        axis.set_ylabel(y_label)
        axis.set_xlabel("Components")
        axis.set_xticks(tick_positions)
        axis.set_xticklabels(tick_labels)

    plt.show()

In [None]:
def pca_analysis(subgroups_pca_dic, path_to_excel):
    """Fit one PCA per feature sub-group, display its loadings and export them to Excel.

    Parameters
    ----------
    subgroups_pca_dic : dict
        Maps a sub-group name to the number of principal components to keep.
    path_to_excel : str or path-like
        Workbook to write; each sub-group gets its own sheet.

    Notes
    -----
    NOTE(review): ``hood_subgroups_dic`` (sub-group name -> list of columns),
    ``pca_data`` (the frame holding those columns) and ``_color_red_or_green``
    (cell styling function) are read from the notebook namespace and are not
    defined in the visible part of this file - confirm they exist before calling.
    """
    from sklearn.decomposition import PCA

    with pd.ExcelWriter(path_to_excel) as writer:
        for subGroup, k in subgroups_pca_dic.items():
            columns = list(hood_subgroups_dic[subGroup])

            # PCA fit
            pca = PCA(n_components=k)
            pca_feat = pca.fit_transform(pca_data[columns])

            # Creating dataframe
            pca_feat_names = [f"PC{i}" for i in range(k)]
            pca_df = pd.DataFrame(pca_feat, index=pca_data[columns].index, columns=pca_feat_names)

            # Reassigning df to contain the original and the pca variables
            pca_df = pd.concat([pca_data[columns], pca_df], axis=1)

            # Loadings: correlation of each original variable with each component
            loadings = pca_df[columns + pca_feat_names].corr().loc[columns, pca_feat_names]
            styled_loadings = loadings.style.applymap(_color_red_or_green)

            print("\n\n\n//////////////////////////////////%s" % subGroup)
            display(styled_loadings)

            # One sheet per sub-group in the output workbook
            styled_loadings.to_excel(writer, sheet_name=subGroup)

In [None]:
# Widen the notebook display limits. Full option names are used: abbreviated
# keys only resolve through pandas' pattern matching and stop working if
# another option with a similar name is ever added.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# 0. Loading our Data

In [None]:
# Load the raw donors table; every later cell reads from ``df``.
df = pd.read_csv('Data/donors.csv')

In [None]:
# Display the raw frame.
df

In [None]:
# First rows of the raw frame.
df.head()

# 1. Analysing Data

 ## Donation history

- RAMNTALL  -->  Dollar amount of lifetime gifts to date
- NGIFTALL  -->  Number of lifetime gifts to date
- AVGGIFT  -->   Average dollar amount of gifts to date
- NUMPROM  -->   Lifetime number of promotions received to date
- NUMPRM12 -->   Number of promotions received in the last 12 months
- CARDPROM -->   Lifetime number of card promotions received to date
- CARDPM12 -->   Number of card promotions received in the last 12 months
- CARDGIFT -->   Number of lifetime gifts to card promotions to date
- MINRAMNT -->   Dollar amount of smallest gift to date
- MAXRAMNT -->   Dollar amount of largest gift to date


### Data to generate/keep
- AVGGIFT - Average donated amount
- NGIFTALL / NUMPROM - Success percentage

In [None]:
# Donation-history columns carried over into the working frame.
columns_source = [
    'RAMNTALL', 'NGIFTALL', 'AVGGIFT',
    'NUMPROM', 'NUMPRM12',
    'CARDPROM', 'CARDPM12', 'CARDGIFT',
    'MINRAMNT', 'MAXRAMNT', 'LASTGIFT',
    'TIMELAG',
]

# Independent copy, so the feature engineering below leaves ``df`` untouched.
filtered_df = df.loc[:, columns_source].copy()

In [None]:
# Reference date: the most frequent RDATE_3 value, parsed as a timestamp.
current_date = pd.to_datetime(
    df['RDATE_3'].mode(),
    infer_datetime_format=True,
).iloc[0]

current_date


In [None]:
# Days between NEXTDATE and FISTDATE.
filtered_df['first_time_delta'] = (
    pd.to_datetime(df['NEXTDATE'], infer_datetime_format=True)
    .sub(pd.to_datetime(df['FISTDATE'], infer_datetime_format=True))
    .dt.days
)

# Days between MINRDATE and MAXRDATE (MINRDATE minus MAXRDATE).
filtered_df['minmax_time_delta'] = (
    pd.to_datetime(df['MINRDATE'], infer_datetime_format=True)
    .sub(pd.to_datetime(df['MAXRDATE'], infer_datetime_format=True))
    .dt.days
)

# Spread between the MAXRAMNT and MINRAMNT columns.
filtered_df['maxmin_dollar_diff'] = df['MAXRAMNT'].sub(df['MINRAMNT'])

#filtered_df['customer_age'] = (pd.to_datetime([current_date for x in range(df.shape[0])]) -(pd.to_datetime(df['LASTDATE'], infer_datetime_format=True)))

filtered_df

In [None]:
# Missing values per donation-history column.
df[columns_source].isna().sum()

In [None]:
#df[df['TIMELAG'].isna()][rfa_columns]

In [None]:
# Make sure the datatype of each donation-history column is correct
df[columns_source].dtypes

In [None]:
# Fix dtypes: cast RAMNTALL to int64 on the raw frame.
# NOTE(review): ``filtered_df`` was copied from ``df`` in an earlier cell, so
# this cast does not reach it; any fractional amount would also be truncated
# by the cast - confirm both are intended.
df['RAMNTALL'] = df['RAMNTALL'].astype('int64')

In [None]:
# First rows of the donation-history columns.
df[columns_source].head()

In [None]:
# Success percentage: lifetime gifts per lifetime promotion (NGIFTALL / NUMPROM)
filtered_df['SUCCESS_PCT'] = df['NGIFTALL'].div(df['NUMPROM'])
filtered_df['SUCCESS_PCT'].head()

## Adding columns

- Percentage of time as each-category
- Variance on donation value

#### Percentage of time as each-category

In [None]:
def get_percentage_as_category(source_dataframe, target_df, category):
    """Add the share of promotions during which a donor's RFA code started with ``category``.

    For every row, the ``RFA_<n>`` columns of ``source_dataframe`` (``n`` has
    one or two digits) whose value starts with ``category`` are counted and
    the count is divided by that row's ``NUMPROM``. The result is stored in
    ``target_df['PCT_TIME_LAPSED_<category>']``.

    Parameters
    ----------
    source_dataframe : pandas.DataFrame
        Frame holding the ``RFA_<n>`` columns and ``NUMPROM``.
    target_df : pandas.DataFrame
        Frame that receives the new column (modified in place, aligned on index).
    category : str
        Single character compared with the first character of each RFA code.

    Returns
    -------
    pandas.DataFrame
        ``target_df`` itself.
    """
    # Raw string so that ``\d`` reaches the regex engine unchanged.
    rfa_pattern = re.compile(r'^RFA_\d{1,2}$')
    rfa_columns = [column for column in source_dataframe.columns
                   if rfa_pattern.match(str(column))]

    def _starts_with_category(val):
        # Missing values and empty strings count as "not this category".
        return isinstance(val, str) and val[:1] == category

    matches = source_dataframe[rfa_columns].apply(lambda column: column.map(_starts_with_category))

    # Divide by the NUMPROM of the frame that was passed in, not a notebook global.
    target_df['PCT_TIME_LAPSED_%s' % category] = matches.sum(axis=1) / source_dataframe['NUMPROM']

    return target_df

In [None]:
# Add PCT_TIME_LAPSED_F: number of RFA_* codes starting with 'F', divided by NUMPROM.
filtered_df = get_percentage_as_category(df, filtered_df, 'F')
filtered_df.head()

In [None]:
# Add PCT_TIME_LAPSED_N: number of RFA_* codes starting with 'N', divided by NUMPROM.
filtered_df = get_percentage_as_category(df, filtered_df, 'N')
filtered_df.head()

In [None]:
# Add PCT_TIME_LAPSED_A: number of RFA_* codes starting with 'A', divided by NUMPROM.
filtered_df= get_percentage_as_category(df, filtered_df, 'A')
filtered_df.head()

In [None]:
# Add PCT_TIME_LAPSED_L: number of RFA_* codes starting with 'L', divided by NUMPROM.
filtered_df= get_percentage_as_category(df, filtered_df, 'L')
filtered_df.head()

In [None]:
# Add PCT_TIME_LAPSED_I: number of RFA_* codes starting with 'I', divided by NUMPROM.
filtered_df= get_percentage_as_category(df, filtered_df, 'I')
filtered_df.head()

In [None]:
# Add PCT_TIME_LAPSED_S: number of RFA_* codes starting with 'S', divided by NUMPROM.
filtered_df= get_percentage_as_category(df, filtered_df, 'S')
filtered_df.head()

In [None]:
"""

def rfa_normalizer(source_dataframe, target_df, byte, category):
    re_expression = re.compile('^RFA_\d{1,2}$')

    rfa_columns = [column for column in source_dataframe.columns.values if re_expression.match(column)]

    rfas = source_dataframe[rfa_columns].copy()
    print(rfas.head())

    rfas = rfas.applymap(lambda val: 1 if val[byte] == category else 0)
    print(rfas.head())
    
    target_df['PCT_TIME_LAPSED_%s' % category] =  rfas.sum(axis=1) / df['NUMPROM']

    return target_df
"""

In [None]:
"""
category1 = { 'S': 5, 
             'A' : 4,
             'N' : 3, 
             'F' : 2, 
             'L' : 1, 
             'I' : 0
                    }

category2 = {'4': 4, 
             '3': 3,
             '2' : 2,
             '1': 1
                    }

category3 = {}

"""

In [None]:
# RFA_<n> status columns (n has one or two digits). Raw string: ``\d`` is a
# regex escape, not a Python string escape.
re_expression = re.compile(r'^RFA_\d{1,2}$')
rfa_columns = [column for column in df.columns.values if re_expression.match(column)]
# RFA history of the donors whose RFA_22 code starts with 'P'.
df.loc[df['RFA_22'].str.startswith('P'), rfa_columns].head()

In [None]:
# RAMNT_<n> columns (n has one or two digits). Raw string: ``\d`` is a regex
# escape, not a Python string escape.
re_expression = re.compile(r'^RAMNT_\d{1,2}$')
ramnt_columns = [column for column in df.columns.values if re_expression.match(column)]
# RAMNT values of the donors whose RFA_22 code starts with 'P'.
df.loc[df['RFA_22'].str.startswith('P'), ramnt_columns].head()

#### Variance on donation value

In [None]:
# RAMNT_<n> columns used for the per-donor gift variance. Raw string: ``\d``
# is a regex escape, not a Python string escape.
re_expression = re.compile(r'^RAMNT_\d{1,2}$')

ramt_columns = [column for column in df.columns.values if re_expression.match(column)]
ramt_columns[:3]

In [None]:
# Independent copy of the RAMNT_<n> columns.
ramts = df[ramt_columns].copy()
ramts.head()

In [None]:
# Row-wise variance of the RAMNT_<n> amounts; rows where it is undefined get 0.
# The fill is applied before assignment: ``inplace=True`` on a selected column
# is not guaranteed to write back into the frame.
filtered_df['GIFT_VAR'] = ramts.var(axis=1).fillna(0)
filtered_df.head()

## Correlation check on filtered columns so far

In [None]:
# Pairwise correlations; everything with |r| <= 0.45 is blanked out.
corr = filtered_df.corr()
corr = corr.where(corr.abs() > 0.45)

plt.figure(figsize=(20, 20))
sns.heatmap(corr, annot=True)
plt.show()

## Analysing Interests columns

In [None]:
# Interest-flag columns analysed in this section.
columns_interests = [
    'COLLECT1', 'VETERANS', 'BIBLE', 'CATLG', 'HOMEE', 'PETS',
    'CDPLAY', 'STEREO', 'PCOWNERS', 'PHOTO', 'CRAFTS', 'FISHER',
    'GARDENIN', 'BOATS', 'WALKER', 'KIDSTUFF', 'CARDS', 'PLATES',
]

In [None]:
# Independent copy of the interest columns.
interests_df = df[columns_interests].copy()
interests_df.head()

In [None]:
# Encode the flags as integers: 'Y' -> 1, anything else (including missing) -> 0.
interests_df = interests_df.eq('Y').astype('int64')
interests_df.head()

In [None]:
# Pairwise correlations of the interest flags; |r| <= 0.45 is blanked out.
corr = interests_df.corr()
corr = corr.where(corr.abs() > 0.45)

plt.figure(figsize=(25, 25))
sns.heatmap(corr, annot=True)
plt.show()

    Low correlations; ignoring these for now since there are too many variables.

## Adding County Size

In [None]:
# Ordinal encoding of GEOCODE2 (A..D -> 1..4); blanks become missing values.
# ``np.nan`` is used because the ``np.NaN`` alias no longer exists in NumPy 2.
filtered_df['county_size'] = df['GEOCODE2'].replace({'A':1, 'B':2, 'C':3, 'D':4, ' ': np.nan})

In [None]:
# Independent copy of the two geocode columns, for inspection.
geocode_df = df[['GEOCODE','GEOCODE2']].copy()
geocode_df

## Analysing wealth columns

In [None]:
# Current state of the working frame.
filtered_df

In [None]:
# Number of rows in the raw frame.
len(df)

In [None]:
# (non-missing, missing) counts for WEALTH1.
df['WEALTH1'].count(), df['WEALTH1'].isna().sum()

In [None]:
# (non-missing, missing) counts for WEALTH2.
df['WEALTH2'].count(), df['WEALTH2'].isna().sum()

In [None]:
# (max, min) of WEALTH1.
df.WEALTH1.max(), df.WEALTH1.min()

In [None]:
# (max, min) of WEALTH2.
df.WEALTH2.max(), df.WEALTH2.min()

In [None]:
# Independent copy of the two wealth columns.
wealth_df = df[['WEALTH1','WEALTH2']].copy()

In [None]:
# Share of rows that have at least one of WEALTH1 / WEALTH2: the row-wise mean
# is missing only when both columns are missing.
pct_non_NaN = wealth_df.mean(axis=1).count() / wealth_df.shape[0]
print('Percentage of rows with value after merge: %1.2f%%' % (pct_non_NaN *100))

In [None]:
# Merge the two wealth ratings by taking the larger available value per row.
wealth_df['Merged'] = wealth_df[['WEALTH1', 'WEALTH2']].max(axis=1)
wealth_df.head()

In [None]:
# Attach the merged wealth rating and drop the rows that have none.
filtered_df['WEALTH'] = wealth_df.Merged
filtered_df.dropna(subset=['WEALTH'], inplace=True)
# The helper frame is deleted, so this cell cannot be re-run on its own.
del wealth_df

filtered_df.head()

## Analysing other columns

In [None]:
# Independent copy of the remaining demographic columns.
other_df = df[['GENDER','INCOME','HOMEOWNR']].copy()
other_df.head()

In [None]:
# Binary home-owner flag: 'H' -> 1; blank, empty and 'U' -> 0.
# Assigned back explicitly: ``inplace=True`` on a column accessed from the
# frame is not guaranteed to modify the frame itself.
other_df['HOMEOWNR'] = other_df['HOMEOWNR'].replace({'': 0, ' ': 0, 'U': 0, 'H': 1})

In [None]:
# Missing values per demographic column.
other_df.isna().sum()

In [None]:
# Collapse 'A' and 'C' into 'J' and blanks into 'U', then one-hot encode.
# Assigned back explicitly: ``inplace=True`` on a column accessed from the
# frame is not guaranteed to modify the frame itself.
other_df['GENDER'] = other_df['GENDER'].replace({'A': 'J', 'C': 'J', ' ': 'U'})
gender_dummies = pd.get_dummies(other_df['GENDER'], prefix='GENDER')

In [None]:
# Swap the raw GENDER column for its one-hot columns.
other_df = other_df.drop(columns='GENDER').join(gender_dummies)

In [None]:
# join='inner' keeps only the rows still present in ``filtered_df``; the
# default outer join would bring back, as mostly-NaN rows, the records that
# the earlier WEALTH dropna removed (``other_df`` still holds every row of ``df``).
filtered_df = pd.concat([filtered_df, other_df], axis=1, join='inner')
filtered_df = filtered_df.dropna(subset=['INCOME'])
filtered_df.head()

## Analysing Children columns

In [None]:
# Children-related columns; the last entry (NUMCHLD) is handled separately
# from the four CHILDxx columns below.
children_columns = [
    'CHILD03',
    'CHILD07',
    'CHILD12',
    'CHILD18',
    'NUMCHLD'
]

In [None]:
# Independent copy of the children columns.
children_df = df[children_columns].copy()
children_df.head()

In [None]:
# Blank / empty / missing -> 0, the codes 'M', 'F' and 'B' -> 1, then cast to integers.
children_df = (
    children_df
    .replace([' ', ''], 0)
    .replace(['M', 'F', 'B'], 1)
    .fillna(0)
    .astype('int64')
)
children_df.head()

In [None]:
# Replace the four CHILDxx flags with their row-wise sum.
children_df = (
    children_df
    .assign(SUM_ageGap_columns=children_df[children_columns[:-1]].sum(axis=1))
    .drop(columns=children_columns[:-1])
)

children_df.head()

In [None]:
# Children count: the larger of NUMCHLD and the summed CHILDxx flags.
filtered_df['CHILDREN'] = children_df.max(axis=1)
filtered_df.head()

In [None]:
# (rows, columns) of the working frame.
filtered_df.shape

## Adding Neighborhood Socio Economic Status

In [None]:
# Second character of the most frequent DOMAIN code, as an integer.
int(df['DOMAIN'].mode()[0][1])

In [None]:
# Most frequent DOMAIN code, computed once instead of once per blank row.
domain_mode = df['DOMAIN'].mode()[0]

# Second character of DOMAIN as an integer; blank codes take the mode's value.
socio_economic_status = df['DOMAIN'].apply(lambda x: int(x[1]) if x != ' ' else int(domain_mode[1]))
#socio_economic_status = pd.get_dummies(socio_economic_status, prefix='SES_')

socio_economic_status

In [None]:
# Most frequent DOMAIN code, computed once instead of once per blank row.
domain_mode = df['DOMAIN'].mode()[0]

# First character of DOMAIN (blank codes take the mode's), mapped to an ordinal scale.
rurality = df['DOMAIN'].apply(lambda x: x[0] if x != ' ' else domain_mode[0])
rurality = rurality.replace({'U': 4,
                             'C': 3,
                             'S': 2,
                             'T': 1,
                             'R': 0})
rurality

In [None]:
# Attach the socio-economic status digit (aligned on the row index).
filtered_df['ses'] = socio_economic_status
filtered_df.head()

In [None]:
# Attach the ordinal rurality code (aligned on the row index).
filtered_df['rurality'] = rurality
filtered_df.head()

In [None]:
# Drop every row that still has a missing value in any column.
filtered_df.dropna(inplace=True)

In [None]:
# (rows, columns) after dropping incomplete rows.
filtered_df.shape

## Adding some more useful columns 

In [None]:
# Frequency of each MDMAUD_R value.
df['MDMAUD_R'].value_counts()

In [None]:
# Frequency of each MDMAUD_F value.
df['MDMAUD_F'].value_counts()

In [None]:
# Frequency of each MDMAUD_A value.
df['MDMAUD_A'].value_counts()

In [None]:
# Frequency of each OSOURCE value.
df['OSOURCE'].value_counts()

In [None]:
# Frequency of each NOEXCH value.
df['NOEXCH'].value_counts()

In [None]:
# Binary flag: 1 where RECINHSE is 'X', 0 otherwise.
filtered_df['RECINHSE'] = df['RECINHSE'].eq('X').astype('int64')
filtered_df

In [None]:
# Binary flag: 1 where RECP3 is 'X', 0 otherwise.
filtered_df['RECP3'] = df['RECP3'].eq('X').astype('int64')

In [None]:
# Binary flag: 1 where RECPGVG is 'X', 0 otherwise.
filtered_df['RECPGVG'] = df['RECPGVG'].eq('X').astype('int64')

In [None]:
# Binary flag: 1 where RECSWEEP is 'X', 0 otherwise.
filtered_df['RECSWEEP'] = df['RECSWEEP'].eq('X').astype('int64')

In [None]:
# Copy HIT over unchanged (aligned on the row index).
filtered_df['HIT'] = df['HIT']

In [None]:
# Binary flag: 1 where MAJOR is 'X', 0 otherwise.
filtered_df['MAJOR'] = df['MAJOR'].eq('X').astype('int64')

In [None]:
# Missing-value mask of the two geocode columns (not the last expression, so it is not displayed).
df[['GEOCODE', 'GEOCODE2']].isna()

# Rows where at least one of the two geocodes is present.
df[df['GEOCODE'].notna() | df['GEOCODE2'].notna()]

# Later on, we can reduce number of NaN by correcting this geocode.



In [None]:
# Working frame before scaling.
filtered_df

## Normalizing data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# https://stats.stackexchange.com/questions/164917/should-data-be-centeredscaled-before-applying-t-sne 
scaler = StandardScaler()

# Keep the row labels of ``filtered_df``: rows were dropped earlier, so a fresh
# 0..n-1 index would no longer line up with the unscaled frame.
scaled_filtered_df = pd.DataFrame(scaler.fit_transform(filtered_df),
                                  index=filtered_df.index,
                                  columns=filtered_df.columns)
scaled_filtered_df

In [None]:
# Drop rows with any missing value.
# NOTE(review): ``scaled_filtered_df`` was built before this call; if rows are
# removed here the two frames stop having the same length - confirm this is a no-op.
filtered_df.dropna(inplace=True)

In [None]:
# (rows, columns) of the final working frame.
filtered_df.shape

## Correlation check

In [None]:
# Pairwise correlations; the filter is on |r| like the earlier heatmaps, so
# strong negative correlations are shown too.
corr = filtered_df.corr()
corr = corr.where(corr.abs() > 0.4)

plt.figure(figsize=(15,15))
sns.heatmap(corr, annot=True, linewidths=0.2)
plt.show()

    overall looking ok

# Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch

In [None]:
# Accumulator for the inertia of each k, and the k values to sweep (1..14).
inertia = []
k_range = range(1,15)

In [None]:
# Empty the accumulator first so re-running this cell does not append a
# second sweep (the plot needs exactly one inertia per k).
inertia.clear()

# NOTE(review): this fits the unscaled ``filtered_df`` although
# ``scaled_filtered_df`` exists - confirm which one is intended.
for k in k_range:
    k_means = KMeans(n_clusters=k, random_state=42)  # fixed seed: reproducible curve
    k_means.fit(filtered_df)
    inertia.append(k_means.inertia_)

In [None]:
# Inertia for each k of the sweep above.
plt.figure(figsize=(16, 8))
plt.plot(k_range, inertia, 'bx-')
plt.gca().set(xlabel='k',
              ylabel='Distortion',
              title='The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Standardised working frame.
scaled_filtered_df

In [None]:
from sklearn.manifold import TSNE

X = scaled_filtered_df

# Fixed seed so the 2-D embedding (and every plot built on it) is reproducible.
X_embedded = TSNE(n_components=2, random_state=42).fit_transform(X)



In [None]:
# 2-D t-SNE coordinates.
X_embedded

In [None]:
x = X_embedded[:, 0]
y = X_embedded[:, 1]
plt.figure(figsize=(20, 20))
# The coordinates are passed by keyword: current seaborn releases accept only
# ``data`` positionally.
sns.scatterplot(x=x, y=y, hue=filtered_df['GENDER_F'])

In [None]:
# Number of columns in the working frame (not used by any later visible cell).
z = len(filtered_df.columns)

In [None]:
import matplotlib.pyplot as plt 

In [None]:
# One t-SNE scatter per column, coloured by that column and saved to Images/.
for column in filtered_df.columns:
    plt.figure(figsize=(20, 20))
    # Keyword arguments: current seaborn releases accept only ``data`` positionally.
    sns.scatterplot(x=x, y=y, hue=filtered_df[column])
    plt.savefig('Images/%s_TSNE.png' % column)

In [None]:
from sklearn.manifold import TSNE

X = scaled_filtered_df

# Fixed seed so the 3-D embedding is reproducible.
X_embedded3 = TSNE(n_components=3, random_state=42).fit_transform(X)


In [None]:
df1 = pd.DataFrame(X_embedded3, columns = ['0','1','2'])
# ``df1`` has a fresh 0..n-1 index while ``filtered_df`` keeps the labels of the
# rows that survived the earlier dropna calls; assign by position so every
# point gets the flag of its own row.
df1['MAJOR'] = filtered_df['MAJOR'].to_numpy()

In [None]:
# Interactive 3-D scatter of the t-SNE embedding, coloured by the MAJOR flag.
import plotly.express as px
import plotly.io as pio
# Render plotly figures with the 'notebook' renderer.
pio.renderers.default='notebook'
#df = px.data.iris()
fig = px.scatter_3d(df1, x='0', y='1', z='2',
              color='MAJOR')
fig.show()

In [None]:
from sklearn.feature_selection import VarianceThreshold

new_column = 'cluster'

# Remove zero-variance columns first, then fit and label on that same matrix:
# fitting on the full frame and predicting on the reduced one either changes
# nothing or fails on a feature-count mismatch. A label column left over from
# a previous run of this cell is excluded from the features.
selector = VarianceThreshold()
cluster_features = selector.fit_transform(filtered_df.drop(columns=[new_column], errors='ignore'))

k_means = KMeans(n_clusters=3, random_state=42)  # fixed seed: reproducible labels
filtered_df[new_column] = k_means.fit_predict(cluster_features)


selector

In [None]:
# Boolean mask of the columns kept by the variance filter.
selector.get_support()

In [None]:
# First rows, now including the cluster label.
filtered_df.head()

In [None]:
# Mean of every variable per cluster, one column per cluster.
filtered_df.groupby(new_column).mean().transpose()

In [None]:
# Feature subset for the second clustering: the six PCT_TIME_LAPSED_* shares
# plus AVGGIFT and SUCCESS_PCT.
pctimes = ['PCT_TIME_LAPSED_F', 'PCT_TIME_LAPSED_N', 'PCT_TIME_LAPSED_L', 'PCT_TIME_LAPSED_S', 'PCT_TIME_LAPSED_A', 'PCT_TIME_LAPSED_I']
target =  ['AVGGIFT', 'SUCCESS_PCT']
total = pctimes+target

In [None]:
# k-means on the behavioural subset ``total`` only; fixed seed so the labels
# are reproducible between runs.
model = KMeans(n_clusters=3, random_state=42)

new_column = 'cluster_affinity'

#selector = VarianceThreshold()

filtered_df[new_column] = model.fit_predict(filtered_df[total])


#filtered_df[new_column] = model.fit_predict(selector.fit_transform(filtered_df.iloc[:,:-1]))


# Still the selector fitted in the earlier cell; it is not refitted here.
selector.get_support()

In [None]:
# Number of rows per cluster.
filtered_df['cluster_affinity'].value_counts()

In [None]:
# Working frame without its last two columns.
filtered_df.iloc[:,:-2]

In [None]:
# Missing values per column of the working frame.
filtered_df.isna().sum()

In [None]:
# 2-D t-SNE coordinates.
X_embedded

In [None]:
# Number of distinct labels produced by ``model``.
len(pd.Series(model.labels_).unique())

In [None]:
# Box plots of every column split by the last column of ``filtered_df``.
figure = boxplot_cluster_compparisson(filtered_df)

In [None]:
# Write the box-plot figure to disk.
figure.savefig('Images/pretty_boxplot.png')