# Modeling / Cluster Creation

In [None]:
# regular imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import math

# default pandas decimal number display format:
# floats render with thousands separators and two decimals in a 20-character field
pd.options.display.float_format = '{:20,.2f}'.format

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Wrangling
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.cluster import KMeans
from scipy import stats
import sklearn.preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from scipy import stats
from scipy.stats import pearsonr, spearmanr, kruskal
from scipy.stats.mstats import winsorize
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import csv
import acquire
import prepare
import explore

# Build the working dataframe by chaining the project's acquire/prepare helpers.
# NOTE(review): what each helper does is defined in acquire.py / prepare.py,
# not in this notebook -- consult those modules for details.
df = acquire.get_bach_df()
df = prepare.clean_college_df(df)

cleaned_df = prepare.clean_step1(df)
new_df = prepare.avg_net_price(cleaned_df)
# Derive `major_category` by applying prepare.categorize_major to every major_name value.
new_df['major_category'] = new_df.major_name.apply(prepare.categorize_major)

# Ensure you have `2017_2018_2019_earning_by_major.csv` within working folder

new_df = prepare.earnings_merge(new_df)
df = prepare.create_roi_cols(new_df)

# NOTE(review): this is not the last expression of the cell, so a notebook
# will not display its result.
df.head()

# income brackets
#
# Each list names the source columns for one family-income bracket, one column
# per prefix (other / private / program / pub).

income_0_30000 = [
    'other_fam_income_0_30000',
    'private_fam_income_0_30000',
    'program_fam_income_0_30000',
    'pub_fam_income_0_30000',
]

income_30001_48000 = [
    'other_fam_income_30001_48000',
    'private_fam_income_30001_48000',
    'program_fam_income_30001_48000',
    'pub_fam_income_30001_48000',
]

income_48001_75000 = [
    'other_fam_income_48001_75000',
    'private_fam_income_48001_75000',
    'program_fam_income_48001_75000',
    'pub_fam_income_48001_75000',
]

income_75001_110000 = [
    'other_fam_income_75001_110000',
    'private_fam_income_75001_110000',
    'program_fam_income_75001_110000',
    'pub_fam_income_75001_110000',
]

income_over_110000 = [
    'other_fam_income_over_110000',
    'private_fam_income_over_110000',
    'program_fam_income_over_110000',
    'pub_fam_income_over_110000',
]

# Hand each group of source columns, plus the name of the target column, to
# prepare.get_fam_income_col, in ascending bracket order.
# NOTE(review): presumably the helper folds the four source columns into the
# single named column -- confirm against prepare.py.
for _source_cols, _target_col in (
    (income_0_30000, "fam_income_0_30000"),
    (income_30001_48000, "fam_income_30001_48000"),
    (income_48001_75000, "fam_income_48001_75000"),
    (income_75001_110000, "fam_income_75001_110000"),
    (income_over_110000, "fam_income_over_110000"),
):
    df = prepare.get_fam_income_col(df, _source_cols, _target_col)

# The cell previously ended with `df = prepare`, which rebound `df` to the
# prepare *module* and discarded the dataframe built above.  That assignment
# is removed so `df` stays a dataframe.
# NOTE(review): if a further prepare.<function>(df) step was intended here,
# add it explicitly.

In [None]:
# Load the train / validate / test splits from the `*_imputed.csv` files.
# Paths are relative, so the files must sit in the working directory.
train, validate, test = map(
    pd.read_csv,
    ('train_imputed.csv', 'validate_imputed.csv', 'test_imputed.csv'),
)

In [None]:
# Lift pandas' display truncation so every column and every row is printed.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

In [None]:
# Preview the first rows of the training split.
train.head()

## Scaling

Features to scale (must match `scale_columns` in `scale_data`):

avg_sat_admitted
ACT_score_mid
title_IV_student_number
avg_faculty_salary
med_debt_pell_students
median_debt_non_pell
median_debt_completed
fam_income_0_30000
fam_income_30001_48000
fam_income_48001_75000
fam_income_75001_110000
fam_income_over_110000
avg_net_price
2017
2018
2019

In [None]:
def scale_data(train, validate, test):
    """Min-max scale a fixed set of columns in the three data splits.

    The scaler is fit on ``train`` only and then applied to ``train``,
    ``validate`` and ``test``, so the validate/test rows never influence the
    fitted minimum and maximum.  The inputs are not modified; scaled copies
    are returned.

    Returns:
        A ``(train_scaled, validate_scaled, test_scaled)`` tuple of dataframes.
    """
    scale_columns = [
        'avg_sat_admitted',
        'ACT_score_mid',
        'title_IV_student_number',
        'avg_faculty_salary',
        'med_debt_pell_students',
        'median_debt_non_pell',
        'median_debt_completed',
        'fam_income_0_30000',
        'fam_income_30001_48000',
        'fam_income_48001_75000',
        'fam_income_75001_110000',
        'fam_income_over_110000',
        'avg_net_price',
        '2017',
        '2018',
        '2019',
    ]

    # Learn each column's min and max from the training split alone.
    scaler = MinMaxScaler().fit(train[scale_columns])

    scaled_splits = []
    for split in (train, validate, test):
        scaled = split.copy()
        scaled[scale_columns] = scaler.transform(split[scale_columns])
        scaled_splits.append(scaled)

    return tuple(scaled_splits)

In [None]:
# Run `scale_data` on the three splits and keep the scaled copies it returns.
train_scaled, validate_scaled, test_scaled = scale_data(
    train=train,
    validate=validate,
    test=test,
)

In [None]:
# Preview the scaled training split.
train_scaled.head()

In [None]:
def _plot_k_metric(k_values, metric_values, ylabel, title):
    """Draw one 'bx-' line plot of a per-k metric and show it immediately."""
    plt.plot(k_values, metric_values, 'bx-')
    plt.xlabel('k')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()


def find_k(x_train_scaled, cluster_vars, k_range, random_state=123):
    """Compare KMeans inertia across candidate values of k (elbow method).

    Fits one KMeans model per k on ``x_train_scaled[cluster_vars]``, records
    its inertia (sum of squared distances of samples to their closest cluster
    center), and shows three plots: SSE, percent change in SSE, and absolute
    change in SSE.

    Args:
        x_train_scaled: dataframe holding the columns named in ``cluster_vars``.
        cluster_vars: list of column names to cluster on.
        k_range: iterable of candidate cluster counts, in the order to try them.
        random_state: seed passed to every KMeans fit so repeated runs give the
            same curve.  Defaults to 123, the seed this notebook uses when it
            fits its final cluster models.

    Returns:
        A dataframe with one row per k except the last, with columns
        ``k``, ``sse``, ``delta`` (SSE drop from this k to the next, rounded
        to 0 decimals) and ``pct_delta`` (the same drop as a percentage of
        this k's SSE, rounded to 1 decimal).  The last k has no successor to
        compare against, so it is omitted.
    """
    # Materialise k_range so any iterable works, not only sliceable ones.
    candidate_ks = list(k_range)
    features = x_train_scaled[cluster_vars]

    sse = [
        KMeans(n_clusters=k, random_state=random_state).fit(features).inertia_
        for k in candidate_ks
    ]

    # Pair each SSE with the drop to the next k's SSE.
    drops = [(current, current - following)
             for current, following in zip(sse, sse[1:])]
    delta = [round(drop, 0) for _, drop in drops]
    pct_delta = [round((drop / current) * 100, 1) for current, drop in drops]

    # One row per k that has a successor: SSE, delta, pct_delta.
    k_comparisons_df = pd.DataFrame(dict(k=candidate_ks[:-1],
                                         sse=sse[:-1],
                                         delta=delta,
                                         pct_delta=pct_delta))

    _plot_k_metric(
        k_comparisons_df.k, k_comparisons_df.sse, 'SSE',
        'The Elbow Method to find the optimal k\nFor which k values do we see large decreases in SSE?')
    _plot_k_metric(
        k_comparisons_df.k, k_comparisons_df.pct_delta, 'Percent Change',
        'For which k values are we seeing increased changes (%) in SSE?')
    _plot_k_metric(
        k_comparisons_df.k, k_comparisons_df.delta, 'Absolute Change in SSE',
        'For which k values are we seeing increased changes (absolute) in SSE?')

    return k_comparisons_df

## Creating Clusters

### Cluster 1: `major_category`, `admission_rate`, `avg_sat_admitted`, `median_debt_completed`

In [None]:
# Cluster 1 setup: display name, the columns to cluster on, and the
# candidate cluster counts (2 through 19) to evaluate.
cluster_name = 'Admission with Debt'
cluster_vars = [
    'admission_rate',
    'avg_sat_admitted',
    'median_debt_completed',
]
k_range = range(2, 20)

In [None]:
# Elbow analysis for cluster 1: shows the plots and the per-k comparison table.
find_k(train_scaled, cluster_vars=cluster_vars, k_range=k_range)

In [None]:
# Seeded 5-cluster KMeans: fit on the training features, then label every
# training row with its assigned cluster id.
kmeans = KMeans(random_state=123, n_clusters=5)
cluster1 = kmeans.fit(train_scaled[cluster_vars]).predict(train_scaled[cluster_vars])

cluster1

In [None]:
# Store each training row's cluster label as a new column.
train_scaled['admission_to_debt_cluster'] = cluster1

In [None]:
# Display one indicator column per cluster label.  The result is not assigned,
# so train_scaled itself is unchanged.
# NOTE(review): the 'area' prefix does not match this cluster's name -- confirm it is intended.
pd.get_dummies(train_scaled['admission_to_debt_cluster'], prefix= 'area', drop_first=False)

In [None]:
# visualize the cluster: bar plot of roi_5yr for each cluster label
sns.barplot(data=train_scaled, x='admission_to_debt_cluster', y='roi_5yr')
# Show the figure only after it has been drawn; calling plt.show() first
# displays nothing when run outside an inline notebook backend.
plt.show()

In [None]:
# Preview train_scaled, which now includes the cluster 1 label column.
train_scaled.head()

### Cluster 2: `region_ipeds`, `avg_faculty_salary`, `avg_sat_admitted`, `admission_rate`

In [None]:
# Cluster 2 setup: display name, the columns to cluster on, and the
# candidate cluster counts (2 through 19) to evaluate.
# NOTE(review): 'region_ipeds' is not in scale_data's scale_columns list, so it
# enters KMeans on its original scale -- confirm that is intended.
cluster_name_2 = 'Admission by Region'
cluster_vars_2 = [
    'region_ipeds',
    'avg_sat_admitted',
    'avg_faculty_salary',
    'admission_rate',
]
k_range = range(2, 20)

In [None]:
# Elbow analysis for cluster 2: shows the plots and the per-k comparison table.
find_k(train_scaled, cluster_vars=cluster_vars_2, k_range=k_range)

In [None]:
# Seeded 4-cluster KMeans: fit on the training features, then label every
# training row with its assigned cluster id.
kmeans = KMeans(random_state=123, n_clusters=4)
cluster2 = kmeans.fit(train_scaled[cluster_vars_2]).predict(train_scaled[cluster_vars_2])

cluster2

In [None]:
# Store each training row's cluster label as a new column.
train_scaled['admission_by_region_cluster'] = cluster2

In [None]:
# Display one indicator column per cluster label.  The result is not assigned,
# so train_scaled itself is unchanged.
pd.get_dummies(train_scaled['admission_by_region_cluster'], prefix= 'region', drop_first=False)

In [None]:
# visualize the cluster: bar plot of roi_5yr for each cluster label
sns.barplot(data=train_scaled, x='admission_by_region_cluster', y='roi_5yr')
# Show the figure only after it has been drawn; calling plt.show() first
# displays nothing when run outside an inline notebook backend.
plt.show()

In [None]:
# Preview train_scaled, which now includes the cluster 2 label column.
train_scaled.head()

### Cluster 3: `fam_income_0_30000`, `fam_income_30001_48000`, `fam_income_48001_75000`, `fam_income_75001_110000`, `fam_income_over_110000`, `admission_rate`

In [None]:
# Cluster 3 setup: display name, the columns to cluster on, and the
# candidate cluster counts (2 through 19) to evaluate.
cluster_name_3 = 'Admission by Family Income'
cluster_vars_3 = [
    'fam_income_0_30000',
    'fam_income_30001_48000',
    'fam_income_48001_75000',
    'fam_income_75001_110000',
    'fam_income_over_110000',
    'admission_rate',
]
k_range = range(2, 20)

In [None]:
# Elbow analysis for cluster 3: shows the plots and the per-k comparison table.
find_k(train_scaled, cluster_vars=cluster_vars_3, k_range=k_range)

In [None]:
# Seeded 4-cluster KMeans: fit on the training features, then label every
# training row with its assigned cluster id.
kmeans = KMeans(random_state=123, n_clusters=4)
cluster3 = kmeans.fit(train_scaled[cluster_vars_3]).predict(train_scaled[cluster_vars_3])

cluster3

In [None]:
# Store each training row's cluster label as a new column.
train_scaled['admission_by_fam_income'] = cluster3

In [None]:
# Display one indicator column per cluster label.  The result is not assigned,
# so train_scaled itself is unchanged.
pd.get_dummies(train_scaled['admission_by_fam_income'], prefix= 'income', drop_first=False)

In [None]:
# visualize the cluster: bar plot of roi_5yr for each cluster label
sns.barplot(data=train_scaled, x='admission_by_fam_income', y='roi_5yr')
# Show the figure only after it has been drawn; calling plt.show() first
# displays nothing when run outside an inline notebook backend.
plt.show()

In [None]:
# Preview train_scaled, which now includes the cluster 3 label column.
train_scaled.head()

### Cluster 4: 
`admission_rate`, `avg_sat_admitted`, `avg_faculty_salary`, `med_debt_pell_students`, `median_debt_non_pell`

In [None]:
# Cluster 4 setup: display name, the columns to cluster on, and the
# candidate cluster counts (2 through 19) to evaluate.
cluster_name_4 = 'Admission, SAT score, fac_salary, by Debt Pell and non-Pell'
cluster_vars_4 = [
    'admission_rate',
    'avg_sat_admitted',
    'avg_faculty_salary',
    'med_debt_pell_students',
    'median_debt_non_pell',
]
k_range = range(2, 20)

In [None]:
# Elbow analysis for cluster 4: shows the plots and the per-k comparison table.
find_k(train_scaled, cluster_vars=cluster_vars_4, k_range=k_range)

In [None]:
# Seeded 4-cluster KMeans: fit on the training features, then label every
# training row with its assigned cluster id.
kmeans = KMeans(random_state=123, n_clusters=4)
cluster4 = kmeans.fit(train_scaled[cluster_vars_4]).predict(train_scaled[cluster_vars_4])

cluster4

In [None]:
# Store each training row's cluster label as a new column.
# NOTE(review): the column name mentions "region", but cluster_vars_4 contains
# no region feature -- confirm the name is intended.
train_scaled['admission_by_region_debt'] = cluster4

In [None]:
# Display one indicator column per cluster label.  The result is not assigned,
# so train_scaled itself is unchanged.
pd.get_dummies(train_scaled['admission_by_region_debt'], prefix= 'region_debt', drop_first=False)

In [None]:
# visualize the cluster: bar plot of roi_5yr for each cluster label
sns.barplot(data=train_scaled, x='admission_by_region_debt', y='roi_5yr')
# Show the figure only after it has been drawn; calling plt.show() first
# displays nothing when run outside an inline notebook backend.
plt.show()

In [None]:
# Preview train_scaled, which now includes the cluster 4 label column.
train_scaled.head()

### Cluster 5: `major_category`, `admission_rate`, `avg_sat_admitted`, `median_debt_completed`

In [None]:
# Cluster 5 setup: display name, the columns to cluster on, and the
# candidate cluster counts (2 through 19) to evaluate.
cluster_name_5 = 'Admission with Debt'
cluster_vars_5 = [
    'admission_rate',
    'avg_sat_admitted',
    'median_debt_completed',
]
k_range = range(2, 20)

In [None]:
# Elbow analysis for cluster 5.  Uses this section's own variable list
# (cluster_vars_5) rather than cluster 1's `cluster_vars`, so editing the
# cluster 5 feature set actually changes what is analysed here.
find_k(train_scaled, cluster_vars_5, k_range)

In [None]:
# create kmean object (seeded so cluster ids are reproducible)
kmeans = KMeans(n_clusters=5, random_state = 123)

# fit to train and assign cluster ids to observations.
# Uses this section's own variable list (cluster_vars_5) rather than
# cluster 1's `cluster_vars`.
kmeans.fit(train_scaled[cluster_vars_5])

# NOTE(review): the result is still stored in `cluster1`, overwriting the
# cluster 1 labels, because the following cell reads `cluster1` -- confirm
# whether a separate `cluster5` variable/column was intended.
cluster1 = kmeans.predict(train_scaled[cluster_vars_5])

cluster1

In [None]:
# Store each training row's cluster label as a column.
# NOTE(review): this reuses the column name written in the Cluster 1 section,
# so it overwrites that column -- confirm that is intended.
train_scaled['admission_to_debt_cluster'] = cluster1

In [None]:
# Display one indicator column per cluster label.  The result is not assigned,
# so train_scaled itself is unchanged.
# NOTE(review): the 'area' prefix does not match this cluster's name -- confirm it is intended.
pd.get_dummies(train_scaled['admission_to_debt_cluster'], prefix= 'area', drop_first=False)

In [None]:
# visualize the cluster: bar plot of roi_5yr for each cluster label
sns.barplot(data=train_scaled, x='admission_to_debt_cluster', y='roi_5yr')
# Show the figure only after it has been drawn; calling plt.show() first
# displays nothing when run outside an inline notebook backend.
plt.show()