### In-Class Exercise 1: Load the clean community crime data

- Create a dataframe for predictor and response variables
- Scale the predictor variables as done in Feature Engineering 1 - Principal Components


In [None]:
from sklearn.manifold import TSNE

# Data processing and functions
import pandas as pd
import numpy as np
import scipy as sp

# Analytics and modeling
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn import manifold
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline
import statsmodels.api as sm
import statsmodels.sandbox.tools.tools_pca as sm_pca
from statsmodels.formula.api import ols as sm_ols
from statsmodels.stats.anova import anova_lm as sm_anova
from patsy.contrasts import Treatment
from patsy import dmatrices

# Graphing and visualizing
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib import cm
from pylab import savefig

# Setting graphing preferences
sns.set(style="darkgrid", color_codes=True)

# Printing
import locale

# Show plots locally
locale.setlocale( locale.LC_ALL, '' )

%matplotlib inline

In [None]:
# Where the data lives. `path` ends with a slash, so a file name can be
# appended to it directly. `label` names the variable-label file; it is not
# read in this cell.
# NOTE(review): `path` is an absolute, machine-specific directory -- adjust it
# before running on another machine.
path, file, label = (
    "/Users/mead/Fall2017/DonBrown-DS6001/FeatureEngineering/",
    "CrimeDataClean.csv",
    "CrimeVariableLabels.txt",
)

# DATA: read the cleaned crime table into a DataFrame
crime_df_clean = pd.read_csv(filepath_or_buffer=path + file, low_memory=False)

In [None]:
# Split the column labels by position: the first 101 labels are the predictors,
# the labels from position 102 onward are the responses.
# NOTE(review): position 101 falls in neither slice -- confirm that column is
# meant to be left out.
predictors, response = (
    crime_df_clean.columns[:101],
    crime_df_clean.columns[102:],
)

In [None]:
# Keep only the per-population crime rates and leave out the raw crime counts.
# Starting one position after 'murders', every second column position below 118
# is collected, followed by positions 118 and 119.
# NOTE(review): presumably counts and PerPop columns alternate in the CSV, and
# 118/119 are hard-coded positions -- verify against the file layout.
response_start = crime_df_clean.columns.get_loc('murders')
cols = list(range(response_start + 1, 118, 2)) + [118, 119]

In [None]:
# Response frame: the columns whose positions were collected in `cols`
crime_resp_df = crime_df_clean.loc[:, crime_df_clean.columns[cols]]

# Predictor frame: every column up to and including 'LemasPctOfficDrugUn'
crime_pred_df = crime_df_clean.loc[:, slice(None, 'LemasPctOfficDrugUn')]
crime_pred_df.shape

In [None]:
# Pick the columns we need to scale.
# Some predictors are percentages and are left alone; only the columns whose
# maximum exceeds 1000 are rescaled.
#
# The per-column maxima are computed once and split with complementary
# conditions (> 1000 / <= 1000). With the previous "< 1000" test a column whose
# maximum is exactly 1000 belonged to neither group and silently disappeared
# from the predictor set.
col_max = crime_pred_df.apply('max', axis = 0)

preds2scale = crime_pred_df.columns[col_max > 1000]
print(preds2scale)
preds2notscale = crime_pred_df.columns[col_max <= 1000]
print(preds2notscale)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale each selected predictor onto the range 0-100
scaler = MinMaxScaler(feature_range=(0,100))


# Actually perform the scaling.
# fit_transform returns a bare array, so the row index of crime_pred_df is
# re-attached here. Without it the new frame gets a fresh 0..n-1 index, and a
# column-wise concat with crime_pred_df would misalign rows whenever that frame
# carries any other index.
scaled_df = pd.DataFrame(scaler.fit_transform(crime_pred_df[preds2scale]),
                        columns = preds2scale,
                        index = crime_pred_df.index)

scaled_df.describe()

In [None]:
# Place the rescaled columns first and the untouched columns after them,
# side by side (rows are matched on the index)
pred_scaled_df = pd.concat(
    objs=[scaled_df, crime_pred_df.loc[:, preds2notscale]],
    axis='columns',
)

In [None]:
# Sanity check: (rows, columns) of the combined predictor frame
pred_scaled_df.shape

In [None]:
# Second name for the same DataFrame object (plain assignment, no copy);
# this is the name the later cells use
crimes_scaled_preds_df = pred_scaled_df
# Summary statistics of the scaled predictors
crimes_scaled_preds_df.describe()

### In-Class Exercise 2 

- Get two tSNE components for the scaled predictors using the default values for the hyperparameters
- Plot the observations in these components
- Plot the observations in these components with the violent crimes cut using the box plot statistics
- Plot the observations in these components with the nonviolent crimes cut using the box plot statistics
- Plot the observations in these components with one of the other response variables

In [None]:
# tSNE on the scaled predictors; perplexity is left at the library default

# Two output dimensions, so the embedding can be drawn as a scatter plot
components = 2

# Random initialisation with a fixed seed
tsne = TSNE(
    n_components=components,
    init='random',
    random_state=42,
)

# One row of embedding coordinates per observation
crimes_tsne = tsne.fit_transform(X=crimes_scaled_preds_df)

In [None]:
# Unlabelled view of the embedding: first tSNE component against the second
plt.scatter(x=crimes_tsne[:, 0], y=crimes_tsne[:, 1])

In [None]:
# Box plot of ViolentCrimesPerPop; the drawn artists supply the cut points
# used to bin this response in the following cells
violent_box = crime_resp_df.boxplot(column='ViolentCrimesPerPop',
                                    return_type='dict', sym='b')

# Each entry is (artist group, artist index, position in the artist's y-data),
# listed from the lowest statistic to the highest
violent_box_bounds = [
    violent_box[group][artist].get_ydata()[point]
    for group, artist, point in (
        ('whiskers', 0, 1),   # lower whisker
        ('boxes', 0, 0),      # lower box 25%
        ('medians', 0, 0),    # median
        ('boxes', 0, 2),      # upper box 75%
        ('whiskers', 1, 1),   # upper whisker
    )
]

print(violent_box_bounds)

In [None]:
# Labels for the five violent-crime bins, ordered from the lowest rates to the
# highest
crime_levels = ['Great', 'Okay', 'Neutral', 'Bad', 'Abyssmal']

# Bin edges: the five box plot statistics followed by the column maximum, so
# the last bin holds everything above the upper whisker.
# A new list is built instead of appending to violent_box_bounds:
# `bins = violent_box_bounds` only aliases that list, so every re-run of this
# cell used to append the maximum again, and pd.cut rejects duplicated edges.
bins = list(violent_box_bounds) + [crime_resp_df['ViolentCrimesPerPop'].max()]

In [None]:
# Label every observation with the crime level of the bin its violent crime
# rate falls in.
# include_lowest=True closes the first bin on the left. Without it a rate equal
# to the lowest edge (the lower whisker) is labelled NaN, and that observation
# silently drops out of every plot coloured by this variable.
Violent = pd.cut(crime_resp_df['ViolentCrimesPerPop'], bins,
                 labels = crime_levels, include_lowest = True)

In [None]:
# tSNE scatter plot with one colour per Violent Crime level.
# NOTE: this list rebinds the name `colors`, which the imports cell also uses
# as the alias for matplotlib.colors.
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose violent-crime label equals this level
    in_level = (Violent == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE Plot of the Predictor Variables with Violent Crime Rate')


In [None]:
# Box plot of nonViolPerPop; the drawn artists supply the cut points used to
# bin this response in the following cells
nonviolent_box = crime_resp_df.boxplot(column='nonViolPerPop',
                                       return_type='dict', sym='b')

# Each entry is (artist group, artist index, position in the artist's y-data),
# listed from the lowest statistic to the highest
nonviolent_box_bounds = [
    nonviolent_box[group][artist].get_ydata()[point]
    for group, artist, point in (
        ('whiskers', 0, 1),   # lower whisker
        ('boxes', 0, 0),      # lower box 25%
        ('medians', 0, 0),    # median
        ('boxes', 0, 2),      # upper box 75%
        ('whiskers', 1, 1),   # upper whisker
    )
]

print(nonviolent_box_bounds)

In [None]:
# Labels for the five nonviolent-crime bins, ordered from the lowest rates to
# the highest
crime_levels = ['Great', 'Okay', 'Neutral', 'Bad', 'Abyssmal']

# Bin edges: the five box plot statistics followed by the column maximum, so
# the last bin holds everything above the upper whisker.
# A new list is built instead of appending to nonviolent_box_bounds:
# `bins = nonviolent_box_bounds` only aliases that list, so every re-run of
# this cell used to append the maximum again, and pd.cut rejects duplicated
# edges.
bins = list(nonviolent_box_bounds) + [crime_resp_df['nonViolPerPop'].max()]

In [None]:
# Label every observation with the crime level of the bin its nonviolent crime
# rate falls in.
# include_lowest=True closes the first bin on the left. Without it a rate equal
# to the lowest edge (the lower whisker) is labelled NaN, and that observation
# silently drops out of every plot coloured by this variable.
NonViolent = pd.cut(crime_resp_df['nonViolPerPop'], bins,
                    labels = crime_levels, include_lowest = True)

# tSNE scatter plot with one colour per NonViolent Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose nonviolent-crime label equals this level
    in_level = (NonViolent == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE Plot of the Predictor Variables with Nonviolent Crime Rate')

In [None]:
# Box plot of rapesPerPop; the drawn artists supply the cut points used to bin
# this response in the following cells
rapes_box = crime_resp_df.boxplot(column='rapesPerPop',
                                  return_type='dict', sym='b')

# Each entry is (artist group, artist index, position in the artist's y-data),
# listed from the lowest statistic to the highest
rapes_box_bounds = [
    rapes_box[group][artist].get_ydata()[point]
    for group, artist, point in (
        ('whiskers', 0, 1),   # lower whisker
        ('boxes', 0, 0),      # lower box 25%
        ('medians', 0, 0),    # median
        ('boxes', 0, 2),      # upper box 75%
        ('whiskers', 1, 1),   # upper whisker
    )
]

print(rapes_box_bounds)


# Produce all of these with a log transformation

In [None]:
# Labels for the five rape-rate bins, ordered from the lowest rates to the
# highest
crime_levels = ['Great', 'Okay', 'Neutral', 'Bad', 'Abyssmal']

# Bin edges: the five box plot statistics followed by the column maximum, so
# the last bin holds everything above the upper whisker.
# A new list is built instead of appending to rapes_box_bounds:
# `bins = rapes_box_bounds` only aliases that list, so every re-run of this
# cell used to append the maximum again, and pd.cut rejects duplicated edges.
bins = list(rapes_box_bounds) + [crime_resp_df['rapesPerPop'].max()]

In [None]:
# Label every observation with the crime level of the bin its rape rate falls
# in.
# include_lowest=True closes the first bin on the left. Without it a rate equal
# to the lowest edge (the lower whisker) is labelled NaN, and that observation
# silently drops out of every plot coloured by this variable.
Rapes = pd.cut(crime_resp_df['rapesPerPop'], bins,
               labels = crime_levels, include_lowest = True)

# tSNE scatter plot with one colour per Rape Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose rape-rate label equals this level
    in_level = (Rapes == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE Plot of the Predictor Variables with Rape Rate')

### In-Class Exercise 3 

- Get two tSNE components for the scaled predictors using perplexity values of 5 and 100
- Plot the observations in these components
- Plot the observations in these components with the violent crimes cut using the box plot statistics
- Plot the observations in these components with the nonviolent crimes cut using the box plot statistics
- Plot the observations in these components with one of the other response variables

In [None]:
# tSNE on the scaled predictors -- PERPLEXITY OF 5

# Two output dimensions, so the embedding can be drawn as a scatter plot
components = 2

# Random initialisation with a fixed seed
tsne = TSNE(
    n_components=components,
    init='random',
    random_state=42,
    perplexity=5,
)

# Overwrites the embedding stored in crimes_tsne by the earlier tSNE cell
crimes_tsne = tsne.fit_transform(X=crimes_scaled_preds_df)

In [None]:
# Unlabelled view of the current embedding: first component against the second
plt.scatter(x=crimes_tsne[:, 0], y=crimes_tsne[:, 1])

In [None]:
# tSNE scatter plot with one colour per Violent Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose violent-crime label equals this level
    in_level = (Violent == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE (perplexity = 5) Plot of the Predictor Variables with Violent Crime Rate')


In [None]:
# tSNE scatter plot with one colour per NonViolent Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose nonviolent-crime label equals this level
    in_level = (NonViolent == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE (perplexity = 5) Plot of the Predictor Variables with NonViolent Crime Rate')


In [None]:
# tSNE scatter plot with one colour per Rape Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose rape-rate label equals this level
    in_level = (Rapes == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE (perplexity = 5) Plot of the Predictor Variables with Rape Crime Rate')


In [None]:
# tSNE on the scaled predictors -- PERPLEXITY = 100

# Two output dimensions, so the embedding can be drawn as a scatter plot
components = 2

# Random initialisation with a fixed seed
tsne = TSNE(
    n_components=components,
    init='random',
    random_state=42,
    perplexity=100,
)

# Overwrites the embedding stored in crimes_tsne by the earlier tSNE cell
crimes_tsne = tsne.fit_transform(X=crimes_scaled_preds_df)

In [None]:
# Unlabelled view of the current embedding: first component against the second
plt.scatter(x=crimes_tsne[:, 0], y=crimes_tsne[:, 1])

In [None]:
# tSNE scatter plot with one colour per Violent Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose violent-crime label equals this level
    in_level = (Violent == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
# Title typo fixed ('PtSNE' -> 'tSNE') so it matches the other figure titles
ax.set_title('tSNE (perplexity = 100) Plot of the Predictor Variables with Violent Crime Rate')


In [None]:
# tSNE scatter plot with one colour per Non-Violent Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose nonviolent-crime label equals this level
    in_level = (NonViolent == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE (perplexity = 100) Plot of the Predictor Variables with Non-Violent Crime Rate')


In [None]:
# tSNE scatter plot with one colour per Rape Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose rape-rate label equals this level
    in_level = (Rapes == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE (perplexity = 100) Plot of the Predictor Variables with Rape Crime Rate')


### In-Class Exercise 4 

- Get two tSNE components for the scaled predictors with log transformations using at least two different values for the hyperparameters
- Plot the observations in these components
- Plot the observations in these components with the violent crimes cut using the box plot statistics
- Plot the observations in these components with the nonviolent crimes cut using the box plot statistics
- Plot the observations in these components with one of the other response variables

In [None]:
# Natural log of every scaled predictor. 0.1 is added to each value first;
# the min-max scaled columns contain 0, whose log is not finite.
# NOTE(review): assumes the unscaled columns hold no values <= -0.1 -- confirm.
crimes_scaled_preds_log_df = np.log(crimes_scaled_preds_df.add(.1))

In [None]:
# tSNE on the log-transformed predictors -- PERPLEXITY OF 5

# Two output dimensions, so the embedding can be drawn as a scatter plot
components = 2

# Random initialisation with a fixed seed
tsne = TSNE(
    n_components=components,
    init='random',
    random_state=42,
    perplexity=5,
)

# Overwrites the embedding stored in crimes_tsne by the earlier tSNE cell
crimes_tsne = tsne.fit_transform(X=crimes_scaled_preds_log_df)

In [None]:
# Unlabelled view of the current embedding: first component against the second
plt.scatter(x=crimes_tsne[:, 0], y=crimes_tsne[:, 1])

In [None]:
# tSNE scatter plot with one colour per Violent Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose violent-crime label equals this level
    in_level = (Violent == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE (perplexity = 5) Plot of the Log of the Predictor Variables with Violent Crime Rate')

In [None]:
# tSNE scatter plot with one colour per Non-Violent Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose nonviolent-crime label equals this level
    in_level = (NonViolent == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE (perplexity = 5) Plot of the Log of the Predictor Variables with Non-Violent Crime Rate')


In [None]:
# tSNE scatter plot with one colour per Rape Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose rape-rate label equals this level
    in_level = (Rapes == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE (perplexity = 5) Plot of the Log of the Predictor Variables with Rape Crime Rate')


In [None]:
# tSNE on the log-transformed predictors -- PERPLEXITY OF 100

# Two output dimensions, so the embedding can be drawn as a scatter plot
components = 2

# Random initialisation with a fixed seed
tsne = TSNE(
    n_components=components,
    init='random',
    random_state=42,
    perplexity=100,
)

# Overwrites the embedding stored in crimes_tsne by the earlier tSNE cell
crimes_tsne = tsne.fit_transform(X=crimes_scaled_preds_log_df)

In [None]:
# Unlabelled view of the current embedding: first component against the second
plt.scatter(x=crimes_tsne[:, 0], y=crimes_tsne[:, 1])

In [None]:
# tSNE scatter plot with one colour per Violent Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose violent-crime label equals this level
    in_level = (Violent == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE (perplexity = 100) Plot of the Log of the Predictor Variables with Violent Crime Rate')

In [None]:
# tSNE scatter plot with one colour per Non-Violent Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose nonviolent-crime label equals this level
    in_level = (NonViolent == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE (perplexity = 100) Plot of the Log of the Predictor Variables with Non-Violent Crime Rate')


In [None]:
# tSNE scatter plot with one colour per Rape Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose rape-rate label equals this level
    in_level = (Rapes == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE (perplexity = 100) Plot of the Log of the Predictor Variables with Rape Crime Rate')


### In-Class Exercise 5

- Insert a principal component and comparable t-SNE plot
- Compare and contrast these two plots using criteria of your choosing

### In-Class Exercise 5 Answers

- I decided to include a PCA plot and a tSNE plot (perplexity = 30) for Violent Crime using the log-transformed predictors
- Looking at these plots, you can see that both methods have managed to find some structure in the data, such that we can see a separation between towns with more Violent Crime and towns with less Violent Crime. In this case, the first two components of the PCA have managed to induce a linear separation. In the tSNE plot, though, we actually get a much better look at the non-linear relationship underlying the data.

In [None]:
# tSNE on the log-transformed predictors -- PERPLEXITY OF 30

# Two output dimensions, so the embedding can be drawn as a scatter plot
components = 2

# Random initialisation with a fixed seed
tsne = TSNE(
    n_components=components,
    init='random',
    random_state=42,
    perplexity=30,
)

# Overwrites the embedding stored in crimes_tsne by the earlier tSNE cell
crimes_tsne = tsne.fit_transform(X=crimes_scaled_preds_log_df)

## PCA on the same log-transformed predictor variables

# Ask for as many components as there are predictor columns
n = crimes_scaled_preds_log_df.shape[1]

pca = PCA(n_components=n)

# Component scores, one row per observation
crimes_pca = pca.fit_transform(X=crimes_scaled_preds_log_df)


In [None]:
# First two principal components of the log-transformed predictors,
# one colour per Violent Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose violent-crime label equals this level
    in_level = (Violent == level)
    ax.scatter(crimes_pca[in_level, 0], crimes_pca[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('Principal Component Plot of the log of the Predictor Variables with Violent Crime Rate')

In [None]:
# tSNE embedding of the log-transformed predictors,
# one colour per Violent Crime level
colors = ['navy', 'green', 'darkorange', 'pink', 'red']

fig, ax = plt.subplots()
for level, shade in zip(crime_levels, colors):
    # Rows whose violent-crime label equals this level
    in_level = (Violent == level)
    ax.scatter(crimes_tsne[in_level, 0], crimes_tsne[in_level, 1],
               c=shade, label=level)

ax.legend(loc=3)  # 3 = lower left
ax.grid(True)
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title('tSNE (perplexity = 30) Plot of the Log of the Predictor Variables with Violent Crime Rate')