# Classification

In this notebook, we'll explore **logistic regression** as a way of predicting a binary outcome and classifying data.

In [None]:
# Import our necessary toolboxes
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import scipy.io
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Part I: Example with GRE scores

<div class="alert alert-success"><b>Task</b>: Below, load in <code>logisticGRE.csv</code> (in our Data folder) using Pandas and assign it to <code>df</code>. Then, add another column <code>gre_adj</code> that is the original GRE scores minus their mean. Finally, create a scatter plot of the original scores versus the <code>admitted</code> column.</div>

In [None]:
# Load data

# Add mean-adjusted column


In [None]:
# Plot scatter


Clearly, this data is not appropriate for a linear regression. Let's see if a logistic regression can help us predict which scores lead to admission.

In [None]:
# Fit a logistic regression of admission on the mean-adjusted GRE score.
# (LogisticRegression is already imported in the first code cell.)

x = df['gre_adj'].to_numpy()  # Predictor: mean-adjusted GRE scores
y = df['admitted'].to_numpy() # Outcome: the 'admitted' column

logRegression = LogisticRegression() # Set up model

# scikit-learn expects X as a 2-D (n_samples, n_features) array but y as a 1-D array;
# passing y as a column vector triggers a DataConversionWarning, so only x is reshaped.
logRegression.fit(x.reshape(-1, 1), y) # Fit model

x_pred = np.arange(x.min(), x.max(), 1) # Create a range of values to predict

# One row per value in x_pred, one column per class (ordered as in logRegression.classes_)
y_pred = logRegression.predict_proba(x_pred.reshape(-1, 1)) # Predict y values

# Just take the second column (assumes 'admitted' is coded 0/1, so this is P(admitted = 1))
y_proba = y_pred[:, 1]

In [None]:
# Plot the predicted probabilities (second column of y_pred) against GRE score.
# np.mean(x_orig) is added to x_pred to shift the x-axis values.
# NOTE(review): `x_orig` is not defined in any cell visible in this notebook. Presumably it
# should hold the original (unadjusted) GRE scores from `df` -- confirm and define it in the
# data-loading cell; on a fresh kernel this line otherwise fails with a NameError.
plt.scatter(x_pred+np.mean(x_orig),y_pred[:,1],c='k') # black points: model output for each x_pred value
plt.title('Predicted Grad school admittance as function of GRE scores') # figure title
plt.xlabel('GRE scores') # x-axis label
plt.ylabel('Admitted') # y-axis label (the plotted values are predicted probabilities)
plt.show()

In [None]:
# Split into training and testing sets.
# The features and labels are read straight from `df`: no earlier cell defines
# free-standing `gre_adj` / `admitted` variables, so using them raised a NameError.
X_gre = df['gre_adj'].to_numpy().reshape(-1, 1)  # 2-D: (n_samples, 1 feature)
y_admitted = df['admitted'].to_numpy()           # 1-D labels
X_train, X_test, y_train, y_test = train_test_split(X_gre, y_admitted, test_size=0.2, random_state=42)

# Create and train the logistic regression model (on the training portion only)
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict probabilities for the test set; keep the second column of predict_proba
y_prob = model.predict_proba(X_test)[:, 1]

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# ROC analysis of the held-out predictions: false/true positive rates across
# thresholds, plus the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc_score = roc_auc_score(y_test, y_prob)

fig, ax = plt.subplots(figsize=(8, 6))

# Model ROC curve, with the AUC reported in the legend
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {auc_score:.2f})')
# Dashed diagonal from (0, 0) to (1, 1) as a reference line
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

## Part II: Back to our simulated neural population

In the last notebook, we used PCA followed by K-Means clustering to see if we could identify different cell types within our sample. Below, we'll give it a try with logistic regression.

In [None]:
# Specify fields to generate data.
# Column layout of the cell-type table: a type number, the transmission label, and then
# a (Mean, Std) pair of columns for each simulated property.
generatorFields=['type_num', 'transmission', 'latMean','latStd','volMean','volStd', 'depthMean', \
                 'depthStd','maxrateMean', 'maxrateStd','spontMean','spontStd','widthMean','widthStd', \
                 'axonMean','axonStd','dendriteMean','dendriteStd']

# Specify each property value.
# Each list below has 18 entries, given in the same order as generatorFields.
Type1=[1, 'excitatory', 14, .5, 150,30,500,20,.9, .1,.02,.01, 1, .05, 160, 20, 180,30 ] #Cortical Layer 4 pyramid
Type2=[2, 'excitatory', 15, .5, 120,30,300,20,.8, .1,.07,.01, 1, .04, 150, 20, 150,30 ] #L2 pyramidal
Type3=[3, 'inhibitory', 15, 1, 120,30, 300,20, .95,.1,.2,.1, .2,.001,150, 10, 150,10 ] #L2 inhibitory PV+
Type4=[4, 'inhibitory', 17, 4, 110,30,300,20, .3, .1,.02, .01,.3,.005,150, 10, 150,40 ] #L2 inhibitory SOM+
Type5=[5, 'excitatory', 22, 5, 180,20,800,100,.35,.2,.35, .1,.5,.1, 1000, 500, 200,60 ] #L6 excitatory pyramid
Type6=[6, 'inhibitory', 13, .5, 100,30,500,20,.95,.1,.2,.1, .2,.001,150, 10, 150,10 ] #L4 inhibitory FS+

# One row per cell type; the default integer index (0-5) is what later cells use to look up a type
dftype = pd.DataFrame([Type1,Type2,Type3,Type4,Type5,Type6],columns=generatorFields)
dftype

In [None]:
# Simulate a population of neurons by sampling each property around its cell type's mean.
cellFields = ['latency','volume','depth','maxrate','spont','width','axon','dendrite','transmission']

N_NEURONS = 100  # Number of neurons to simulate
SEED = 0         # Fixed seed so the simulated population is the same on every run;
                 # change it (or pass None) to draw a fresh sample
rng = np.random.default_rng(SEED)

# (dataset column, prefix of the matching '<prefix>Mean' / '<prefix>Std' columns in dftype)
propertyPrefixes = [('latency', 'lat'), ('volume', 'vol'), ('depth', 'depth'),
                    ('maxrate', 'maxrate'), ('spont', 'spont'), ('width', 'width'),
                    ('axon', 'axon'), ('dendrite', 'dendrite')]

# Collect one record per neuron and build the DataFrame once at the end. Growing a
# DataFrame with pd.concat inside the loop copies it on every iteration and leaves
# the numeric columns with object dtype.
records = []
for _ in range(N_NEURONS):
    # Randomly choose a cell type (a row of dftype, which uses the default 0..n-1 index)
    cellType = dftype.loc[rng.integers(len(dftype))]

    # Each property is the type's mean plus Gaussian jitter scaled by the type's std
    record = {field: cellType[prefix + 'Mean'] + rng.standard_normal() * cellType[prefix + 'Std']
              for field, prefix in propertyPrefixes}
    record['transmission'] = cellType['transmission']  # Excitatory or inhibitory?
    records.append(record)

dataset = pd.DataFrame(records, columns=cellFields)

print(dataset.shape)
nrows,ncolumns = dataset.shape
dataset.head()

In [None]:
# Prepare Data
# Every column except the last is a feature; the last column is the transmission label.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

# Binary labels for logistic regression: 0 where the label is 'excitatory', 1 otherwise
y_binary = (y != 'excitatory').astype(int)

# Hold out 20% of the cells for testing
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.20, random_state=0)

# Standardize using statistics learned from the training set only, then apply
# the same transformation to both splits
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Project onto the first two principal components (fit on the training set only)
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# DataFrame for plotting with seaborn; labels are stored as strings ('0' / '1')
df_train_pca = pd.DataFrame(X_train_pca, columns=['PC1', 'PC2']).assign(
    Transmission=y_train.astype(str)
)

<div class="alert alert-success"><b>Task</b>: Fit a logistic regression model to the training set.</div>

In [None]:
# 3. Fit Logistic Regression
model = LogisticRegression()
# TODO (task): replace the `...` placeholder with the training features and labels.
# Hint: the plotting cell below calls model.predict_proba(X_train_pca) and unpacks exactly
# two coefficients, so the model presumably needs to be fit on the two PCA components.
model.fit(...)

Now, we can visualize our results by coloring each point by the probability of being excitatory or inhibitory, and using the coefficients to reconstruct the logistic regression line in PCA space.

In [None]:
# Predict Probabilities
# Second column of predict_proba; this is P(label == 1), i.e. inhibitory, assuming the
# model was fit on y_train (0 = excitatory, 1 = inhibitory).
df_train_pca['Probability'] = model.predict_proba(X_train_pca)[:, 1]

# Stretch the observed probability range over the full colormap
norm = plt.Normalize(df_train_pca['Probability'].min(), df_train_pca['Probability'].max())

# Draw every point in a single scatter call, coloured by its probability *value*.
# (sns.lmplot treats `hue` as a grouping variable, so a continuous column produced one
# group per distinct probability and the colorbar taken from ax.collections[0] was not
# tied to the colours actually drawn.)
fig, ax = plt.subplots()
points = ax.scatter(df_train_pca['PC1'], df_train_pca['PC2'],
                    c=df_train_pca['Probability'], cmap='coolwarm', norm=norm)

# Create colorbar from the scatter itself so colours and scale always agree
cbar = fig.colorbar(points, ax=ax)
cbar.ax.set_ylabel('Probability of Inhibitory', rotation=270, labelpad=15)

# Overlay decision boundary: the line where b + w1*PC1 + w2*PC2 = 0 (probability 0.5)
b = model.intercept_[0]
w1, w2 = model.coef_[0]  # coef_ has shape (1, 2); take the row to get two scalars
xd = np.array([df_train_pca['PC1'].min(), df_train_pca['PC1'].max()])
if not np.isclose(w2, 0):
    # Solve for PC2: PC2 = m*PC1 + c
    c = -b/w2
    m = -w1/w2
    yd = m*xd + c
    ax.plot(xd, yd, 'k', lw=1, ls='--')
elif not np.isclose(w1, 0):
    # w2 == 0 means the boundary does not depend on PC2: draw it as a vertical line
    ax.axvline(-b/w1, color='k', lw=1, ls='--')

ax.set_title('Logistic Regression Decision Boundary')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
plt.show()

We can compare our logistic regression findings with the ground truth...

In [None]:
# Same PCA scatter, coloured by the true label rather than the model's probability.
# 'Transmission' holds y_train as strings: '0' = excitatory, '1' = inhibitory.
sns.lmplot(
    data=df_train_pca,
    x='PC1',
    y='PC2',
    hue='Transmission',
    palette='coolwarm',
    fit_reg=False,
)
plt.show()

### About this notebook
The examples in this notebook are based on those in *Neural Data Science* Chapters 7 and 9.