In [None]:
# Generic imports for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# This is new
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Show floats in pandas output with a thousands separator and two decimal places
pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode: display the value of every top-level
# expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [None]:
# Load the admissions data set from its CSV file
# (the path is relative to the notebook's working directory)
admission_data = pd.read_csv(filepath_or_buffer='Kaggle_Data/admission_data_ng.csv')

# Preview the first five rows
admission_data.head(n=5)

In [None]:
# Scatter the two test scores against each other, one series per 'Admit' label
X0 = admission_data.loc[admission_data['Admit'] == 0]  # rows labelled 0, drawn as 'reject'
X1 = admission_data.loc[admission_data['Admit'] == 1]  # rows labelled 1, drawn as 'admit'
plt.scatter(
    X0['Test 1 Score'], X0['Test 2 Score'],
    color='red', marker='o', label='reject',
)
plt.scatter(
    X1['Test 1 Score'], X1['Test 2 Score'],
    color='blue', marker='x', label='admit',
)
plt.xlabel('Test 1 Score')
plt.ylabel('Test 2 Score')
plt.legend(loc='lower left')
plt.show()

In [None]:
# Number of missing values in each column
admission_data.isnull().sum()

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the 'Admit' labels makes
# sure the class fractions are maintained in both splits; random_state pins the shuffle.
admission_features = admission_data.drop(columns=['Admit'])
admission_labels = admission_data['Admit']
X_train, X_test, y_train, y_test = train_test_split(
    admission_features,
    admission_labels,
    test_size=0.2,
    stratify=admission_labels,
    random_state=50,
)
# Each of the four pieces is displayed (the notebook shows every top-level expression)
X_train
X_test
y_train
y_test

In [None]:
# Unregularised logistic regression.
# penalty = None replaces the string 'none', which scikit-learn deprecated in 1.2 and
# later removed (so this needs scikit-learn >= 1.2). multi_class is no longer passed:
# 'auto' was already its default and the parameter is deprecated in newer releases.
model = LogisticRegression(fit_intercept = True, solver = 'lbfgs', penalty = None)
# Other solvers can also be tried.
# While doing regularization, use penalty = 'l2' and also C = 10.0 (need to try other values too)

model.fit(X_train, y_train)

# The following gives the mean accuracy on the given (training) data and labels
model.score(X_train, y_train)

# These are the fitted coefficients Beta_1, ..., Beta_p (one per feature column of X_train)
model.coef_

# This is the fitted intercept Beta_0
model.intercept_

In [None]:
# Predicted class for every held-out row, keyed by the original row index
test_output = pd.DataFrame(
    {'pred_Admit': model.predict(X_test)},
    index=X_test.index,
)
test_output.head()

In [None]:
# Attach the true labels next to the predictions (joined on the row index).
# Start from the prediction column only so that re-running this cell does not merge
# 'Admit' a second time (a repeated merge would yield 'Admit_x' / 'Admit_y' columns
# and break the cells below that look up 'Admit').
test_output = test_output[['pred_Admit']].merge(y_test, left_index = True, right_index = True)
test_output.head()
# model.score returns the mean accuracy as a fraction between 0 and 1;
# format it as a percentage so the number matches the printed label.
print('Percentage of correct predictions is ')
print('{:.2%}'.format(model.score(X_test, y_test)))

In [None]:
# Attach the feature columns of the held-out rows (joined on the row index).
# Start from the prediction/label columns only so that re-running this cell does not
# merge the features a second time (a repeated merge would yield '_x' / '_y' suffixed
# copies of every feature column).
test_output = test_output[['pred_Admit', 'Admit']].merge(X_test, left_index = True, right_index = True)
test_output.head()

In [None]:
# Predicted class probabilities for every row of the full data set (all columns except the label)
model.predict_proba(admission_data.drop(columns='Admit'))

In [None]:
# Probability of predicting class 1 for each training row
# (in multiclass, predict_proba returns one probability column per class)
train_admit_probability = model.predict_proba(X_train)[:, 1]

# Training features plus the true label and the predicted probability
data_with_prob = X_train.copy()
data_with_prob['Admit'] = y_train
data_with_prob['Probability'] = train_admit_probability
data_with_prob.head()

In [None]:
# Probability of predicting class 1 for each held-out row.
# Select exactly the feature columns the model was trained on instead of dropping the
# known non-feature columns: once 'Probability' exists, a drop-based selection would
# pass it to predict_proba as an extra feature and the cell would fail when re-run.
test_output['Probability'] = model.predict_proba(test_output[X_test.columns])[:,1]
test_output.head()

#### Visualize data

In [None]:
# Training set: scatter of the two test scores, split by whether thresholding the
# predicted probability at 0.5 agrees with the true label
train_is_reject = data_with_prob['Admit'] == 0
train_is_admit = data_with_prob['Admit'] == 1
train_below_half = data_with_prob['Probability'] < 0.5
train_at_least_half = data_with_prob['Probability'] >= 0.5

X0_right = data_with_prob.loc[train_is_reject & train_below_half]
X1_right = data_with_prob.loc[train_is_admit & train_at_least_half]
X0_wrong = data_with_prob.loc[train_is_reject & train_at_least_half]
X1_wrong = data_with_prob.loc[train_is_admit & train_below_half]

plt.scatter(
    X0_right['Test 1 Score'], X0_right['Test 2 Score'],
    color='red', marker='o', label='reject accurate',
)
plt.scatter(
    X1_right['Test 1 Score'], X1_right['Test 2 Score'],
    color='blue', marker='x', label='admit accurate',
)
plt.scatter(
    X0_wrong['Test 1 Score'], X0_wrong['Test 2 Score'],
    color='black', marker='o', label='reject inaccurate',
)
plt.scatter(
    X1_wrong['Test 1 Score'], X1_wrong['Test 2 Score'],
    color='cyan', marker='x', label='admit inaccurate',
)
plt.xlabel('Test 1 Score')
plt.ylabel('Test 2 Score')
plt.legend(loc='lower left')
plt.show()

In [None]:
# Testing set: scatter of the two test scores, split by whether thresholding the
# predicted probability at 0.5 agrees with the true label
test_is_reject = test_output['Admit'] == 0
test_is_admit = test_output['Admit'] == 1
test_below_half = test_output['Probability'] < 0.5
test_at_least_half = test_output['Probability'] >= 0.5

X0_right = test_output.loc[test_is_reject & test_below_half]
X1_right = test_output.loc[test_is_admit & test_at_least_half]
X0_wrong = test_output.loc[test_is_reject & test_at_least_half]
X1_wrong = test_output.loc[test_is_admit & test_below_half]

plt.scatter(
    X0_right['Test 1 Score'], X0_right['Test 2 Score'],
    color='red', marker='o', label='reject accurate',
)
plt.scatter(
    X1_right['Test 1 Score'], X1_right['Test 2 Score'],
    color='blue', marker='x', label='admit accurate',
)
plt.scatter(
    X0_wrong['Test 1 Score'], X0_wrong['Test 2 Score'],
    color='black', marker='o', label='reject inaccurate',
)
plt.scatter(
    X1_wrong['Test 1 Score'], X1_wrong['Test 2 Score'],
    color='cyan', marker='x', label='admit inaccurate',
)
plt.xlabel('Test 1 Score')
plt.ylabel('Test 2 Score')
plt.legend(loc='upper left')
plt.show()