In [2]:
import pandas as pd
import numpy as np

In [3]:
# Remote CSV read by the rest of the notebook (fetching it needs network access).
data_source = "http://www.ats.ucla.edu/stat/data/binary.csv"
# Parse the CSV straight from the URL into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_source)

In [4]:
# (rows, columns) of the loaded frame. `.shape` is read directly from `df`;
# slicing with `[:]` first would only build a throwaway copy of the frame.
df.shape

(400, 4)

In [5]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

admit      int64
gre        int64
gpa      float64
rank       int64
dtype: object

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,admit,gre,gpa,rank
count,400.0,400.0,400.0,400.0
mean,0.3175,587.7,3.3899,2.485
std,0.466087,115.516536,0.380567,0.94446
min,0.0,220.0,2.26,1.0
25%,0.0,520.0,3.13,2.0
50%,0.0,580.0,3.395,2.0
75%,1.0,660.0,3.67,3.0
max,1.0,800.0,4.0,4.0


In [11]:
# Correlation of each feature column (everything after the first column,
# "admit") with the admission outcome, listed from most positive to most negative.
# `.iloc` is used for the positional column slice because `.ix` was deprecated
# in pandas 0.20 and removed in pandas 1.0.
df.iloc[:, 1:].corrwith(df["admit"]).sort_values(ascending=False)

gre     0.184434
gpa     0.178212
rank   -0.242513
dtype: float64

In [12]:
# Largest value found in each column, ordered from the smallest maximum upward.
df.max(axis=0).sort_values(ascending=True)

admit      1.0
gpa        4.0
rank       4.0
gre      800.0
dtype: float64

In [13]:
# Target labels as an (n_samples, 1) column vector taken from the "admit" column.
y_data = np.reshape(df["admit"].values, (-1, 1))
# Peek at the first five labels.
y_data[0:5]

array([[0],
       [1],
       [1],
       [1],
       [0]])

In [14]:
from sklearn import preprocessing

# Min-max scaler with default settings: each fitted column is rescaled
# independently onto the [0, 1] range.
min_max_scaler = preprocessing.MinMaxScaler()

# Candidate features are every column after the first one ("admit", the
# target). Columns holding exactly two distinct values are left unscaled.
# `.iloc` replaces `.ix`, which was removed in pandas 1.0.
scaled_column = [
    col_name
    for col_name in df.iloc[:, 1:].columns
    if len(df[col_name].unique()) != 2
]

# Work on a copy: a plain `df_scaled = df` only creates a second name for the
# same object, so the assignment below would silently overwrite the raw data
# in `df` and make every earlier displayed output stale.
df_scaled = df.copy()
df_scaled[scaled_column] = min_max_scaler.fit_transform(df_scaled[scaled_column])
df_scaled[:5]

Unnamed: 0,admit,gre,gpa,rank
0,0,0.275862,0.775862,0.666667
1,1,0.758621,0.810345,0.666667
2,1,1.0,1.0,0.0
3,1,0.724138,0.534483,1.0
4,0,0.517241,0.385057,1.0


In [15]:
# Feature matrix: every column after the first one ("admit") as a NumPy array.
# `.iloc` is the positional indexer; `.ix` was removed in pandas 1.0.
x_data = df_scaled.iloc[:, 1:].values
# Inspect one sample row (index 1).
x_data[1]

array([ 0.75862069,  0.81034483,  0.66666667])

In [25]:
from sklearn.model_selection import KFold
from sklearn import linear_model, datasets
from sklearn.metrics import precision_recall_fscore_support

# Logistic regression evaluated with shuffled 5-fold cross-validation.
logreg = linear_model.LogisticRegression(fit_intercept=True)
# `sklearn.cross_validation` was removed in scikit-learn 0.20. Its replacement,
# `model_selection.KFold`, takes `n_splits` (not the sample count and `n_folds`)
# and yields the index pairs from `.split(...)` instead of being iterated itself.
# A fixed `random_state` makes the shuffled folds reproducible across re-runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(kf.split(x_data)):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))
    X_train, X_test = x_data[train_index], x_data[test_index]
    y_train, y_test = y_data[train_index], y_data[test_index]

    # y_data is a column vector; flatten it to the 1-D label array `fit` expects.
    logreg.fit(X_train, y_train.ravel())

    y_pred = logreg.predict(X_test)
    y_true = y_test.ravel()
    # Fraction of test samples whose predicted label matches the true label.
    hit_count = (y_pred == y_true).sum()
    total_count = len(test_index)
    accuracy_rate = hit_count / total_count
    print("#",i, accuracy_rate)
    # NOTE: despite the "Recall:" label, the printed value is the full
    # (precision, recall, fscore, support) tuple, each entry holding one
    # value per class.
    print("#",i, "Recall:", precision_recall_fscore_support(y_true, y_pred))

TRAIN: 320 TEST: 80
# 0 0.675
# 0 Recall: (array([ 0.73529412,  0.33333333]), array([ 0.86206897,  0.18181818]), array([ 0.79365079,  0.23529412]), array([58, 22]))
TRAIN: 320 TEST: 80
# 1 0.6625
# 1 Recall: (array([ 0.68493151,  0.42857143]), array([ 0.92592593,  0.11538462]), array([ 0.78740157,  0.18181818]), array([54, 26]))
TRAIN: 320 TEST: 80
# 2 0.6875
# 2 Recall: (array([ 0.67105263,  1.        ]), array([ 1.        ,  0.13793103]), array([ 0.80314961,  0.24242424]), array([51, 29]))
TRAIN: 320 TEST: 80
# 3 0.775
# 3 Recall: (array([ 0.77777778,  0.75      ]), array([ 0.96551724,  0.27272727]), array([ 0.86153846,  0.4       ]), array([58, 22]))
TRAIN: 320 TEST: 80
# 4 0.675
# 4 Recall: (array([ 0.67567568,  0.66666667]), array([ 0.96153846,  0.14285714]), array([ 0.79365079,  0.23529412]), array([52, 28]))
