In [1]:
# import the relevant library
import psycopg2 as pg

# import the custom configuration library created for postgresql
from config import config

# load the config class object with the default database parameters
params = config()

# connect to the postgresql instance using the parameters above
conn = pg.connect(**params)

# create a cursor to trigger queries and fetch results
curs = conn.cursor()

# execute the SQL query to pull the scaled feature set into a variable
curs.execute('select * from dtml_mstr_scld;')
query_op = curs.fetchall()

In [2]:
# convert the result to a python dataframe
import pandas as pd

# store the converted dataframe as feature set
X = pd.DataFrame(query_op)

# for i in range(10):
#     print(query_op[i])

In [6]:
# we have our feature set now with 'id' as the pkey.
print(X.head(2))

     0     1          2          3          4          5          6   \
0  1664  1665  42.491388  43.481895  43.007952 -43.526217  43.478949   
1  1665  1666  43.310071 -42.721131 -41.747681  41.690790  43.478949   

          7          8          9     ...           25   26   27        28  \
0  43.211374 -43.539516  43.064434    ...     1.791759  0.0  0.0  2.484907   
1  43.211374 -43.539516  43.064434    ...     1.098612  0.0  0.0  2.564949   

         29        30         31         32        33        34  
0  4.406719  2.079442 -41.203168  41.487241  4.990433  5.676754  
1  5.081404  2.564949   7.003974  41.487241  7.070724  0.000000  

[2 rows x 35 columns]


In [3]:
# retrievening y - response vector

# execute the SQL query to pull the response vector y into a variable
curs.execute('select id, sr_flag from dtml_mstr_raw where id <= (select max(id) from dtml_mstr_scld);')
query_op_y = curs.fetchall()

In [8]:
for i in range(10):
    print(query_op_y[i])

(1, 'N')
(2, 'N')
(3, 'N')
(4, 'N')
(5, 'Y')
(6, 'N')
(7, 'N')
(8, 'N')
(9, 'N')
(10, 'N')


In [4]:
y = pd.DataFrame(query_op_y)

In [19]:
print(X.shape)
print(y.shape)

# sort the dataframes so that they are in the order of pkey id
X.sort_values(by=1) # index for id column in X is 1
y.sort_values(by=0) # index for id column in y is 0

(1000000, 35)
(1000000, 2)


Unnamed: 0,0,1
0,1,N
1,2,N
2,3,N
3,4,N
4,5,Y
5,6,N
6,7,N
7,8,N
8,9,N
9,10,N


In [22]:
print(y[0][0], y[1][0])
print(pd.Series(y[1]))

1 N
0         N
1         N
2         N
3         N
4         Y
5         N
6         N
7         N
8         N
9         N
10        N
11        Y
12        N
13        N
14        N
15        N
16        N
17        N
18        N
19        N
20        N
21        N
22        N
23        N
24        N
25        N
26        N
27        N
28        N
29        N
         ..
999970    N
999971    Y
999972    N
999973    N
999974    Y
999975    N
999976    N
999977    N
999978    N
999979    N
999980    Y
999981    N
999982    N
999983    N
999984    N
999985    N
999986    Y
999987    Y
999988    N
999989    N
999990    N
999991    Y
999992    N
999993    N
999994    N
999995    N
999996    N
999997    N
999998    N
999999    N
Name: 1, Length: 1000000, dtype: object


In [5]:
# using LabelBinarizer to convert the response vector y to a binary value Y/N -> 0/1, for eg.
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
y_bin = lb.fit_transform(pd.Series(y[1]))

# convert the numpy array back to the pandas series
y_bin = pd.Series(y_bin.reshape(-1))

print('y Type: ', type(y))
print('y_bin Type: ', type(y_bin))

y Type:  <class 'pandas.core.frame.DataFrame'>
y_bin Type:  <class 'pandas.core.series.Series'>


In [10]:
# split the feature set and response vectors 
# into corresponding training and testin sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_bin, test_size=0.25, random_state=1)

In [12]:
# check the types of the records
print('X_train Type: ', type(X_train))
print('X_test Type: ', type(X_test))
print('y_train Type: ', type(y_train))
print('y_test Type: ', type(y_test))

# check the shapes of the records
print('X_train Shape: ', X_train.shape)
print('X_test Shape: ', X_test.shape)
print('y_train Shape: ', y_train.shape)
print('y_test Shape: ', y_test.shape)

print(y_train.head())
print(y_test.head())

# predicting everything as True(Y)
y_naive = pd.Series([1]*len(y_test))
print(y_naive.head())

X_train Type:  <class 'pandas.core.frame.DataFrame'>
X_test Type:  <class 'pandas.core.frame.DataFrame'>
y_train Type:  <class 'pandas.core.series.Series'>
y_test Type:  <class 'pandas.core.series.Series'>
X_train Shape:  (750000, 35)
X_test Shape:  (250000, 35)
y_train Shape:  (750000,)
y_test Shape:  (250000,)
668713    1
913109    0
33970     0
192873    1
980511    0
dtype: int64
276826    0
849425    0
504499    0
601054    0
980221    1
dtype: int64
0    1
1    1
2    1
3    1
4    1
dtype: int64


In [14]:
# Calculating the naive parameters for the data
# Pre-calculating the metrices for a naive model. 
# That is when the outcome is always predicted as true, how will the raw model behave.
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, classification_report

# predicting everything as True(Y)
y_naive = pd.Series([1]*len(y_test))

print('-------------------')
print('--- Naive Model ---')
print('-------------------')
print('Accuracy: ', accuracy_score(y_naive, y_test))
print('Recall: ', recall_score(y_naive, y_test))
print('Precision: ', precision_score(y_naive, y_test))
print('F1-Score: ', f1_score(y_naive, y_test))
# print('Matthews Correlation Coefficient: ', matthews_corrcoef(y_naive, y_test))
print('Classification Report: \n', classification_report(y_naive, y_test, target_names=['Short'], labels=[1]))
# print ("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))

-------------------
--- Naive Model ---
-------------------
Accuracy:  0.191248
Recall:  0.191248
Precision:  1.0
F1-Score:  0.3210884719218836
Classification Report: 
              precision    recall  f1-score   support

      Short       1.00      0.19      0.32    250000

avg / total       1.00      0.19      0.32    250000



In [20]:
# Classification
# Model 1: 
## LogisticRegression with GridSearchCV ## 

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [21]:
# attributes for GridSearchCV
pipeline = Pipeline([    
    ('clf', LogisticRegression())
])

parameters = {
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10),    
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)


In [None]:
grid_fit = grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


Exception in thread Thread-4:
Traceback (most recent call last):
  File "/home/anshul/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/anshul/anaconda3/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/home/anshul/anaconda3/lib/python3.6/multiprocessing/pool.py", line 405, in _handle_workers
    pool._maintain_pool()
  File "/home/anshul/anaconda3/lib/python3.6/multiprocessing/pool.py", line 246, in _maintain_pool
    self._repopulate_pool()
  File "/home/anshul/anaconda3/lib/python3.6/multiprocessing/pool.py", line 239, in _repopulate_pool
    w.start()
  File "/home/anshul/anaconda3/lib/python3.6/multiprocessing/process.py", line 105, in start
    self._popen = self._Popen(self)
  File "/home/anshul/anaconda3/lib/python3.6/multiprocessing/context.py", line 277, in _Popen
    return Popen(process_obj)
  File "/home/anshul/anaconda3/lib/python3.6/multiprocessing/popen_fork.py", line 19, 