In [None]:
#---------------Logistic Regression---------------


# Logistic regression is named for the function used at the core
# of the method, the logistic function.

# The logistic function, also called the sigmoid function was developed by statisticians
# to describe properties of population growth in ecology, rising quickly and maxing out at
# the carrying capacity of the environment. It’s an S-shaped curve that can take any real-valued 
# number and map it into a value between 0 and 1, but never exactly at those limits.

# This is the logistic function ---> 1 / (1 + e^-value)

'''
        Logistic regression uses the same equation as linear equation i.e the line 
        equation but with a twist.
        The Line equation :
            
            y = B0 + B1 * x

        This is a line where y is the output variable we want to predict,
        x is the input variable we know and B0 and B1 are coefficients we need to estimate.
        
        The equation of Logistic Regression is:
            
            y  = B0 + B1 * x
            yhat = sigmoid(y) i.e yhat = 1 / (1 + e^-y) where e is the Euler's number or the
            EXP() function
            
        here we use the logistic function also known as sigmoid function because we have to classify a value
        between 0 and 1 i.e Logistic regression is used for binary classification it is a regression algorithm 
        used for binary classification.
        
        The role of sigmoid funcion here is to convert the value of y into either 0 or 1
        we can say that it return as probability value if it is greater than some specified threshold
        then it returns 1 or returns 0 .
'''


In [None]:
'''
******Pros of logistic regression******

 1 : Simple and linear

 2 : Reliable

 3 : No parameters to tune

******Cons of LR*******

 1 : Cannot handle non-linearities in the data'''

'''
******Some applications of Logistic Regression****** 

 1 : Image Segmentation and Categorization

 2 : Geographic Image Processing

 3 : Handwriting recognition

 4 : Healthcare : Analyzing a group of over million people for myocardial infarction 
     within a period of 10 years is an application area of logistic regression.

 5 : Prediction whether a person is depressed or not based on bag of words from the 
     corpus seems to be conveniently solvable using logistic regression and SVM.
     
'''


In [90]:
#importing our libraries that will be used
import pandas as pd
from sklearn.model_selection import train_test_split


In [91]:
# Getting our dataset we will work with credit card fraud detection

dataset = pd.read_csv('Downloads/creditcard.csv')

# Cleaning our dataset removing the rows with values NaN 

dataset.dropna()

# Printing first five instances of our dataset
# The pandas function head( prints the first five instances of dataframe )
# The pandas function tail( prints the first five instances of dataframe )

dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [98]:
#Getting the X and Y data 
# where the X values are the number of features excluding class column
# and Y is the target i.e class

X = dataset.ix[:,dataset.columns != 'Class']
y = dataset['Class']

In [99]:
# Printing the first five instances of X to check the data is appropriate or not

X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


In [100]:
# Printing the first five instances of y

y.head()

0    0
1    0
2    0
3    0
4    0
Name: Class, dtype: int64

In [101]:
# Splitting the data into training and testing

x_train,x_test,y_train,y_test = train_test_split(X,y)

In [102]:
# Importing our Logistic regression classifier from sklearn library

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train,y_train)

# The score function returns the accuracy score of our model 

print('The accuracy score is:',clf.score(x_test,y_test)*100,'%')

The accuracy score is: 99.88904806044773 %


In [110]:

predict = clf.predict(x_train)
#print(prediction)

In [111]:
#predicting first five values from the dataset.

for i in range(5):
    if predict[i] == 0:
        print('Normal')
    else:
        print('Fraud')

Normal
Normal
Normal
Normal
Normal
