## Student Name: Akhil Mathew
## Student ID: 100799990

In [1]:
#Load Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from sklearn.datasets import load_breast_cancer
# Load the raw dataset. The later cells expect it to contain the `id`,
# `diagnosis` and `Unnamed: 32` columns.
from pathlib import Path

# Relative to the notebook's working directory; change this if the CSV lives elsewhere.
DATA_PATH = Path('breast_cancer.csv')
if not DATA_PATH.is_file():
    # Fail with an actionable message instead of pandas' bare errno text.
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH.resolve()}'. Place breast_cancer.csv "
        "in the notebook's working directory or update DATA_PATH, then re-run this cell."
    )
data = pd.read_csv(DATA_PATH)
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'breast_cancer.csv'

In [None]:
# Dimensions of the frame as (rows, columns).
frame_shape = data.shape
print("shape of dataframe is : ", frame_shape)

# Column dtypes and non-null counts; info() prints its report directly.
data.info()

# Descriptive statistics, displayed as the cell's output.
summary_stats = data.describe()
summary_stats

The dataset contains 569 observations and 33 columns. "diagnosis" is the target variable, with two categories, Malignant (M) and Benign (B); all remaining columns are input features.
In addition, we can observe that the dataset contains both numerical and categorical features.

In [None]:
# Columns with at most one distinct non-null value have zero variance and cannot
# help the model, so we list them as candidates for removal.
# nunique() ignores NaN, so a completely empty column reports 0 distinct values;
# comparing with `<= 1` catches those as well as true constants.
unique_counts = data.nunique()
featureValues = unique_counts[unique_counts <= 1].to_dict()
# Names of the columns found above, in their original column order.
cols_to_drop = list(featureValues)
print("Columns having at most 1 unique value are :\n", cols_to_drop)

In [None]:
# Drop the columns that are not features: `id` is merely a unique identifier, and
# `Unnamed: 32` is presumably an empty artifact column of the CSV export (TODO confirm).
# Rebinding `data` with errors="ignore" (instead of two in-place drops) keeps the
# cell idempotent: re-running it no longer raises KeyError for already-removed columns.
data = data.drop(columns=["id", "Unnamed: 32"], errors="ignore")
# Percentage of null values in each remaining column.
data.isnull().mean()*100

In [None]:
# Exploratory data analysis ahead of fitting the logistic regression model.

# Target distribution: class shares as a pie (left) and raw counts as bars (right).
fig, (pie_ax, count_ax) = plt.subplots(1, 2, figsize=(10, 3))
diagnosis_share = data["diagnosis"].value_counts(normalize=True)
a = diagnosis_share.plot.pie(ax=pie_ax)
churnchart = sns.countplot(x=data["diagnosis"], ax=count_ax)
fig.tight_layout()
plt.show()

The charts above show that there are fewer malignant cases than benign cases,
indicating a class imbalance: class B has more observations than class M.

In [None]:
# Encode the dependent variable as integers: malignant ('M') -> 0, benign ('B') -> 1.
# The dtype guard makes the cell safe to re-run: mapping the already-encoded 0/1
# values through the same dictionary would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(data['diagnosis']):
    data['diagnosis'] = data['diagnosis'].map({'M':0, 'B':1})

In [None]:
# Heatmap of the pairwise correlations between the columns of `data`.
corr_fig, corr_ax = plt.subplots(figsize=(25, 25))
correlations = data.corr()
corr_plot = sns.heatmap(correlations, annot=True, linewidths=3, ax=corr_ax)
corr_ax.set_title("Correlation plot")
plt.show()

Most of the features in the dataset are correlated with one another and, as the figure shows, only a few of the independent variables are related to the dependent variable "diagnosis".

In [None]:
# Train-test split
# The dataset is divided into two subsets using scikit-learn:
#   - train subset: used to fit/train the model
#   - test subset:  used to evaluate the model
from sklearn.model_selection import train_test_split

features = data.drop('diagnosis', axis=1)
target = data['diagnosis']
# 20% of the rows are held out; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=101,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training can be chained.
logmodel = LogisticRegression().fit(X_train, y_train)
predictions = logmodel.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Text report of the baseline model's classification metrics on the test subset.
baseline_report = classification_report(y_true=y_test, y_pred=predictions)
print(baseline_report)

In [None]:
# Share of each label in the training subset, as fractions rather than counts.
train_label_share = y_train.value_counts(normalize=True)
train_label_share

In [None]:
# Class-weighted logistic regression.
# No hyperparameter search is run in this cell: GridSearchCV and StratifiedKFold
# are imported here but not used by the code below.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Every setting is left at its default except class_weight, which gives class 0
# a larger weight than class 1 to counter the class imbalance.
CLASS_WEIGHTS = {0: 0.7, 1: 0.3}
new_logmodel = LogisticRegression(class_weight=CLASS_WEIGHTS).fit(X_train, y_train)
new_predictions = new_logmodel.predict(X_test)

# Same report as for the baseline, so the two models can be compared directly.
print(classification_report(y_test, new_predictions))

In [None]:
#Confusion Matrix function

def plot_confusion_matrix(cm, classes=None, title='Confusion matrix'):
    """Draw a confusion matrix as a seaborn heatmap with a colour scale fixed to [0, 1].

    Parameters:
        cm: matrix of values to draw.
        classes: optional tick labels used for both axes. When given, the cells
            are also annotated with their values and the "YlGnBu" colour map is used.
        title: title placed above the plot.
    """
    heatmap_options = {'vmin': 0., 'vmax': 1.}
    if classes is not None:
        # Labelled variant: tick labels, colour map and large in-cell annotations.
        heatmap_options.update(
            cmap="YlGnBu",
            xticklabels=classes,
            yticklabels=classes,
            annot=True,
            annot_kws={'size': 50},
        )
    sns.heatmap(cm, **heatmap_options)
    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Visualise the confusion matrix of the class-weighted model, normalised per row
# so that every row (true label) sums to 1.

cm = confusion_matrix(y_test, new_predictions)
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm / row_totals

plot_confusion_matrix(cm_norm, classes=new_logmodel.classes_, title='Confusion matrix')

Of the actual positive cases, 93% were predicted correctly and the remaining 7% were misclassified. Of the actual negative cases, 96% were predicted correctly as negative, while the remaining 4% were wrongly predicted as positive.

In [None]:

# The receiver operating characteristic (ROC) curve is another common tool used with
# binary classifiers. The dotted line represents the ROC curve of a purely random
# classifier; a good classifier stays as far away from that line as possible
# (toward the top-left corner).

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Predicted probability of the class in column 1 of predict_proba (label 1 here).
positive_class_proba = new_logmodel.predict_proba(X_test)[:,1]
# The AUC must be computed from the same continuous scores as the curve. Scoring the
# hard predict() labels instead yields the area of a single-threshold curve, which
# does not match the curve drawn below.
logit_roc_auc = roc_auc_score(y_test, positive_class_proba)
fpr, tpr, thresholds = roc_curve(y_test, positive_class_proba)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the performance of a purely random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Written to the current working directory before the figure is shown.
plt.savefig('Log_ROC')
plt.show()

As mentioned above, the diagram shows that our model performs well, since its ROC curve lies far from the diagonal dotted line.

In [None]:
# Prediction: labels predicted by the class-weighted model for the test subset.
y_pred=new_logmodel.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the class-weighted model's predictions on the test subset.
score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Display the accuracy computed in the previous cell.
score

In [None]:
# Persist the fitted class-weighted model to a pickle file so that the Flask app
# below can load it.
import pickle

# The context manager closes the file even if pickling raises, so no handle leaks.
with open("new_logmodel.pkl", "wb") as pickle_out:
    pickle.dump(new_logmodel, pickle_out)

In [None]:

import numpy as np
import pickle
import pandas as pd
from flask import Flask, request
import flasgger
from flasgger import Swagger

# Flask application; wrapping it with Swagger (flasgger) builds the API
# documentation from the route docstrings.
app=Flask(__name__)
Swagger(app)

# Load the pickled model once at start-up; the context manager closes the file
# handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code, so only load a file you created yourself.
with open("new_logmodel.pkl","rb") as pickle_in:
    classifier=pickle.load(pickle_in)


@app.route('/predict_test', methods=["POST"])
def predict_test_class():
    
    """Let's predict the breast cancer diagnosis class for an uploaded CSV file
    This is using docstrings for specifications.
    ---
    parameters:  
      - name: file
        in: formData
        type: file
        required: true
    responses:
        200:
            description: The output values
        400:
            description: No file was uploaded
        
    """
    # The text after `---` in the docstring above is the API spec read by flasgger;
    # keep its YAML indentation intact when editing it.
    uploaded_file = request.files.get("file")
    if uploaded_file is None:
        # Without this guard pd.read_csv(None) raises and the client only sees an opaque 500.
        return "No file was uploaded under the form field 'file'", 400

    # NOTE(review): the CSV presumably needs the same feature columns the model was
    # trained on; confirm against the training cell.
    df_test=pd.read_csv(uploaded_file)
    prediction=classifier.predict(df_test)
    # tolist() yields plain Python scalars, so the response reads "[0, 1, ...]" on every
    # NumPy version; str(list(ndarray)) renders items as "np.int64(0)" under NumPy 2.
    return " The Predicated Class for the TestFile is"+ str(prediction.tolist())


# Start the Flask server when this code runs as the main module.
# NOTE(review): inside a notebook kernel `__name__` is also '__main__', so this call
# would block the cell until the server stops; confirm that is intended.
if __name__=='__main__':
    app.run()