# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import auc,classification_report,mean_squared_error,roc_curve,confusion_matrix,accuracy_score
from glmnet import LogitNet
%matplotlib inline


# Reading the text file data and converting into a dataframe

In [2]:
# Load the tab-separated case-study file into a DataFrame, then display its
# (rows, columns) shape as the cell result.
data_frame = pd.read_csv("DScasestudy1.txt", sep="\t")
data_frame.shape


FileNotFoundError: File b'DScasestudy1.txt' does not exist

In [None]:
# Preview the first five rows of the loaded frame.
data_frame.head(5)


In [None]:
# Show the column labels of the loaded frame.
data_frame.columns


# Separating out the response column and the input columns

In [None]:
# Target: the 'response' column, selected with a list so it stays a
# one-column DataFrame rather than a Series.
response = data_frame[['response']]
# Predictors: every column except 'response'.
variables = data_frame.drop('response', axis=1)


In [None]:
# Report the (rows, columns) shape of the predictor frame.
print("data shape of variables",variables.shape)


In [None]:
# Report the (rows, columns) shape of the target frame.
print("data shape of response",response.shape)


# Checking the head of the variables and response datasets

In [None]:
# First ten rows of the predictors.
variables.head(10)


In [None]:
# First ten rows of the target.
response.head(10)


# Checking the tail of the variables and response datasets

In [None]:
# Last ten rows of the predictors.
variables.tail(10)


In [None]:
# Last ten rows of the target.
response.tail(10)


# Performing summary Statistics and getting to know about Variables dataset

In [None]:
# Print pandas' structural summary of the predictor frame.
variables.info()


In [None]:
# Descriptive statistics for the predictor columns.
variables.describe()


# Checking whether there are any Null Values in the Variables DataSet 

In [None]:
# Total number of missing cells in the predictor frame: the first sum() counts
# nulls per column and the second adds those per-column counts together.
# (variables.isnull().any() would instead flag which columns contain nulls.)
variables.isnull().sum().sum()


# Checking Null values in the Response DataSet

In [None]:
# Print pandas' structural summary of the target frame.
response.info()


In [None]:
# Descriptive statistics for the target column.
response.describe()


In [None]:
# Total number of missing cells in the target frame.
response.isnull().sum().sum()


# Plotting a bar chart to see how many 0's and 1's are present in the response dataset

In [None]:
# Report how many distinct labels the target holds and how often each occurs.
label_column = response['response']
print("number of unique values",label_column.nunique())
unique_values_count = label_column.value_counts()
print("value counts of unique values\n",unique_values_count)


In [None]:
# Bar chart of how often each class label occurs in the response column.
plt.style.use('ggplot')

# Count the labels from the data (ordered by label value) instead of
# hard-coding the frequencies, so the chart stays correct if the data changes.
label_counts = response['response'].value_counts().sort_index()

x = [str(label) for label in label_counts.index]

unique_values_frequency = label_counts.tolist()

x_pos = list(range(len(x)))

# The original passed the undefined name `unique_values` here, which raised
# NameError; the bar heights are the frequencies computed above.
plt.bar(x_pos, unique_values_frequency, color='lightblue')
plt.xlabel("1's and 0's")
plt.ylabel("Frequency")
plt.title("Bar Chart 1's and 0's")

plt.xticks(x_pos, x)

plt.show()


# Functions to calculate the train and test errors

In [None]:
def calc_train_error(X_train, y_train,model):
    """Return the in-sample mean squared error of an already-fitted model.

    Parameters
    ----------
    X_train : feature data passed to ``model.predict``.
    y_train : true targets corresponding to ``X_train``.
    model : fitted estimator exposing ``predict``.

    Returns
    -------
    The value of ``mean_squared_error(y_train, model.predict(X_train))``.
    """
    predictions = model.predict(X_train)
    # The MSE is what callers receive; the original also computed an RMSE
    # that was never used or returned, so that dead computation is dropped.
    return mean_squared_error(y_train, predictions)
    
def calc_validation_error(X_test, y_test, model):
    """Return the out-of-sample mean squared error of an already-fitted model.

    Parameters
    ----------
    X_test : held-out feature data passed to ``model.predict``.
    y_test : true targets corresponding to ``X_test``.
    model : fitted estimator exposing ``predict``.

    Returns
    -------
    The value of ``mean_squared_error(y_test, model.predict(X_test))``.
    """
    predictions = model.predict(X_test)
    # The MSE is what callers receive; the original also computed an RMSE
    # that was never used or returned, so that dead computation is dropped.
    return mean_squared_error(y_test, predictions)
    
def calc_metrics(X_train, y_train, X_test, y_test, model):
    """Fit ``model`` on the training data and return its train/validation errors.

    Note that this (re)fits the estimator passed in, so any earlier fit held
    by that object is replaced.

    Returns a ``(train_error, validation_error)`` tuple as computed by
    ``calc_train_error`` and ``calc_validation_error``; in this file both
    helpers return the mean squared error (not the RMSE).
    """
    model.fit(X_train, y_train)
    train_error = calc_train_error(X_train, y_train, model)
    validation_error = calc_validation_error(X_test, y_test, model)
    return train_error, validation_error


# Splitting into train and test datasets with different test sizes and fitting a lasso-penalized logistic regression with python-glmnet (LogitNet)

In [None]:
# Names used by the splitting cells below. These are plain assignments, so
# `data` and `target` refer to the same objects as `variables` and `response`
# (no copy is made).
data = variables
target = response


# Choosing Test_size = 0.4

In [None]:
# Hold out 40% of the rows for testing; rows are shuffled with a fixed seed (15).
x_train4, x_test4, y_train4, y_test4 = train_test_split(data,target,test_size=0.4,shuffle=True, random_state=15)


In [None]:
# Shared estimator object that later cells refer to as `model`.
model = LogitNet()
# Show the estimator and its parameters (the original printed the bound
# `fit` method object instead).
print(model)
# Fit a dedicated estimator for the 40% split. fit() hands back the estimator
# it was called on, so `model4 = model.fit(...)` would merely alias `model`
# and silently change whenever `model` is fitted again on another split.
model4 = LogitNet().fit(x_train4,y_train4)
print(model4.alpha)
print(model4.lambda_best_)
predictions4 = model4.predict(x_test4)


In [None]:
# Coefficient paths for the 40% split: one line per row of coef_path_[0]
# (presumably one per feature), plotted against the fitted lambda path.
for coefficient_path in model4.coef_path_[0]:
    plt.plot(model4.lambda_path_, coefficient_path)
axes = plt.gca()
axes.set_xlabel("lambda")
axes.set_ylabel("Coef Value")
# Clamp the right edge to the largest lambda, then flip the axis direction.
axes.set_xlim(right=model4.lambda_path_.max())
axes.invert_xaxis()
plt.show()


In [None]:
# Confusion matrix of the true test labels (first argument) against the
# predictions for the 40% split.
confusion_matrix(y_test4,predictions4)


In [None]:
# Classification report and mean squared error for the 40% split.
# NOTE(review): if labels and predictions are both 0/1, this MSE equals the
# misclassification rate — confirm the label encoding.
print(classification_report(y_test4,predictions4))
print("mean squared error",mean_squared_error(y_test4,predictions4))


# selecting test_size 0.3

In [None]:
# Hold out 30% of the rows for testing; rows are shuffled with a fixed seed (15).
x_train3, x_test3, y_train3, y_test3 = train_test_split(data,target,test_size=0.3,shuffle=True, random_state=15)


In [None]:
# Fit a dedicated estimator for the 30% split. fit() hands back the estimator
# it was called on, so reusing the shared `model` here would make `model3`
# the same object as every other `model.fit(...)` result and leave it holding
# only the most recent fit.
model3 = LogitNet().fit(x_train3,y_train3)
print(model3.alpha)
print(model3.lambda_best_)
predictions3 = model3.predict(x_test3)


In [None]:
# Coefficient paths for the 30% split: one line per row of coef_path_[0]
# (presumably one per feature), plotted against the fitted lambda path.
for coefficient_path in model3.coef_path_[0]:
    plt.plot(model3.lambda_path_, coefficient_path)
axes = plt.gca()
axes.set_xlabel("lambda")
axes.set_ylabel("Coef Value")
# Clamp the right edge to the largest lambda, then flip the axis direction.
axes.set_xlim(right=model3.lambda_path_.max())
axes.invert_xaxis()
plt.show()


In [None]:
# Confusion matrix of the true test labels (first argument) against the
# predictions for the 30% split.
confusion_matrix(y_test3,predictions3)


In [None]:
# Classification report and mean squared error for the 30% split.
print(classification_report(y_test3,predictions3))
print("mean squared error",mean_squared_error(y_test3,predictions3))


# selecting test_size 0.25

In [None]:
# Hold out 25% of the rows for testing; rows are shuffled with a fixed seed (15).
x_train25, x_test25, y_train25, y_test25 = train_test_split(data,target,test_size=0.25,shuffle=True, random_state=15)


In [None]:
# Fit a dedicated estimator for the 25% split. fit() hands back the estimator
# it was called on, so reusing the shared `model` here would make `model25`
# the same object as every other `model.fit(...)` result and leave it holding
# only the most recent fit.
model25 = LogitNet().fit(x_train25,y_train25)
print(model25.alpha)
print(model25.lambda_best_)
# Hard class predictions and per-class probabilities for the held-out rows.
predictions25 = model25.predict(x_test25)
predict_prob = model25.predict_proba(x_test25)


In [None]:
# Distinct values in column 1 of the predict_proba output (presumably the
# positive-class probability — confirm the class order).
np.unique(predict_prob[:,1])


In [None]:
# ROC curve for the 25% split, scored with column 1 of predict_proba, drawn
# on the unit square; the area under the curve is the cell's result.
positive_scores = predict_prob[:, 1]
fpr, tpr, _ = roc_curve(y_test25, positive_scores)
plt.plot(fpr, tpr)
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.show()
auc(fpr, tpr)


In [None]:
# Coefficient paths for the 25% split: one line per row of coef_path_[0]
# (presumably one per feature), plotted against the fitted lambda path.
for coefficient_path in model25.coef_path_[0]:
    plt.plot(model25.lambda_path_, coefficient_path)
axes = plt.gca()
axes.set_xlabel("lambda")
axes.set_ylabel("Coef Value")
# Clamp the right edge to the largest lambda, then flip the axis direction.
axes.set_xlim(right=model25.lambda_path_.max())
axes.invert_xaxis()
plt.show()


In [None]:
# Confusion matrix of the true test labels (first argument) against the
# predictions for the 25% split.
confusion_matrix(y_test25,predictions25)


In [None]:
# Classification report and mean squared error for the 25% split.
print(classification_report(y_test25,predictions25))
print("mean squared error",mean_squared_error(y_test25,predictions25))


# selecting test_size 0.2

In [None]:
# Hold out 20% of the rows for testing; rows are shuffled with a fixed seed (15).
x_train2, x_test2, y_train2, y_test2 = train_test_split(data,target,test_size=0.2,shuffle=True, random_state=15)


In [None]:
# Fit a dedicated estimator for the 20% split. fit() hands back the estimator
# it was called on, so reusing the shared `model` here would make `model2`
# the same object as every other `model.fit(...)` result and leave it holding
# only the most recent fit.
model2 = LogitNet().fit(x_train2,y_train2)
print(model2.alpha)
print(model2.lambda_best_)
# Hard class predictions and per-class probabilities for the held-out rows.
predictions2 = model2.predict(x_test2)
predict_proba = model2.predict_proba(x_test2)


In [None]:
# ROC curve for the 20% split, scored with column 1 of predict_proba, drawn
# on the unit square; the area under the curve is the cell's result.
positive_scores = predict_proba[:, 1]
fpr, tpr, _ = roc_curve(y_test2, positive_scores)
plt.plot(fpr, tpr)
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.show()
auc(fpr, tpr)


In [None]:
# Coefficient paths for the 20% split: one line per row of coef_path_[0]
# (presumably one per feature), plotted against the fitted lambda path.
for coefficient_path in model2.coef_path_[0]:
    plt.plot(model2.lambda_path_, coefficient_path)
axes = plt.gca()
axes.set_xlabel("lambda")
axes.set_ylabel("Coef Value")
# Clamp the right edge to the largest lambda, then flip the axis direction.
axes.set_xlim(right=model2.lambda_path_.max())
axes.invert_xaxis()
plt.show()


In [None]:
# Confusion matrix of the true test labels (first argument) against the
# predictions for the 20% split.
confusion_matrix(y_test2,predictions2)


In [None]:
# Classification report and mean squared error for the 20% split.
print(classification_report(y_test2,predictions2))
print("mean squared error",mean_squared_error(y_test2,predictions2))


In [None]:
# NOTE(review): `y_test` and `predictions` are only assigned in later cells
# (the train/validation/test split and the K-fold loop), so on a fresh
# top-to-bottom run this cell raises NameError. It looks like a leftover
# from out-of-order execution — confirm which split it was meant to evaluate.
confusion_matrix(y_test,predictions)


In [None]:
# NOTE(review): `y_test` and `predictions` are only assigned in later cells,
# so on a fresh top-to-bottom run this cell raises NameError — confirm which
# split it was meant to evaluate.
print(classification_report(y_test,predictions))
print("mean squared error",mean_squared_error(y_test,predictions))


# Calculating training and testing errors for all test sizes

In [None]:
def errors(trainig_x,training_y,testing_x,testing_y,your_model):
    """Fit ``your_model`` on the training split and print its train/test errors.

    Parameters
    ----------
    trainig_x, training_y : training features and targets.
    testing_x, testing_y : held-out features and targets.
    your_model : estimator accepted by ``calc_metrics`` (it is refitted).

    Prints three lines: ``train_error``, ``test_error`` and ``metrics`` (the
    ``(train_error, test_error)`` tuple). Returns ``None``.
    """
    # Fit once, up front, and report everything from that single fit. The
    # original printed the first two errors from whatever state the estimator
    # was in before calc_metrics refitted it, so the three printed values
    # could come from different fits (e.g. when the estimator object had
    # last been fitted on a different split).
    metrics = calc_metrics(trainig_x,training_y,testing_x,testing_y,your_model)
    train_error, test_error = metrics
    print("train_error",train_error)
    print("test_error",test_error)
    print("metrics",metrics)
    

In [None]:
# Print train/test errors for the test_size=0.4 split.
errors(x_train4,y_train4,x_test4,y_test4,model4)


In [None]:
# Print train/test errors for the test_size=0.3 split.
errors(x_train3,y_train3,x_test3,y_test3,model3)


In [None]:
# Print train/test errors for the test_size=0.25 split.
errors(x_train25,y_train25,x_test25,y_test25,model25)


In [None]:
# Print train/test errors for the test_size=0.2 split.
errors(x_train2,y_train2,x_test2,y_test2,model2)


# splitting my dataset into train, validation and test datasets

In [None]:
# Step 1: carve off a shuffled, seeded 20% test set; the rest is intermediate.
X_intermediate, X_test, y_intermediate, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=15,
)

# Step 2: split the intermediate rows 75/25 into train and validation without
# reshuffling.
# NOTE(review): with shuffle=False the random_state argument presumably has no
# effect — confirm against the scikit-learn documentation.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_intermediate,
    y_intermediate,
    test_size=0.25,
    shuffle=False,
    random_state=2018,
)


In [None]:
# Fit a dedicated estimator on the training part of the three-way split.
# fit() hands back the estimator it was called on, so reusing the shared
# `model` here would make `model_s` an alias that changes with every later
# `model.fit(...)` call.
model_s = LogitNet().fit(X_train,y_train)


In [None]:
# The intermediate split is no longer needed once train/validation exist.
del X_intermediate, y_intermediate

# Print each subset's share of the full dataset as a whole-number percentage.
# The original printed the rounded fraction followed by '%' (e.g. "0.6%"
# for a 60% share); the '%' format type multiplies by 100 before appending
# the sign.
total_rows = len(target)
print('train: {:.0%} | validation: {:.0%} | test {:.0%}'.format(
    len(y_train) / total_rows,
    len(y_validation) / total_rows,
    len(y_test) / total_rows,
))


In [None]:
# Mean squared error of model_s on each of the three subsets.
new_train_error = mean_squared_error(y_train, model_s.predict(X_train))
new_validation_error = mean_squared_error(y_validation, model_s.predict(X_validation))
new_test_error = mean_squared_error(y_test, model_s.predict(X_test))

# Report in the same order as before: train, test, validation.
for caption, error_value in (
    ("new train error", new_train_error),
    ("new test error", new_test_error),
    ("new validation error", new_validation_error),
):
    print(caption, error_value)


# Performing K-fold cross-validation, calculating the metrics and plotting the ROC curve for each fold

In [None]:
# K-fold cross-validation: for every fold, fit a fresh estimator, report
# classification metrics, draw the ROC curve, and collect the fold's
# train/validation errors, accuracy and AUC.
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=21)
train_errors = []
validation_errors = []
accuracy = []
auc_array = []
for train_index, val_index in kf.split(data, target):
    # Positional row selection for this fold's training and validation parts.
    X_train, X_val = data.iloc[train_index], data.iloc[val_index]
    y_train, y_val = target.iloc[train_index], target.iloc[val_index]

    # Genuinely instantiate a new estimator per fold; the original refitted
    # the single shared `model` object, so no fold had its own model.
    model_k = LogitNet().fit(X_train, y_train)

    predictions = model_k.predict(X_val)
    print(classification_report(y_val, predictions))
    fold_accuracy = accuracy_score(y_val, predictions)
    print("accuracy_score", fold_accuracy)
    accuracy.append(fold_accuracy)

    predict_prob = model_k.predict_proba(X_val)
    fpr, tpr, _ = roc_curve(y_val, predict_prob[:, 1])
    plt.plot(fpr, tpr)
    # Set the axis limits BEFORE show(); the original set them afterwards,
    # i.e. after the ROC figure had already been rendered.
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.show()

    fold_auc = auc(fpr, tpr)
    print("auc", fold_auc)
    auc_array.append(fold_auc)

    # Score the very model that produced the predictions above. The original
    # called calc_metrics here, which refits the estimator a second time on
    # the same fold before computing the errors.
    train_errors.append(calc_train_error(X_train, y_train, model_k))
    validation_errors.append(calc_validation_error(X_val, y_val, model_k))
    

In [None]:
# Show the per-fold error lists collected by the cross-validation loop.
print("train_errors",train_errors)
print("validation_errors",validation_errors)


# Calculating the mean train error, validation error, accuracy and AUC across folds

In [None]:
def _average(values):
    """Arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)


# Average each per-fold metric over all folds.
print("train_error mean", _average(train_errors))
print("validation_error mean", _average(validation_errors))
print("accuracy_mean", _average(accuracy))
print("Auc_mean", _average(auc_array))