In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # Allows the use of display() for DataFrames



# Pretty display for notebooks
%matplotlib inline


# Load the master dataset.
# A forward slash is used as the path separator: the previous "DTNA\s..." literal
# relied on "\s" being an unrecognised escape sequence (which newer Python
# versions warn about) and only resolved on Windows. "/" works on every platform.
data = pd.read_csv("DTNA/sprint3_master.csv")

# Preview the first five rows to sanity-check the load
display(data.head(n=5))

Unnamed: 0,CAL_DATE,ITEM_NO,LOC,SR_FLAG,CRFA_C,CRFA_R,CRFA_F,CRFA_A,EOQ,PKG_QTY,OUTPROC_FLAG,SCORE_862_INSTABILITY_AVG,AVG_ADJ_QTY,DESTINID,AVG_DAYS_LATE
0,9/19/2017,000 446 08 75,65,True,S,9,1,2,1,48,True,100.0,-1,65,0.35
1,9/19/2017,04-28651-000,65,True,S,9,4,2,1,50,False,100.0,-1,65,0.35
2,9/19/2017,05-30811-001,65,True,S,9,1,2,1,6,False,100.0,-1,65,0.35
3,9/19/2017,06-53144-000,65,True,S,9,1,2,1,400,False,100.0,-880,65,0.35
4,9/19/2017,06-93145-000,65,True,S,9,4,2,1,0,False,100.0,-1,65,0.35


In [2]:
# Split the data into features and target label
features_raw = data.drop('SR_FLAG', axis = 1)

# Encode the target as 0/1 with one vectorised comparison instead of a
# per-row lambda. The mapping is unchanged: values equal to False become 0,
# everything else becomes 1.
SR_FLAG = data['SR_FLAG'].ne(False).astype('int64')


In [3]:
# Expand CAL_DATE into calendar components, then drop the original column.
cal_date = pd.to_datetime(features_raw['CAL_DATE'])

# ISO week number. Series.dt.week / Series.dt.weekofyear are deprecated and were
# removed in pandas 2.0; dt.isocalendar().week (pandas >= 1.1) is the replacement.
# isocalendar() yields a nullable UInt32 column, so cast it back to a plain
# NumPy dtype: int64 when every date parsed, float64 (NaN) if any date is missing.
iso_week = cal_date.dt.isocalendar().week
iso_week = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

# Day of week, Monday=0. dayofweek and weekday are aliases, so compute it once.
day_of_week = cal_date.dt.dayofweek

# NOTE: WEEK/WEEK_OF_YEAR and DAY_OF_WEEK/WEEK_DAY hold identical values.
# Both names are kept because later cells reference every one of these columns.
date_parts = pd.DataFrame({"YEAR": cal_date.dt.year,
                           "MONTH": cal_date.dt.month,
                           "DAY": cal_date.dt.day,
                           "DAY_OF_YEAR": cal_date.dt.dayofyear,
                           "WEEK": iso_week,
                           "WEEK_OF_YEAR": iso_week,
                           "DAY_OF_WEEK": day_of_week,
                           "WEEK_DAY": day_of_week,
                           "QUARTER": cal_date.dt.quarter,
                          })

# Prepend the derived columns to the remaining features and discard the raw date
features_raw = date_parts.join(features_raw).drop('CAL_DATE', axis = 1)

display(features_raw.head(n = 5))

Unnamed: 0,DAY,DAY_OF_WEEK,DAY_OF_YEAR,MONTH,QUARTER,WEEK,WEEK_DAY,WEEK_OF_YEAR,YEAR,ITEM_NO,...,CRFA_R,CRFA_F,CRFA_A,EOQ,PKG_QTY,OUTPROC_FLAG,SCORE_862_INSTABILITY_AVG,AVG_ADJ_QTY,DESTINID,AVG_DAYS_LATE
0,19,1,262,9,3,38,1,38,2017,000 446 08 75,...,9,1,2,1,48,True,100.0,-1,65,0.35
1,19,1,262,9,3,38,1,38,2017,04-28651-000,...,9,4,2,1,50,False,100.0,-1,65,0.35
2,19,1,262,9,3,38,1,38,2017,05-30811-001,...,9,1,2,1,6,False,100.0,-1,65,0.35
3,19,1,262,9,3,38,1,38,2017,06-53144-000,...,9,1,2,1,400,False,100.0,-880,65,0.35
4,19,1,262,9,3,38,1,38,2017,06-93145-000,...,9,4,2,1,0,False,100.0,-1,65,0.35


In [4]:
# Columns that receive the log(1 + x) transform. AVG_ADJ_QTY is not in this list.
features_numeric = ['DAY','DAY_OF_WEEK','DAY_OF_YEAR','MONTH','QUARTER','WEEK','WEEK_DAY','WEEK_OF_YEAR','YEAR','LOC','CRFA_R', 'CRFA_F','CRFA_A','EOQ','PKG_QTY','SCORE_862_INSTABILITY_AVG','DESTINID','AVG_DAYS_LATE']

# Work on an explicit copy. pd.DataFrame(data=frame) does not copy by default,
# so assigning columns on it can also overwrite features_raw, and re-running
# this cell would then apply the log a second time.
features_log_transformed = features_raw.copy()

# np.log1p(x) computes log(1 + x) for the whole selection in one vectorised call
features_log_transformed[features_numeric] = np.log1p(features_raw[features_numeric])

In [5]:
# Preview the first five rows after the log transform
display(features_log_transformed.head(5))

Unnamed: 0,DAY,DAY_OF_WEEK,DAY_OF_YEAR,MONTH,QUARTER,WEEK,WEEK_DAY,WEEK_OF_YEAR,YEAR,ITEM_NO,...,CRFA_R,CRFA_F,CRFA_A,EOQ,PKG_QTY,OUTPROC_FLAG,SCORE_862_INSTABILITY_AVG,AVG_ADJ_QTY,DESTINID,AVG_DAYS_LATE
0,2.995732,0.693147,5.572154,2.302585,1.386294,3.663562,0.693147,3.663562,7.609862,000 446 08 75,...,2.302585,0.693147,1.098612,0.693147,3.89182,True,4.615121,-1,4.189655,0.300105
1,2.995732,0.693147,5.572154,2.302585,1.386294,3.663562,0.693147,3.663562,7.609862,04-28651-000,...,2.302585,1.609438,1.098612,0.693147,3.931826,False,4.615121,-1,4.189655,0.300105
2,2.995732,0.693147,5.572154,2.302585,1.386294,3.663562,0.693147,3.663562,7.609862,05-30811-001,...,2.302585,0.693147,1.098612,0.693147,1.94591,False,4.615121,-1,4.189655,0.300105
3,2.995732,0.693147,5.572154,2.302585,1.386294,3.663562,0.693147,3.663562,7.609862,06-53144-000,...,2.302585,0.693147,1.098612,0.693147,5.993961,False,4.615121,-880,4.189655,0.300105
4,2.995732,0.693147,5.572154,2.302585,1.386294,3.663562,0.693147,3.663562,7.609862,06-93145-000,...,2.302585,1.609438,1.098612,0.693147,0.0,False,4.615121,-1,4.189655,0.300105


In [6]:
# Import sklearn.preprocessing.MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale: the log-transformed numeric columns plus AVG_ADJ_QTY
features_numeric = ['DAY','DAY_OF_WEEK','DAY_OF_YEAR','MONTH','QUARTER','WEEK','WEEK_DAY','WEEK_OF_YEAR','YEAR','LOC','CRFA_R', 'CRFA_F','CRFA_A','EOQ','PKG_QTY','SCORE_862_INSTABILITY_AVG','AVG_ADJ_QTY','DESTINID','AVG_DAYS_LATE']
# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler() # default=(0, 1)

# NOTE(review): the scaler is fit on every row, before the train/test split,
# so test-set minima/maxima influence the training features. Consider fitting
# on the training split only.

# Work on an explicit copy. pd.DataFrame(data=frame) does not copy by default,
# so the assignment below could also overwrite features_log_transformed.
features_log_minmax_transform = features_log_transformed.copy()
features_log_minmax_transform[features_numeric] = scaler.fit_transform(features_log_transformed[features_numeric])

# Show an example of a record with scaling applied
display(features_log_minmax_transform.head(n = 5))

Unnamed: 0,DAY,DAY_OF_WEEK,DAY_OF_YEAR,MONTH,QUARTER,WEEK,WEEK_DAY,WEEK_OF_YEAR,YEAR,ITEM_NO,...,CRFA_R,CRFA_F,CRFA_A,EOQ,PKG_QTY,OUTPROC_FLAG,SCORE_862_INSTABILITY_AVG,AVG_ADJ_QTY,DESTINID,AVG_DAYS_LATE
0,0.830482,0.356207,0.820122,0.736966,0.563171,0.834033,0.356207,0.834033,0.0,000 446 08 75,...,0.460299,0.0,0.323657,0.0,0.422544,True,0.349242,1.0,1.0,0.441483
1,0.830482,0.356207,0.820122,0.736966,0.563171,0.834033,0.356207,0.834033,0.0,04-28651-000,...,0.460299,0.609205,0.323657,0.0,0.426888,False,0.349242,1.0,1.0,0.441483
2,0.830482,0.356207,0.820122,0.736966,0.563171,0.834033,0.356207,0.834033,0.0,05-30811-001,...,0.460299,0.0,0.323657,0.0,0.211272,False,0.349242,1.0,1.0,0.441483
3,0.830482,0.356207,0.820122,0.736966,0.563171,0.834033,0.356207,0.834033,0.0,06-53144-000,...,0.460299,0.0,0.323657,0.0,0.650779,False,0.349242,0.985936,1.0,0.441483
4,0.830482,0.356207,0.820122,0.736966,0.563171,0.834033,0.356207,0.834033,0.0,06-93145-000,...,0.460299,0.609205,0.323657,0.0,0.0,False,0.349242,1.0,1.0,0.441483


In [7]:
# Encode OUTPROC_FLAG as 0/1 with one vectorised comparison instead of a
# per-row lambda. The mapping is unchanged: values equal to False become 0,
# everything else becomes 1.
features_log_minmax_transform['OUTPROC_FLAG'] = features_log_minmax_transform['OUTPROC_FLAG'].ne(False).astype('int64')

display(features_log_minmax_transform.head(n = 5))

Unnamed: 0,DAY,DAY_OF_WEEK,DAY_OF_YEAR,MONTH,QUARTER,WEEK,WEEK_DAY,WEEK_OF_YEAR,YEAR,ITEM_NO,...,CRFA_R,CRFA_F,CRFA_A,EOQ,PKG_QTY,OUTPROC_FLAG,SCORE_862_INSTABILITY_AVG,AVG_ADJ_QTY,DESTINID,AVG_DAYS_LATE
0,0.830482,0.356207,0.820122,0.736966,0.563171,0.834033,0.356207,0.834033,0.0,000 446 08 75,...,0.460299,0.0,0.323657,0.0,0.422544,1,0.349242,1.0,1.0,0.441483
1,0.830482,0.356207,0.820122,0.736966,0.563171,0.834033,0.356207,0.834033,0.0,04-28651-000,...,0.460299,0.609205,0.323657,0.0,0.426888,0,0.349242,1.0,1.0,0.441483
2,0.830482,0.356207,0.820122,0.736966,0.563171,0.834033,0.356207,0.834033,0.0,05-30811-001,...,0.460299,0.0,0.323657,0.0,0.211272,0,0.349242,1.0,1.0,0.441483
3,0.830482,0.356207,0.820122,0.736966,0.563171,0.834033,0.356207,0.834033,0.0,06-53144-000,...,0.460299,0.0,0.323657,0.0,0.650779,0,0.349242,0.985936,1.0,0.441483
4,0.830482,0.356207,0.820122,0.736966,0.563171,0.834033,0.356207,0.834033,0.0,06-93145-000,...,0.460299,0.609205,0.323657,0.0,0.0,0,0.349242,1.0,1.0,0.441483


In [9]:
# Report the column count, one-hot encode the frame with pd.get_dummies,
# then report the column count again.
n_features_before = features_log_minmax_transform.shape[1]
print("{} total features before one-hot encoding.".format(n_features_before))

features_log_minmax_transform = pd.get_dummies(data=features_log_minmax_transform)

n_features_after = features_log_minmax_transform.shape[1]
print("{} total features after one-hot encoding.".format(n_features_after))


22 total features before one-hot encoding.
11431 total features after one-hot encoding.


In [10]:
# Import train_test_split.
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; sklearn.model_selection provides the same function.
from sklearn.model_selection import train_test_split

# Replace any remaining missing values with 0 before modelling
features_log_minmax_transform=features_log_minmax_transform.fillna(0)

# Split the features and the SR_FLAG target into training and testing sets
# (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(features_log_minmax_transform,
                                                    SR_FLAG,
                                                    test_size = 0.2,
                                                    random_state = 0)



In [11]:
# Baseline: a naive predictor that labels every record as positive (SR_FLAG == 1).
# Every actual positive is a true positive and every actual negative is a false
# positive, so recall is 1 and accuracy equals precision.
n_positive = np.sum(SR_FLAG)
n_total = len(SR_FLAG)
n_negative = n_total - n_positive

accuracy = n_positive / n_total
recall = 1
precision = n_positive / (n_positive + n_negative)

# F-beta with beta = 0.5: (1 + beta^2) * P * R / (beta^2 * P + R)
beta_squared = np.square(.5)
fscore = ((1 + beta_squared) * precision * recall) / ((beta_squared * precision) + recall)

print ("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))

Naive Predictor: [Accuracy score: 0.2996, F-score: 0.3484]


In [12]:
from sklearn.linear_model import LogisticRegression
# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# GridSearchCV is provided by sklearn.model_selection.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score,fbeta_score

# Baseline classifier with default hyper-parameters
clf = LogisticRegression()

# Hyper-parameter grid: C from 1e5 down to 1e-4 in powers of ten (same values
# and order as before, now uniformly floats), with and without class re-weighting.
parameters = {'C': [10.0**exponent for exponent in range(5, -5, -1)], 'class_weight': [None, 'balanced']}

# Score candidates with F-beta at beta = 0.5
# NOTE(review): some grid points may predict no positive samples, in which case
# scikit-learn warns that precision is ill-defined - confirm this is acceptable.
scorer = make_scorer(fbeta_score,beta=.5)

# Cross-validated grid search over the parameter grid on the training split
grid_obj = GridSearchCV(estimator=clf,param_grid=parameters,scoring=scorer)
grid_fit = grid_obj.fit(X_train,y_train)

# Get the estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the unoptimized model and the tuned model
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [18]:
# Report the before-and-after scores: first for the default-parameter model,
# then for the grid-search-tuned model. Each metric is computed once into a
# named value before being printed.
unoptimized_accuracy = accuracy_score(y_test, predictions)
unoptimized_fscore = fbeta_score(y_test, predictions, beta = 0.5)
optimized_accuracy = accuracy_score(y_test, best_predictions)
optimized_fscore = fbeta_score(y_test, best_predictions, beta = 0.5)

print("Unoptimized model\n------")
print("Accuracy score on testing data: %s" % format(unoptimized_accuracy, '.4f'))
print("F-score on testing data: %s" % format(unoptimized_fscore, '.4f'))
# The tuned-model results follow directly, without a separate header line
print("Final accuracy score on the testing data: %s" % format(optimized_accuracy, '.4f'))
print("Final F-score on the testing data: %s" % format(optimized_fscore, '.4f'))

Unoptimized model
------
Accuracy score on testing data: 0.8492
F-score on testing data: 0.7679
Final accuracy score on the testing data: 0.8627
Final F-score on the testing data: 0.7722
