# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import pyodbc 
import getpass
import pandas.io.sql

# For Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import matplotlib


# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.pipeline import Pipeline 


# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
)

from feature_engine.transformation import (
    LogTransformer,
    YeoJohnsonTransformer,
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper


# Suppress warnings
# NOTE: this silences every warning category for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Pandas display settings
# pd.options.display.float_format = '{:.1f}'.format
pd.options.display.max_rows = 40000
pd.options.display.width = 1000

# Show all the columns of a DataFrame (None removes the column limit)
pd.options.display.max_columns = None

# Miscellaneous imports
from datetime import datetime
from scipy.stats import norm
import re

# Data Collection

In [2]:
# Print the name of every ODBC driver pyodbc reports, one per line,
# so the DRIVER value used in the connection string can be checked.
installed_drivers = pyodbc.drivers()
for driver in installed_drivers:
    print(driver)

SQL Server
SQL Server Native Client 11.0
SQL Server Native Client RDA 11.0
ODBC Driver 17 for SQL Server


In [3]:
# Open the SQL Server connection used by the data-loading cells below.
#
# Some other example server values are
# server = 'localhost\sqlexpress' # for a named instance
# server = 'myserver,port' # to specify an alternate port
server = 'IN3074556W1'
database = 'Advanced_Analytics'

# The connection uses Trusted_Connection=yes (Windows integrated
# authentication), so no username or password is passed to pyodbc.
# The earlier input()/getpass prompts collected credentials that were never
# used in the connection string; they are removed so the notebook can run
# unattended and no password is held in kernel memory.
#
# The string is assembled from adjacent literals instead of backslash
# continuations inside one literal, which embedded runs of indentation
# spaces between the keywords.
connection_string = (
    'DRIVER={ODBC Driver 17 for SQL Server};'
    f'SERVER={server};'
    f'DATABASE={database};'
    'Trusted_Connection=yes;'
)

cnxn = pyodbc.connect(connection_string)
cursor = cnxn.cursor()

# NOTE: cnxn is left open here; call cnxn.close() once all tables are loaded.

 Enter username:  MEA\XF762GD
 Enter Password:  ··········


In [4]:
# Load the full ConsumerElectronics table straight into a DataFrame,
# then report its dimensions and preview the first rows.
read = "select * from ConsumerElectronics"
df_Consumer_electronics = pd.read_sql(sql=read, con=cnxn)

print(df_Consumer_electronics.shape)
df_Consumer_electronics.head()

(1648824, 20)


Unnamed: 0,fsn_id,order_date,Year,Month,order_id,order_item_id,gmv,units,deliverybdays,deliverycdays,s1_fact_order_payment_type,sla,cust_id,pincode,product_analytic_super_category,product_analytic_category,product_analytic_sub_category,product_analytic_vertical,product_mrp,product_procurement_sla
0,ACCCX3S58G7B5F6P,2015-10-17 15:11:54,2015,10,3419301000000000.0,3419301000000000.0,6400.0,1,\N,\N,COD,5,-1.01299130778588e+18,-7.79175582905735e+18,CE,CameraAccessory,CameraAccessory,CameraTripod,7190.0,0
1,ACCCX3S58G7B5F6P,2015-10-19 10:07:22,2015,10,1420831000000000.0,1420831000000000.0,6900.0,1,\N,\N,COD,7,-8.99032457905512e+18,7.33541149097431e+18,CE,CameraAccessory,CameraAccessory,CameraTripod,7190.0,0
2,ACCCX3S5AHMF55FV,2015-10-20 15:45:56,2015,10,2421913000000000.0,2421913000000000.0,1990.0,1,\N,\N,COD,10,-1.0404429420466e+18,-7.47768776228657e+18,CE,CameraAccessory,CameraAccessory,CameraTripod,2099.0,3
3,ACCCX3S5AHMF55FV,2015-10-14 12:05:15,2015,10,4416592000000000.0,4416592000000000.0,1690.0,1,\N,\N,Prepaid,4,-7.60496084352714e+18,-5.83593163877661e+18,CE,CameraAccessory,CameraAccessory,CameraTripod,2099.0,3
4,ACCCX3S5AHMF55FV,2015-10-17 21:25:03,2015,10,4419525000000000.0,4419525000000000.0,1618.0,1,\N,\N,Prepaid,6,2.8945572083453e+18,5.34735360997242e+17,CE,CameraAccessory,CameraAccessory,CameraTripod,2099.0,3


In [5]:
# Drop the columns that are not wanted for this analysis.
# DataFrame.drop returns a new frame, so df_Consumer_electronics is untouched.
unwanted_columns = [
    'fsn_id',
    'order_id',
    'order_item_id',
    'cust_id',
    'pincode',
    'product_analytic_super_category',
    'product_procurement_sla',
]
df_Consumer_electronics1 = df_Consumer_electronics.drop(columns=unwanted_columns)

print(df_Consumer_electronics1.shape)
df_Consumer_electronics1.head()

(1648824, 13)


Unnamed: 0,order_date,Year,Month,gmv,units,deliverybdays,deliverycdays,s1_fact_order_payment_type,sla,product_analytic_category,product_analytic_sub_category,product_analytic_vertical,product_mrp
0,2015-10-17 15:11:54,2015,10,6400.0,1,\N,\N,COD,5,CameraAccessory,CameraAccessory,CameraTripod,7190.0
1,2015-10-19 10:07:22,2015,10,6900.0,1,\N,\N,COD,7,CameraAccessory,CameraAccessory,CameraTripod,7190.0
2,2015-10-20 15:45:56,2015,10,1990.0,1,\N,\N,COD,10,CameraAccessory,CameraAccessory,CameraTripod,2099.0
3,2015-10-14 12:05:15,2015,10,1690.0,1,\N,\N,Prepaid,4,CameraAccessory,CameraAccessory,CameraTripod,2099.0
4,2015-10-17 21:25:03,2015,10,1618.0,1,\N,\N,Prepaid,6,CameraAccessory,CameraAccessory,CameraTripod,2099.0


In [6]:
# Drop the rows that are not wanted, in two steps:
#   1. keep only rows whose product_analytic_category is
#      'CameraAccessory' or 'Camera';
#   2. keep only orders with 2015-06-26 <= order_date < 2015-12-26.
# NOTE(review): these bounds span six months, although the earlier comment
# on this cell said "3 months" - confirm which window is intended.

is_camera_category = df_Consumer_electronics1['product_analytic_category'].isin(
    ['CameraAccessory', 'Camera']
)
df_Consumer_electronics1 = df_Consumer_electronics1.loc[is_camera_category]

on_or_after_start = df_Consumer_electronics1['order_date'] >= '2015-06-26'
before_end = df_Consumer_electronics1['order_date'] < '2015-12-26'
df_Consumer_electronics1 = df_Consumer_electronics1.loc[on_or_after_start & before_end]

df_Consumer_electronics1.shape

(155300, 13)

In [7]:
# Load the full Media_Investment table into a DataFrame and preview it.
read = "select * from Media_Investment"
df_media_investment = pd.read_sql(sql=read, con=cnxn)

df_media_investment.head()

Unnamed: 0,Year,Month,Total_Investment,TV,Digital,Sponsorship,Content_Marketing,Online_marketing,Affiliates,SEM,Radio,Other
0,2015,7,17.1,0.2,2.5,7.4,0.0,1.3,0.5,5.0,,
1,2015,8,5.1,0.0,1.3,1.1,0.0,0.1,0.1,2.5,,
2,2015,9,96.300003,3.9,1.4,62.799999,0.6,16.4,5.0,6.2,,
3,2015,10,170.199997,6.1,12.6,84.699997,3.4,24.4,7.0,31.9,,
4,2015,11,51.200001,4.2,1.3,14.2,0.2,19.6,6.6,5.2,,


In [8]:
# Load the full Monthly_NPS_Score table into a DataFrame and preview it.
read = "select * from Monthly_NPS_Score"
df_monthly_nps_score = pd.read_sql(sql=read, con=cnxn)

df_monthly_nps_score.head()

Unnamed: 0,column1,column2,column3,column4,column5,column6,column7,column8,column9,column10,column11,column12,column13
0,Month,July'15,Aug'15,Sept'15,Oct'15,Nov'15,Dec'15,Jan'16,Feb'16,Mar'16,Apr'16,May'16,June'16
1,NPS,54.6,60.0,46.9,44.4,47.0,45.8,47.1,50.3,49.0,51.8,47.3,50.5
2,Stock Index,1177,1206,1101,1210,1233,1038,1052,1222,1015,1242,1228,1194


In [9]:
# Load the full Product_List table into a DataFrame and preview it.
read = "select * from Product_List"
df_product_list = pd.read_sql(sql=read, con=cnxn)

df_product_list.head()

Unnamed: 0,Product,Frequency,Percent
0,\N,5828,0.4
1,AmplifierReceiver,4056,0.2
2,AudioMP3Player,112892,6.8
3,Binoculars,14599,0.9
4,BoomBox,2879,0.2


In [10]:
# Load the full Special_Sale_Calendar table into a DataFrame and preview it.
read = "select * from Special_Sale_Calendar"
df_special_sale_calendar = pd.read_sql(sql=read, con=cnxn)

df_special_sale_calendar.head()

Unnamed: 0,Sale_Calendar_ID,Start_Date,End_Date
0,1,2015-07-18,2015-07-19
1,2,2015-08-15,2015-08-17
2,3,2015-08-28,2015-08-30
3,4,2015-10-15,2015-10-17
4,5,2015-11-07,2015-11-14
