# Data Preparation for Textile Dashboard Use-Cases

# * Importing Packages

In [None]:
import pandas as pd
import numpy as np
import random
import datetime


# import warnings
# warnings.filterwarnings('ignore')
pd.options.display.max_rows = 1500
pd.options.display.max_columns = 25

# * Some Functions

In [None]:
def replacing_values(value_to_search, feature, value):
    """Set ``feature`` to ``value`` on every row whose 'frequency' matches.

    Works in place on the module-level ``data`` DataFrame.

    Args:
        value_to_search: Value compared with ``==`` against the 'frequency' column.
        feature: Name of the column to overwrite on the matching rows.
        value: Replacement written into ``feature``.
    """
    # A single .loc write replaces the row-by-row chained assignment
    # (data[feature][i] = value), which pandas may apply to a temporary copy
    # and which only worked when the index labels were exactly 0..n-1.
    matches = data['frequency'] == value_to_search
    data.loc[matches, feature] = value
            
def replacing_values_I(region, value_to_search, Unit, value):
    """Set column ``Unit`` to ``value`` where both frequency and region match.

    Works in place on the module-level ``data`` DataFrame. 'frequency' is
    compared as ``int`` and 'Region' as ``str``, element by element.

    Args:
        region: Region to match against the 'Region' column (compared as str).
        value_to_search: Frequency to match (compared as int).
        Unit: Name of the column to overwrite on the matching rows.
        value: Replacement written into that column.
    """
    same_frequency = data['frequency'].map(int) == int(value_to_search)
    same_region = data['Region'].map(str) == str(region)
    # One .loc write instead of the chained data[Unit][i] = value, which
    # pandas may apply to a temporary copy.
    data.loc[same_frequency & same_region, Unit] = value

# * Analysing Dataset

__Superstore__

In [None]:
# Fetching Data
# NOTE: absolute path on the author's machine; adjust it before running elsewhere.
dt4 = pd.read_csv('/home/arsal/Data/Textile/superstore-data/superstore_dataset2011-2015.csv')
dt4

In [None]:
# Analysing shape of the data
dt4.shape

In [None]:
# Checking columns
dt4.columns

In [None]:
# Separating data of the year 2011 from the whole data.
# .copy() makes `data` an independent frame: the cells below drop and add
# columns in place, which on a bare slice of dt4 triggers
# SettingWithCopyWarning and may not behave as intended.
# NOTE(review): assumes the first 3549 rows are exactly the 2011 orders; confirm.
data = dt4.iloc[:3549].copy()

In [None]:
# What's the shape of the data after slicing the old dataset
data.shape

# * Analysing Features

* __Dropping columns that are not necessary__

In [None]:
# Location columns from the source data are removed here; Region and Market
# are re-created further down with project-specific values.
data.drop(
    ['Postal Code', 'State', 'Country', 'Region', 'City', 'Market'],
    axis=1,
    inplace=True,
)
data.columns

In [None]:
data.shape

* __Analysing the column "Ship Mode"__

In [None]:
data['Ship Mode'].value_counts()

* __Analysing the column "Segment"__

In [None]:
data.Segment.value_counts()

* __Analysing the column "Shipping Cost"__

In [None]:
data['Shipping Cost']

* __Analysing the column "Order Date"__

In [None]:
data.head()

In [None]:
data.info()

# * Creating Features

* __Extracting multiple features (Day, Month, Year) from the feature 'Order Date'__

In [None]:
# Parse 'Order Date' once and take the calendar parts from the same index.
# NOTE(review): sample rows pair Order Date 1/1/2011 with Ship Date 6/1/2011,
# which suggests day-first dates; confirm whether dayfirst=True is needed.
order_dates = pd.DatetimeIndex(data['Order Date'])
data['Year'] = order_dates.year
data['Month'] = order_dates.month
data['Day'] = order_dates.day

In [None]:
data[['Order Date', 'Day', 'Month', 'Year']]

* __Extracting multiple features (Quarter and Week) from the features (Day, Month, Year)__

__Quarter__

In [None]:
# Calendar quarter from the month number: 1-3 -> 1, 4-6 -> 2, 7-9 -> 3,
# 10-12 -> 4. The previous per-row loop bucketed months in groups of four
# (1-4, 5-8, 9-12), which can only ever produce three "quarters", and wrote
# through chained indexing (data['Quarter'][i] = ...).
data['Quarter'] = (data['Month'] - 1) // 3 + 1

In [None]:
# Store Quarter as whole numbers.
data['Quarter'] = data['Quarter'].astype(int)

In [None]:
data['Quarter']

In [None]:
data.Quarter.value_counts()

__Week__

In [None]:
data['Order Date']

In [None]:
# Placeholder column; it is filled from Day in the following cell.
data['Week'] = np.nan

In [None]:
# Days 1-6 are labelled week 1 and days 7-12 week 2 (same buckets as before).
# Written with .loc on a boolean mask instead of data.Week[i] = ... per row,
# which is chained assignment and may hit a temporary copy.
# NOTE(review): any Day above 12 is left as NaN, which makes the later
# astype(int) fail; extend the buckets if such days can occur.
data.loc[data['Day'].isin([1, 2, 3, 4, 5, 6]), 'Week'] = 1
data.loc[data['Day'].isin([7, 8, 9, 10, 11, 12]), 'Week'] = 2

In [None]:
# Week was created as float (NaN placeholder); store it as int
data['Week'] = data['Week'].astype(int)

In [None]:
data.Week.value_counts()

In [None]:
# verification
data.Day.value_counts()

In [None]:
# Expected row totals per week, added up from the per-day counts shown above.
Week1 = sum((162, 167, 213, 218, 249, 372))
Week2 = sum((171, 389, 383, 300, 503, 422))
for expected_total in (Week1, Week2):
    print(expected_total)

* __Creating Column as Customer_Type__

In [None]:
cust_type_list = ['Men', 'Women', 'Children']
# Object dtype from the start: the column will hold strings, and writing a
# string into a float64 NaN column relies on deprecated silent upcasting.
data['Customer_Type'] = pd.Series(np.nan, index=data.index, dtype=object)

In [None]:
# One independent random draw per row, in row order. Assigning the whole
# column at once avoids the chained data['Customer_Type'].iloc[i] = ... write,
# which pandas may apply to a temporary copy.
data['Customer_Type'] = [random.choice(cust_type_list) for _ in range(len(data))]

In [None]:
data.Customer_Type.value_counts()

* __Converting Column 'Shipping Cost' to 'Labour_Cost'__

In [None]:
# 'Shipping Cost' is reused as the labour cost figure under a new name
data.rename(columns={'Shipping Cost': 'Labour_Cost'}, inplace=True)
data['Labour_Cost'].head()

* __Converting Column 'Segment' to 'Market'__

In [None]:
def _market_for_segment(segment):
    """Return 'Outlet' for Consumer/Corporate segments, 'Online' for anything else."""
    if segment in ('Consumer', 'Corporate'):
        return 'Outlet'
    return 'Online'


data['Market'] = data['Segment'].apply(_market_for_segment)
data['Market'].value_counts()

In [None]:
# 'Segment' has been folded into 'Market', so it is removed
data.drop(['Segment'], axis=1, inplace=True)
data.columns

* __Converting Column 'Ship Mode' to 'Product_Market'__

In [None]:
def _product_market_for_ship_mode(ship_mode):
    """Return 'Local' for Standard/Second/First Class, 'Export' for anything else."""
    if ship_mode in ('Standard Class', 'Second Class', 'First Class'):
        return 'Local'
    return 'Export'


data['Product_Market'] = data['Ship Mode'].apply(_product_market_for_ship_mode)
data['Product_Market'].value_counts()

In [None]:
# 'Ship Mode' has been folded into 'Product_Market', so it is removed
data.drop(['Ship Mode'], axis=1, inplace=True)
data.columns

* __Creating Column as Region__

In [None]:
# Region labels; each row gets one of these at random in the loop below.
list_regions = ['Sindh', 'Punjab', 'ICT', 'Gilgit','Balochistan', 'KPK']

In [None]:
# Object dtype from the start: the column will hold region names, and writing
# a string into a float64 NaN column relies on deprecated silent upcasting.
data['Region'] = pd.Series(np.nan, index=data.index, dtype=object)

In [None]:
# One independent random region per row, in row order. Assigning the whole
# column at once avoids the chained data.Region.iloc[i] = ... write, which
# pandas may apply to a temporary copy.
data['Region'] = [random.choice(list_regions) for _ in range(len(data))]

In [None]:
data.Region.value_counts()

* __Creating column as Unit__

In [None]:
# Object dtype from the start: the column will hold unit (city) names, and
# writing a string into a float64 NaN column relies on deprecated silent upcasting.
data['Unit'] = pd.Series(np.nan, index=data.index, dtype=object)

In [None]:
# Units (cities) available in each region: one list per entry of list_regions.
# A row's Unit is drawn at random from the list matching its Region.
sindh_list = ['Karachi', 'Hyderabad', 'Sukkur', 'Jamshoro', 'Jacobabad', 'Khairpur', 'Thatta']
punjab_list = ['Lahore', 'Multan', 'Faislabad', 'Sargodha', 'Sialkot', 'Bhawalpur', 'Sahiwal', 'Rawalpindi', 'Taxila']
kpk_list = ['Peshawar', 'Abbottabad', 'Mardan', 'Naran', 'Kulachi', 'Sawat', 'Kohat', 'Chitral']
ict_list = ['Islamabad']
gilgit_list = ['Skardu', 'Hunza', 'Sust', 'Shigar', 'Ghizer', 'Dayor', 'Ishqoman']
baloch_list = ['Loralai', 'Kalat', 'Derabugti', 'Turbat', 'Chaman', 'Gawadar', 'Quetta', 'Ziarat', 'Khuzdar']

In [None]:
# Mapping Sindh Cities
# One random unit per Sindh row, drawn in row order, then written with a
# single .loc assignment instead of data.Unit[i] = ... per row (chained
# assignment, which pandas may apply to a temporary copy).
in_region = data['Region'] == 'Sindh'
chosen_units = [random.choice(sindh_list) for _ in range(int(in_region.sum()))]
# The column starts as float NaN; unit names need an object column.
data['Unit'] = data['Unit'].astype(object)
if chosen_units:
    data.loc[in_region, 'Unit'] = chosen_units
# Same progress output as before: every assigned unit, one per line.
for unit in chosen_units:
    print(unit)

In [None]:
data[data.Region=='Sindh']['Unit'].value_counts()

In [None]:
# Mapping Punjab Cities
# One random unit per Punjab row, drawn in row order, then written with a
# single .loc assignment instead of data.Unit[i] = ... per row (chained
# assignment, which pandas may apply to a temporary copy).
in_region = data['Region'] == 'Punjab'
chosen_units = [random.choice(punjab_list) for _ in range(int(in_region.sum()))]
# The column starts as float NaN; unit names need an object column.
data['Unit'] = data['Unit'].astype(object)
if chosen_units:
    data.loc[in_region, 'Unit'] = chosen_units
# Same progress output as before: every assigned unit, one per line.
for unit in chosen_units:
    print(unit)

In [None]:
data[data.Region=='Punjab']['Unit'].value_counts()

In [None]:
# Mapping Balochistan Cities
# One random unit per Balochistan row, drawn in row order, then written with
# a single .loc assignment instead of data.Unit[i] = ... per row (chained
# assignment, which pandas may apply to a temporary copy).
in_region = data['Region'] == 'Balochistan'
chosen_units = [random.choice(baloch_list) for _ in range(int(in_region.sum()))]
# The column starts as float NaN; unit names need an object column.
data['Unit'] = data['Unit'].astype(object)
if chosen_units:
    data.loc[in_region, 'Unit'] = chosen_units
# Same progress output as before: every assigned unit, one per line.
for unit in chosen_units:
    print(unit)

In [None]:
data[data.Region=='Balochistan']['Unit'].value_counts()

In [None]:
# Mapping KPK Cities
# One random unit per KPK row, drawn in row order, then written with a
# single .loc assignment instead of data.Unit[i] = ... per row (chained
# assignment, which pandas may apply to a temporary copy).
in_region = data['Region'] == 'KPK'
chosen_units = [random.choice(kpk_list) for _ in range(int(in_region.sum()))]
# The column starts as float NaN; unit names need an object column.
data['Unit'] = data['Unit'].astype(object)
if chosen_units:
    data.loc[in_region, 'Unit'] = chosen_units
# Same progress output as before: every assigned unit, one per line.
for unit in chosen_units:
    print(unit)

In [None]:
data[data.Region=='KPK']['Unit'].value_counts()

In [None]:
# mapping ICT
# One draw per ICT row (the list has a single unit, but the draw is kept so
# the random sequence is consumed exactly as before), written with a single
# .loc assignment instead of the chained data.Unit[i] = ... per row.
in_region = data['Region'] == 'ICT'
chosen_units = [random.choice(ict_list) for _ in range(int(in_region.sum()))]
# The column starts as float NaN; unit names need an object column.
data['Unit'] = data['Unit'].astype(object)
if chosen_units:
    data.loc[in_region, 'Unit'] = chosen_units
# Same progress output as before: every assigned unit, one per line.
for unit in chosen_units:
    print(unit)

In [None]:
data[data.Region=='ICT']['Unit'].value_counts()

In [None]:
# Mapping gilgit Cities
# One random unit per Gilgit row, drawn in row order, then written with a
# single .loc assignment instead of data.Unit[i] = ... per row (chained
# assignment, which pandas may apply to a temporary copy).
in_region = data['Region'] == 'Gilgit'
chosen_units = [random.choice(gilgit_list) for _ in range(int(in_region.sum()))]
# The column starts as float NaN; unit names need an object column.
data['Unit'] = data['Unit'].astype(object)
if chosen_units:
    data.loc[in_region, 'Unit'] = chosen_units
# Same progress output as before: every assigned unit, one per line.
for unit in chosen_units:
    print(unit)

In [None]:
data[data.Region=='Gilgit']['Unit'].value_counts()

__Verifying Changes__

In [None]:
len(data.Unit.value_counts())

In [None]:
# Total number of distinct units defined across all regions; compare it with
# the count of distinct values actually present in data.Unit.
length = sum(
    len(region_units)
    for region_units in (sindh_list, punjab_list, baloch_list, kpk_list, ict_list, gilgit_list)
)
length

* __Creating column as Outlet__

In [None]:
# Creating a feature named "Outlet"
# Object dtype from the start: the column will hold outlet names, and writing
# a string into a float64 NaN column relies on deprecated silent upcasting.
data['Outlet'] = pd.Series(np.nan, index=data.index, dtype=object)

In [None]:
# Outlet names for each Sindh unit (Karachi, Hyderabad, Jamshoro, Khairpur,
# Sukkur, Jacobabad, Thatta). 'Defence-Housing-Society' previously carried a
# trailing space, which would surface as a distinct, mis-matching label.
khi_lst = ['Nazimabad', 'Defence', 'Karsaz', 'Malir', 'Johar', 'Bahadurabad', 'Tariq-Road', 'Askari','Korangi', 'Gulshan']
hyd_lst = ['Qasimabad', 'Mallapur', 'Latifabad', 'Gulshan-e-Zeal', 'Defence-Housing-Society', 'Gulistan-e-Sajjad']
jmshro_lst = ['Almanzar', 'Bhatti-Cafe-Road', 'Rail-o-Mian', 'Al-Hussaini-Road', 'Cadet-College-Street']
khrpur_lst = ['Shah-Hussain', 'Mitho-Marri', 'Bhurgari', 'Mumtaz-Colony', 'Wapda-Colony', 'Talpur-Colony', 'Faizabad-Colony']
skkr_lst = ['Soomar-Goth', 'Jaffarabad', 'Nasirabad', 'Arain-Moholla', 'Pir-Murad-Shah-Colony', 'Old-Sukkur', 'Shahi-bazaar']
jcbabd_lst = ['ADC-Colony', 'Dastaghir-Road', 'Haji-Shah-Muarad-Rind', 'Village-Qalati-Khan', 'Ameer-Baksh-Solangi', 'Khandoo-Khan-Brohi']
Thtta_lst = ['ZS-Moholla', 'Makli', 'Sheree-Mamai-Dev', 'Two-Domes', 'Ghulamullah-Road']

# Outlet names for each Punjab unit (Sialkot, Bhawalpur, Faislabad, Sargodha,
# Rawalpindi, Taxila, Multan, Lahore, Sahiwal).
silkt_lst = ['Mughal-Town', 'Dheera-Sanda', 'Begwal', 'kais', 'Dhudianwali', 'Bhoth', 'Jhai']
bhwlpr_lst = ['Munshiwala', 'Premier-Road', 'Basti-Miani', 'Goth-Lashkar', 'Noor-Mahal', 'Basti-Rahimabad', 'Hotwali']
fslbd_lst = ['Samaana', 'Miraanwala', 'Liaqatabad', 'Fakharabad', 'Sultan-Nagar', 'Bhai-Wala', 'Ghona-East', 'Bawachak', 'Risala']
srgda_lst = ['Fatima-Jinnah-Rad' , 'Remount-Depot', 'Muqaam-e-Hayaat-Road', 'Cantonment-Road']
rwlpndi_lst = ['Razaq-Town', 'Lakhu', 'Chaklala-Cantt', 'Lalkurti', 'Muqadmewali']
txla_lst = ['Kohsar-Colony', 'Timber-Market', 'Shahpur', 'Karamwal', 'PMO-Colony', 'Model-Town', 'E-Type-Street']
mltn_lst = ['Ismailabad', 'Nandla', 'Multan-Cantonment']
lhr_lst = ['Allama-Road', 'Shadman', 'Hando', 'Mandila', 'Heera-Mandi', 'Valencia']
shwl_lst = ['Aratulla-Road', 'Sapphire-Hotel-Road', 'lalazar']

# Outlet names for each Balochistan unit (Kalat, Ziarat, Gawadar, Loralai,
# Khuzdar, Chaman, Derabugti, Quetta, Turbat).
klt_lst = ['Harboi-Road', 'Kohing', 'Neemargh-Road']
ziart_lst= ['Naughaza-Road', 'Ziarat-Residency-Road']
gwdr_lst = ['East_Bay-park', 'Koh-e-Batil', 'PNS-Akram', 'AlNoor-Cargo-City', 'Koh-e-Mehdi']
lrali_lst = ['Nawakilli', 'Killitata', 'Loralai-Cantt']
khzdr_lst = ['Zahri', 'Gunga', 'Muhammadwala', 'Khuzdar-Cantt', 'Jinnad-Cantt']
chmn_lst = ['Malik-Alijan-Road', 'Haji-Balo', 'Dewana-Kot', 'Kili-Mehboob']
drabgti_lst = ['Sui', 'Pir-Koh']
Queta_lst = ['Hazaraganji-Road', 'UOB-Road', 'Samungli-Road']
trbt_lst = ['Kallag', 'Gogdan', 'Kohe-e-Muraadi', 'Raeesi', 'Shahani-Bazar', 'Chahser']

# Outlet names for each KPK unit (Peshawar, Chitral, Kohat, Abbottabad,
# Mardan, Kulachi, Sawat, Naran).
pshwr_lst = ['Tatara-Park-Road', 'Pakha-Ghulam', 'Qasim-Kalay', 'Pajagi', 'Baloo', 'Jhagra', 'Akbarpura']
chtrl_lst = ['Ayun-Fort-Road', 'Batrik', 'Ishpata-Road', 'Konar-Side-Road']
koht_lst = ['Billitang', 'Razgeer-Banda', 'Mohd-Zai-Street', 'KDA-Street']
abtbd_lst = ['Kehal-Road', 'Muslim-Town', 'Malakpura', 'Machar-Colony', 'Nawashera', 'Bilal-Town', 'Kaghan-Colony']
mrdn_lst = ['Sheikh-Maltoon-Town', 'Ram-Bagh', 'Mayar', 'Faqeerabad', 'Meervas', 'Deputy-Kale']
klchi_lst = ['DIK', 'Faqeernoor-Mohd-Road']
swat_lst = ['Chalyar', 'Matta', 'Charbagh', 'Derai', 'Kanju', 'Kalakalay', 'Manglor']
naran_lst = ['PDTC-Area', 'Katha', 'Saif-ul-Muluk-Road', 'Lalazar']

# Outlet names for the Islamabad unit (not to be confused with ict_list, the unit list).
ict_lst = ['Phamra', 'Nun', 'Daman-e-Koh', 'Phulgran', 'Jhangi-Syedan', 'Nurpur-Shahan', 'Saidpur']

# Outlet names for each Gilgit unit (Ghizer, Sust, Ishqoman, Hunza, Dayor,
# Skardu, Shigar).
ghzr_lst = ['Ghizer-Valley']
sust_lst = ['jamalabad', 'Gircha', 'Hussainabad', 'Sost-Bazar', 'Sartiz']
ishqomn_lst = ['Ishqoman-Valley']
hnza_lst = ['Karimabad', 'Aliabad', 'Sultanabad', 'Askurdas', 'Altit', 'Garelt']
dyor_lst = ['Dayor-Valley']
skrdu_lst = ['Cadet-College-Road', 'Rangah', 'Chonda', 'Chundah', 'Snowland-Palace-Road']
shigr_lst = ['Qasimabad-Broqkhoor', 'Mohola-Kiahong', 'Skoro', 'Sankhor']

In [None]:
# Mapping Sindh Outlets
# (unit, outlet list) pairs, processed in this order so the random draws are
# consumed unit by unit and, within a unit, in row order. This replaces seven
# copy-pasted loops that each scanned the whole frame and wrote through
# chained indexing (data.Outlet[i] = ...).
unit_outlets = [
    ('Karachi', khi_lst),
    ('Hyderabad', hyd_lst),
    ('Jamshoro', jmshro_lst),
    ('Khairpur', khrpur_lst),
    ('Sukkur', skkr_lst),
    ('Jacobabad', jcbabd_lst),
    ('Thatta', Thtta_lst),
]
# The column starts as float NaN; outlet names need an object column.
data['Outlet'] = data['Outlet'].astype(object)
for unit, outlet_names in unit_outlets:
    in_unit = data['Unit'] == unit
    row_count = int(in_unit.sum())
    if row_count == 0:
        continue
    data.loc[in_unit, 'Outlet'] = [random.choice(outlet_names) for _ in range(row_count)]
    # Same progress output as before: the unit name once per mapped row.
    for _ in range(row_count):
        print(unit)

In [None]:
# Outlet distribution inside each Sindh unit, one table per unit.
for sindh_unit in ('Karachi', 'Hyderabad', 'Jamshoro', 'Khairpur', 'Sukkur', 'Jacobabad', 'Thatta'):
    unit_rows = data[data.Unit == sindh_unit]
    print(unit_rows['Outlet'].value_counts())

In [None]:
# Mapping punjab Outlets
# (unit, outlet list) pairs, processed in this order so the random draws are
# consumed unit by unit and, within a unit, in row order. This replaces nine
# copy-pasted loops that each scanned the whole frame and wrote through
# chained indexing (data.Outlet[i] = ...).
unit_outlets = [
    ('Sialkot', silkt_lst),
    ('Bhawalpur', bhwlpr_lst),
    ('Faislabad', fslbd_lst),
    ('Sargodha', srgda_lst),
    ('Rawalpindi', rwlpndi_lst),
    ('Taxila', txla_lst),
    ('Multan', mltn_lst),
    ('Lahore', lhr_lst),
    ('Sahiwal', shwl_lst),
]
# The column starts as float NaN; outlet names need an object column.
data['Outlet'] = data['Outlet'].astype(object)
for unit, outlet_names in unit_outlets:
    in_unit = data['Unit'] == unit
    row_count = int(in_unit.sum())
    if row_count == 0:
        continue
    data.loc[in_unit, 'Outlet'] = [random.choice(outlet_names) for _ in range(row_count)]
    # Same progress output as before: the unit name once per mapped row.
    for _ in range(row_count):
        print(unit)

In [None]:
# Mapping Balochistan Outlets
# (unit, outlet list) pairs, processed in this order so the random draws are
# consumed unit by unit and, within a unit, in row order. This replaces nine
# copy-pasted loops that each scanned the whole frame and wrote through
# chained indexing (data.Outlet[i] = ...).
unit_outlets = [
    ('Kalat', klt_lst),
    ('Ziarat', ziart_lst),
    ('Gawadar', gwdr_lst),
    ('Loralai', lrali_lst),
    ('Khuzdar', khzdr_lst),
    ('Chaman', chmn_lst),
    ('Derabugti', drabgti_lst),
    ('Quetta', Queta_lst),
    ('Turbat', trbt_lst),
]
# The column starts as float NaN; outlet names need an object column.
data['Outlet'] = data['Outlet'].astype(object)
for unit, outlet_names in unit_outlets:
    in_unit = data['Unit'] == unit
    row_count = int(in_unit.sum())
    if row_count == 0:
        continue
    data.loc[in_unit, 'Outlet'] = [random.choice(outlet_names) for _ in range(row_count)]
    # Same progress output as before: the unit name once per mapped row.
    for _ in range(row_count):
        print(unit)

In [None]:
# Mapping KPK Outlets
# (unit, outlet list) pairs, processed in this order so the random draws are
# consumed unit by unit and, within a unit, in row order. This replaces eight
# copy-pasted loops that each scanned the whole frame and wrote through
# chained indexing (data.Outlet[i] = ...).
unit_outlets = [
    ('Peshawar', pshwr_lst),
    ('Chitral', chtrl_lst),
    ('Kohat', koht_lst),
    ('Abbottabad', abtbd_lst),
    ('Mardan', mrdn_lst),
    ('Kulachi', klchi_lst),
    ('Sawat', swat_lst),
    ('Naran', naran_lst),
]
# The column starts as float NaN; outlet names need an object column.
data['Outlet'] = data['Outlet'].astype(object)
for unit, outlet_names in unit_outlets:
    in_unit = data['Unit'] == unit
    row_count = int(in_unit.sum())
    if row_count == 0:
        continue
    data.loc[in_unit, 'Outlet'] = [random.choice(outlet_names) for _ in range(row_count)]
    # Same progress output as before: the unit name once per mapped row.
    for _ in range(row_count):
        print(unit)
        

In [None]:
# Mapping Islamabad Outlets
# One random outlet per Islamabad row, drawn in row order and written with a
# single .loc assignment instead of the chained data.Outlet[i] = ... per row.
in_unit = data['Unit'] == 'Islamabad'
row_count = int(in_unit.sum())
# The column starts as float NaN; outlet names need an object column.
data['Outlet'] = data['Outlet'].astype(object)
if row_count:
    data.loc[in_unit, 'Outlet'] = [random.choice(ict_lst) for _ in range(row_count)]
# Same progress output as before: the unit name once per mapped row.
for _ in range(row_count):
    print('Islamabad')

In [None]:
# Mapping Gilgit Outlets
# (unit, outlet list) pairs, processed in this order so the random draws are
# consumed unit by unit and, within a unit, in row order. This replaces seven
# copy-pasted loops that each scanned the whole frame and wrote through
# chained indexing (data.Outlet[i] = ...).
unit_outlets = [
    ('Ghizer', ghzr_lst),
    ('Sust', sust_lst),
    ('Ishqoman', ishqomn_lst),
    ('Hunza', hnza_lst),
    ('Dayor', dyor_lst),
    ('Skardu', skrdu_lst),
    ('Shigar', shigr_lst),
]
# The column starts as float NaN; outlet names need an object column.
data['Outlet'] = data['Outlet'].astype(object)
for unit, outlet_names in unit_outlets:
    in_unit = data['Unit'] == unit
    row_count = int(in_unit.sum())
    if row_count == 0:
        continue
    data.loc[in_unit, 'Outlet'] = [random.choice(outlet_names) for _ in range(row_count)]
    # Same progress output as before: the unit name once per mapped row.
    for _ in range(row_count):
        print(unit)

In [None]:
# Verification: inspect the three generated location columns side by side
data[['Region', 'Unit', 'Outlet']]

In [None]:
# Making a csv of the already done work
# The DataFrame index is written as well (to_csv default), so it comes back
# as an 'Unnamed: 0' column when this file is re-read.
data.to_csv('/home/arsal/Data/Textile/superstore-data/superstore-part1.csv')

# Further Transformation on the above created csv

In [125]:
# Importing packages
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 1500
pd.options.display.max_columns = 30

In [126]:
# Fetching data: reload the intermediate CSV saved at the end of the first part.
# NOTE: absolute path on the author's machine; adjust it before running elsewhere.
data = pd.read_csv('/home/arsal/Data/Textile/superstore-data/superstore-part1.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Customer ID,Customer Name,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Labour_Cost,Order Priority,Year,Month,Day,Quarter,Week,Customer_Type,Market,Product_Market,Region,Unit,Outlet
0,0,42433,AG-2011-2040,1/1/2011,6/1/2011,TB-11280,Toby Braunhardt,OFF-TEN-10000025,Office Supplies,Storage,"Tenex Lockers, Blue",408.3,2,0.0,106.14,35.46,Medium,2011,1,1,1,1,Women,Outlet,Local,Balochistan,Derabugti,Pir-Koh
1,1,22253,IN-2011-47883,1/1/2011,8/1/2011,JH-15985,Joseph Holt,OFF-SU-10000618,Office Supplies,Supplies,"Acme Trimmer, High Speed",120.366,3,0.1,36.036,9.72,Medium,2011,1,1,1,1,Women,Outlet,Local,Gilgit,Sust,Hussainabad
2,2,48883,HU-2011-1220,1/1/2011,5/1/2011,AT-735,Annie Thurman,OFF-TEN-10001585,Office Supplies,Storage,"Tenex Box, Single Width",66.12,4,0.0,29.64,8.17,High,2011,1,1,1,1,Men,Outlet,Local,KPK,Naran,Lalazar
3,3,11731,IT-2011-3647632,1/1/2011,5/1/2011,EM-14140,Eugene Moren,OFF-PA-10001492,Office Supplies,Paper,"Enermax Note Cards, Premium",44.865,3,0.5,-26.055,4.82,High,2011,1,1,1,1,Children,Online,Local,Sindh,Hyderabad,Defence-Housing-Society
4,4,22255,IN-2011-47883,1/1/2011,8/1/2011,JH-15985,Joseph Holt,FUR-FU-10003447,Furniture,Furnishings,"Eldon Light Bulb, Duo Pack",113.67,5,0.1,37.77,4.7,Medium,2011,1,1,1,1,Women,Outlet,Local,Gilgit,Ishqoman,Ishqoman-Valley


In [127]:
# Drop the saved index column and the source row number; neither is used below
data.drop(['Unnamed: 0', 'Row ID'], axis=1, inplace=True)
data.head(2)

Unnamed: 0,Order ID,Order Date,Ship Date,Customer ID,Customer Name,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Labour_Cost,Order Priority,Year,Month,Day,Quarter,Week,Customer_Type,Market,Product_Market,Region,Unit,Outlet
0,AG-2011-2040,1/1/2011,6/1/2011,TB-11280,Toby Braunhardt,OFF-TEN-10000025,Office Supplies,Storage,"Tenex Lockers, Blue",408.3,2,0.0,106.14,35.46,Medium,2011,1,1,1,1,Women,Outlet,Local,Balochistan,Derabugti,Pir-Koh
1,IN-2011-47883,1/1/2011,8/1/2011,JH-15985,Joseph Holt,OFF-SU-10000618,Office Supplies,Supplies,"Acme Trimmer, High Speed",120.366,3,0.1,36.036,9.72,Medium,2011,1,1,1,1,Women,Outlet,Local,Gilgit,Sust,Hussainabad


In [128]:
data.columns

Index([u'Order ID', u'Order Date', u'Ship Date', u'Customer ID',
       u'Customer Name', u'Product ID', u'Category', u'Sub-Category',
       u'Product Name', u'Sales', u'Quantity', u'Discount', u'Profit',
       u'Labour_Cost', u'Order Priority', u'Year', u'Month', u'Day',
       u'Quarter', u'Week', u'Customer_Type', u'Market', u'Product_Market',
       u'Region', u'Unit', u'Outlet'],
      dtype='object')

# * Analysing Features

* __Analysing feature "Quantity"__

In [129]:
data.Quantity.value_counts()

2     861
3     682
1     612
4     423
5     320
6     253
7     165
8     102
9      70
10     22
12     13
11     11
14     10
13      5
Name: Quantity, dtype: int64

In [130]:
data.Quantity.describe()

count    3549.000000
mean        3.520147
std         2.288630
min         1.000000
25%         2.000000
50%         3.000000
75%         5.000000
max        14.000000
Name: Quantity, dtype: float64

In [131]:
type(data.Quantity)

pandas.core.series.Series

# * Creating Features

* __Creating feature "Sales_per_day"__

In [132]:
def fill(index):
    """Copy Sales_per_day at row ``index`` into every earlier row that is NaN.

    Works in place on the module-level ``data`` DataFrame; rows are addressed
    by position.

    Args:
        index: Position of the row holding the value to propagate. ``None``
            (no run has been seen yet) is accepted and does nothing; it
            previously raised ``TypeError`` from ``range(None)``.
    """
    if index is None:
        return
    sales = data['Sales_per_day']
    # Rows strictly before `index` that are still empty.
    needs_fill = sales.isna().values & (np.arange(len(sales)) < index)
    if not needs_fill.any():
        return
    # One .loc write instead of data.Sales_per_day[i] = ... per row, which is
    # chained assignment and may hit a temporary copy.
    data.loc[needs_fill, 'Sales_per_day'] = sales.iloc[index]
            

In [133]:
# Seed Sales_per_day with each row's own Quantity; the following cell turns it
# into a total over consecutive rows that share the same Day.
data['Sales_per_day'] = data.Quantity

In [134]:
# Total Quantity over each run of consecutive rows that share the same Day,
# written to every row of that run (a single-row run keeps its own Quantity).
# This replaces a row-by-row running sum that
#   * hard-coded the last row index (3548), so the final run was only
#     back-filled for a frame of exactly 3549 rows,
#   * called fill(None) and raised when the first two rows had different Days,
#   * rescanned all earlier rows at every day boundary, and
#   * wrote through chained indexing (data.Sales_per_day[i] = ...).
# A new run starts wherever Day differs from the previous row.
day_run = (data['Day'] != data['Day'].shift()).cumsum()
data['Sales_per_day'] = (
    data.groupby(day_run)['Quantity'].transform('sum').astype(float)
)

In [135]:
data[['Day','Quantity', 'Sales_per_day']]

Unnamed: 0,Day,Quantity,Sales_per_day
0,1,2,19.0
1,1,3,19.0
2,1,4,19.0
3,1,3,19.0
4,1,5,19.0
5,1,2,19.0
6,2,2,54.0
7,2,2,54.0
8,2,1,54.0
9,2,3,54.0


# * Analysing Fifth Dataset

__Women Clothing E-Commerce__

In [None]:
# # Fetching Data
# dt4 = pd.read_csv('/home/arsal/Data/Textile/womensclothingecommerce/Womens-Clothing-E-Commerce-Reviews.csv')
# dt4

In [None]:
# # Analysing shape of the data
# dt4.shape