In [1]:
import pandas as pd 
import numpy as np 

data_path = '../Datasets/Data23.xlsx'

sales = pd.read_excel(data_path,sheet_name = 'SALES', parse_dates = ['DateTime'])
products = pd.read_excel(data_path,sheet_name = "PRODUCTS")
customers = pd.read_excel(data_path,sheet_name = 'CUSTOMERS', parse_dates = ['UserFirstTransaction'])

In [2]:
sales.head(3)

Unnamed: 0,TransactionID,UserID,DateTime,ProductID,Channel,PaymentType,Price,Discount
0,1,500546547,2017-01-01 01:40:39.180,10334,MOBILE,Cash,51.0,No
1,2,500240813,2017-01-01 03:27:58.490,10610,WEB,Cash,108.0,No
2,3,500460527,2017-01-01 04:42:46.500,10579,MOBILE,Cash,22.5,No


In [3]:
customers.head(3)

Unnamed: 0,UserID,UserFirstTransaction,Gender,Location,Age
0,500234532,2011-10-12 00:00:00.000,FEMALE,ANTALYA,19
1,500234631,2018-04-06 15:19:33.990,FEMALE,BURSA,20
2,500234642,2016-04-23 00:00:00.000,MALE,IZMIR,41


In [4]:
products.head(3)

Unnamed: 0,ProductID,Category
0,10001,Female Shoes
1,10002,Male Shoes
2,10003,TVs and TV Sets


Making base dataframe

In [5]:
today = sales['DateTime'].max();today

last_transactions = sales.groupby('UserID')['DateTime'].max().reset_index()
last_transactions.columns = ['UserID','LastTransaction']
last_transactions['Recency'] = (today - last_transactions['LastTransaction']).dt.days
last_transactions['Churned'] = np.where(last_transactions['Recency'] > 252, 1,0)
last_transactions.head()

Unnamed: 0,UserID,LastTransaction,Recency,Churned
0,500234532,2017-12-17 18:08:28.150,195,0
1,500234631,2018-05-11 13:22:46.950,50,0
2,500234642,2018-06-17 19:15:22.400,13,0
3,500234730,2017-01-29 18:49:51.470,517,1
4,500234752,2018-02-11 19:39:53.710,139,0


In [6]:
df = pd.merge(sales.drop(['TransactionID', 'DateTime'],axis=1),products, on = 'ProductID', how = 'left')
df = pd.merge(df,customers[['UserID','UserFirstTransaction']], on = 'UserID', how = 'left')
df = pd.merge(df,last_transactions.drop(['Churned'],axis=1), on = 'UserID', how = 'left')
df.head(3)

Unnamed: 0,UserID,ProductID,Channel,PaymentType,Price,Discount,Category,UserFirstTransaction,LastTransaction,Recency
0,500546547,10334,MOBILE,Cash,51.0,No,Female Shoes,2015-03-18,2018-06-28 19:35:53.360,2
1,500240813,10610,WEB,Cash,108.0,No,Male Fashion,2015-07-24,2018-06-22 17:42:49.730,8
2,500460527,10579,MOBILE,Cash,22.5,No,Female Fashion,2015-01-06,2018-06-25 01:12:09.600,5


In [7]:
df['DifferenceDay'] = (df['LastTransaction'] - df['UserFirstTransaction']).dt.days
df.head(3)

Unnamed: 0,UserID,ProductID,Channel,PaymentType,Price,Discount,Category,UserFirstTransaction,LastTransaction,Recency,DifferenceDay
0,500546547,10334,MOBILE,Cash,51.0,No,Female Shoes,2015-03-18,2018-06-28 19:35:53.360,2,1198.0
1,500240813,10610,WEB,Cash,108.0,No,Male Fashion,2015-07-24,2018-06-22 17:42:49.730,8,1064.0
2,500460527,10579,MOBILE,Cash,22.5,No,Female Fashion,2015-01-06,2018-06-25 01:12:09.600,5,1266.0


In [8]:
dropped = df.drop(['ProductID', 'UserFirstTransaction', 'LastTransaction'], axis = 1)
dropped.head(3)

Unnamed: 0,UserID,Channel,PaymentType,Price,Discount,Category,Recency,DifferenceDay
0,500546547,MOBILE,Cash,51.0,No,Female Shoes,2,1198.0
1,500240813,WEB,Cash,108.0,No,Male Fashion,8,1064.0
2,500460527,MOBILE,Cash,22.5,No,Female Fashion,5,1266.0


In [9]:
categorical_columns = ['Channel','PaymentType','Discount','Category']

In [10]:
# OHE the categorical columns
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output = False)

ohe_features = ohe.fit_transform(dropped[categorical_columns])
ohe_feature_names = ohe.get_feature_names_out(categorical_columns)

categorical_ohe = pd.DataFrame(ohe_features, columns = ohe_feature_names)
categorical_ohe = pd.concat([dropped['UserID'], categorical_ohe], axis = 1)
categorical_ohe = categorical_ohe.groupby('UserID').sum().reset_index()
categorical_ohe.head(3)

Unnamed: 0,UserID,Channel_MOBILE,Channel_WEB,PaymentType_Cash,PaymentType_Mobile Payment,PaymentType_Online Credit Card,Discount_No,Discount_Yes,Category_Computers & Laptops,Category_Electronic Accessories,...,Category_Indoor Sports,Category_Kitchen Electronics,Category_Male Fashion,Category_Male Shoes,Category_Outdoor Sports,Category_Smart Phones,Category_Smart Watches,Category_Sound Systems,Category_Sport Shoes,Category_TVs and TV Sets
0,500234532,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,500234631,2.0,2.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,500234642,4.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [11]:
# OHE Customer Informations
ohe2 = OneHotEncoder(sparse_output = False)

ohe_features2 = ohe2.fit_transform(customers[['Gender','Location']])
ohe_feature_names2 = ohe2.get_feature_names_out(['Gender','Location'])

customer_ohe = pd.DataFrame(ohe_features2, columns = ohe_feature_names2)
customer_ohe = pd.concat([customers[['UserID','Age']], customer_ohe], axis = 1)
customer_ohe.head(3)

Unnamed: 0,UserID,Age,Gender_FEMALE,Gender_MALE,Location_ADANA,Location_ANKARA,Location_ANTALYA,Location_BURSA,Location_ESKISEHIR,Location_ISTANBUL,Location_IZMIR,Location_KAYSERI,Location_TRABZON
0,500234532,19,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,500234631,20,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,500234642,41,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [12]:
dropped.drop(categorical_columns, axis = 1, inplace = True)
dropped.head(3)

Unnamed: 0,UserID,Price,Recency,DifferenceDay
0,500546547,51.0,2,1198.0
1,500240813,108.0,8,1064.0
2,500460527,22.5,5,1266.0


* Kişiyi genel yargılarken bu kısımda gruplama yapmalı ve kişinin yaptığı her bir alış-veriş için değil genel durumu için yargıda bulunmalıyız.  
* Bu durumda kişinin `Price` değişkenini ortalama değil toplam şeklinde almalıyız ki makine öğrenmesi modelimiz diğer, diğer kullanıcılar alışveriş yaptıkça toplam `Price` değişkeni artarken artmayan kullanıcıları daha kolay tespit etsin.  
* Eğer bu kısımda `Price` değişkenini ortalama alırsak 100 defa 10₺ 'lik alış-veriş biri ile 1 defa 10₺ 'lik alış-veriş yapan kişiyi aynı görür.  

In [13]:
price = dropped.groupby('UserID')['Price'].sum().reset_index()
price.columns = ['UserID','TotalPrice']
price.head(3)

Unnamed: 0,UserID,TotalPrice
0,500234532,146.7
1,500234631,147.0
2,500234642,735.6


In [14]:
other = dropped.groupby('UserID')['DifferenceDay','Recency'].mean().reset_index()
other.head(3)

  other = dropped.groupby('UserID')['DifferenceDay','Recency'].mean().reset_index()


Unnamed: 0,UserID,DifferenceDay,Recency
0,500234532,2258.0,195.0
1,500234631,34.0,50.0
2,500234642,785.0,13.0


In [15]:
dataframe = pd.merge(other,last_transactions[['UserID', 'Churned']], on = 'UserID', how = 'left')
daraframe = pd.merge(dataframe,price, on = 'UserID', how = 'left')
dataframe = pd.merge(dataframe,categorical_ohe, on = 'UserID', how = 'left')
dataframe = pd.merge(dataframe,customer_ohe, on = 'UserID', how = 'left')
dataframe.head(3)

Unnamed: 0,UserID,DifferenceDay,Recency,Churned,Channel_MOBILE,Channel_WEB,PaymentType_Cash,PaymentType_Mobile Payment,PaymentType_Online Credit Card,Discount_No,...,Gender_MALE,Location_ADANA,Location_ANKARA,Location_ANTALYA,Location_BURSA,Location_ESKISEHIR,Location_ISTANBUL,Location_IZMIR,Location_KAYSERI,Location_TRABZON
0,500234532,2258.0,195.0,0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,500234631,34.0,50.0,0,2.0,2.0,4.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,500234642,785.0,13.0,0,4.0,0.0,0.0,0.0,4.0,4.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [16]:
np.sum(dataframe['UserID'].value_counts()>1)

0

* Bütün müşterileri tek değer haline indirgeyip genel yargıda bir veri seti oluşturduk.

In [17]:
dataframe.to_csv('../Datasets/online_retail.csv', index = False)