In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    explained_variance_score,
    max_error,
    mean_absolute_error,
    mean_squared_error,
    mean_squared_log_error,
    median_absolute_error,
    r2_score,
)
from sklearn.model_selection import train_test_split

In [None]:
# Column names applied to the raw file, which is read with header=None
# (i.e. its first line is treated as data, not as a header).
colname = [
    'date',
    'unique_id',
    'event',
    'price',
    'product_id',
    'category',
]

# `df` keeps the frame exactly as loaded; all cleaning below happens on the copy `data`.
df = pd.read_csv("000.txt", header=None, names=colname)
data = df.copy()
data.head()

In [None]:
# Checking the dataset shape as a (rows, columns) tuple, before any cleaning
data.shape

In [None]:
# Dataset information: per-column dtype and non-null count
data.info()

In [None]:
# Per-column flag: True where the column contains at least one null/NA value
data.isna().any()

In [None]:
# Number of null/NA values in each column
data.isna().sum()

In [None]:
# Display the rows whose unique identifier is null/NA
data.loc[data['unique_id'].isna()]

This record has NaN in every feature and carries no information, so it is safe to remove.

In [None]:
# Drop the rows with a missing unique_id; rows with NA only in other columns are kept.
data = data.dropna(subset=['unique_id'])

In [None]:
# As there is no way to determine the category of products where it is NaN, we create a new
# category - 'Unknown' - and assign all such products to it

In [None]:
# Replace missing category values with the placeholder label 'Unknown'
data['category'] = data['category'].fillna('Unknown')

In [None]:
# Keep only the rows whose event is 'buy': purchases are treated as the sole source of revenue
data = data[data['event'].eq('buy')]

In [None]:
# Remove rows that are exact duplicates across all columns, keeping the first occurrence
data = data.drop_duplicates(subset=None, keep='first')

In [None]:
# Checking the dataset shape (rows, columns) again, now restricted to de-duplicated 'buy' records
data.shape

In [None]:
# Converting date column to datetime
# The parsed values go into a new 'datetim' column; the original 'date' column is left as-is.
# The explicit format means dates are expected as YYYY-MM-DD.
data['datetim'] =  pd.to_datetime(data['date'], format='%Y-%m-%d')

In [None]:
# Adding the day number: day of the year taken from the parsed date, stored as int
# NOTE(review): day-of-year restarts at 1 every January, so dates from different years would
# share the same 'days' value - confirm the data covers a single calendar year.
data['days'] = data['datetim'].dt.dayofyear.astype('int')
# A week number is not derived at present; data['datetim'].dt.isocalendar().week would provide one.

In [None]:
# Considering repeat customers(who bought more than once)
#data = data[data.groupby('unique_id').unique_id.transform(len) > 1]

In [None]:
# Confirm the column dtypes after adding the 'datetim' and 'days' columns
data.dtypes

In [None]:
# Order the purchase records by ascending day number
data = data.sort_values('days')

In [None]:
# Calculating new / retained customers and revenue per day.
# One row is produced for every day in the observed range (including days without purchases):
#   New_Cust      - distinct customers buying on that day who never bought on an earlier day
#   Retained_Cust - distinct customers buying on that day who also bought on an earlier day
#   Total_Cust    - New_Cust + Retained_Cust (all distinct customers buying that day)
#   Revenue       - sum of `price` over that day's purchases (0 when there were none)
col = ['days', 'New_Cust', 'Retained_Cust', 'Total_Cust', 'Revenue']

# Group once up front instead of re-filtering the whole frame for every single day.
ids_by_day = {day: set(ids) for day, ids in data.groupby('days')['unique_id']}
revenue_by_day = data.groupby('days')['price'].sum()

rids = set()  # customers seen on any earlier day; a set keeps membership checks cheap
final_data = []
if not data.empty:  # min()/max() are undefined on an empty frame
    for i in range(int(data['days'].min()), int(data['days'].max()) + 1):
        ids = ids_by_day.get(i, set())
        new_ids = ids - rids
        retained_ids = ids & rids
        final_data.append([
            i,
            len(new_ids),
            len(retained_ids),
            len(new_ids) + len(retained_ids),
            revenue_by_day.get(i, 0),
        ])
        rids |= ids
final_data = pd.DataFrame(final_data, columns=col)
final_data.head(100)

In [None]:
# Retained customers vs. revenue: one bar per distinct Retained_Cust value
# (bar height is seaborn's default aggregate of Revenue for that value).
sns.barplot(x='Retained_Cust', y='Revenue', data=final_data)
# Overlay a line through rows 1-3 of the daily summary.
# NOTE(review): the 1:4 row slice is hard-coded and its intent is not evident - confirm.
sns.lineplot(
    x='Retained_Cust',
    y='Revenue',
    data=final_data.iloc[1:4],
    marker="o",
    color='b',
)

In [None]:
# Scatter plot of daily revenue against the number of retained customers
plt.figure(figsize=(20, 10))
plt.scatter(x=final_data['Retained_Cust'], y=final_data['Revenue'])
plt.xlabel('Retained_Cust')
plt.ylabel('Revenue')
plt.title('Scatter plot - Retained customer Vs Revenue')
plt.show()

In [None]:
# Correlation (pandas' default method, Pearson) between daily revenue and retained-customer count
final_data['Revenue'].corr(final_data['Retained_Cust'])

In [None]:
# Pairwise correlation matrix across the columns of the daily summary
final_data.corr()

In [None]:
# It is clear from the above that revenue is strongly and positively correlated with
# new customers, retained customers and total customers (basically, revenue is driven by buying customers).
# As the data provided is small, getting an accurate enough model is difficult.

In [None]:
# Features: new and retained customer counts. Total_Cust is left out because it is
# exactly New_Cust + Retained_Cust and would add no independent information.
X = final_data[['New_Cust', 'Retained_Cust']]
# Target: daily revenue
y = final_data['Revenue']

In [None]:
# Hold out 20% of the daily rows for evaluation; random_state fixes the shuffle so the
# split is reproducible. (train_test_split is imported in the notebook's import cell.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
# Fit a linear regression of revenue on the customer counts, using the training split only
regr = LinearRegression()
regr.fit(X_train, y_train)

# Predict revenue for the held-out rows; the evaluation cells compare these with y_test
y_pred = regr.predict(X_test)

# Model Evaluation

In [None]:
# Explained variance score on the held-out rows (best possible value is 1.0).
# The metric is imported in the notebook's import cell.
explained_variance_score(y_test, y_pred)

In [None]:
# Largest absolute residual on the held-out rows (worst single prediction).
# The metric is imported in the notebook's import cell.
max_error(y_test, y_pred)

In [None]:
# Mean absolute error on the held-out rows, in the same units as Revenue.
# The metric is imported in the notebook's import cell.
mean_absolute_error(y_test, y_pred)

In [None]:
# Mean squared error on the held-out rows (squared Revenue units; penalises large misses).
# The metric is imported in the notebook's import cell.
mean_squared_error(y_test, y_pred)

In [None]:
# Mean squared logarithmic error on the held-out rows.
# scikit-learn raises ValueError for this metric when targets or predictions are negative,
# and an unconstrained linear regression can predict negative revenue. In that case the
# metric is reported as NaN instead of aborting a full notebook run.
# The metric is imported in the notebook's import cell.
(
    mean_squared_log_error(y_test, y_pred)
    if (np.asarray(y_test) >= 0).all() and (np.asarray(y_pred) >= 0).all()
    else np.nan
)

In [None]:
# Median absolute error on the held-out rows (less sensitive to outliers than the mean).
# The metric is imported in the notebook's import cell.
median_absolute_error(y_test, y_pred)

In [None]:
# Coefficient of determination (R^2) on the held-out rows (best possible value is 1.0).
# The metric is imported in the notebook's import cell.
r2_score(y_test, y_pred)