In [None]:
import os

import numpy as np

import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import feature_selection, linear_model




# Predicting product performance in video eCommerce

## Overview:
### My company produces 2-3 minute video segments for products that feature a host/presenter ( like HSN but shorter and online!). We use our content and syndicate it across out publisher network to generate sales. In this business, the KPIs are gross revenue, total # of views (think of this as your foot traffic to a store), conversion rate and more. 

## Problem Statement:
#### We have hit a stage where in order to scale the business, you need to purchase more distribution. There is a need for a predictive model where if we input the category of a product, sale price and potentially presenter, we should have a prediction of the number of views and gross revenue we would receive. This will help the team make better choices as to which content to place on specific publisher.



## What to find in the data ?
#### Build a model that shows the gross revenue for each sale price by 

#### #1  At what sale price does each campaign source generate the most views and revenue? Is there a signifiance in the correlation between category with views and gross revenue? 

#### #2 What price point on a particular channel generates a bulk of the sales. 


## Methodology

### 1)  Linear Regression & Linear Regression Modeling with sklearn

####  - Use multilinear linear regression to find any significance between variables (seconds per view in relation to sale price, category in relation to sale price, total views in relation to gross sales, sale price in relation to gross sales). 

#### -  Of the variables that have significance, rank the order of signifiance.


### 2) Regularization 

#### Use regularization to check for overfitting


## Goals / Indicators of Success

### Check for Multicollinearity

## Challenges / Concerns

### Interaction Effects and gression plot between different variables to the response variable (gross sales)


## About the data file: 
### I used Looker (our analytics tool that tracks transactional data) to pull information on all video sales from 1/1/2015 - 12/31/2016. I filtered only for videos with 1000> views. When we do feature a video on any particular day, the video receives well over 1000 views so I want to filter out noise. 

In [18]:
df = pd.read_csv(os.path.join('Video sales dataset.csv'),index_col = 'Video Video ID')

In [19]:
df

Unnamed: 0_level_0,Date,Campaign Campaign Source,Video Category,Video Subcategory,Video Presenter,...,Seconds Viewed Onsite,Sale Price,Seconds Per View,Seconds Per View Onsite,Seconds Per View Offsite
Video Video ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3360.0,2016-12-31,syndication-comcast,Beauty,Skincare,Susan Yara,...,141020,$37.50,80,80.0,
3400.0,2016-12-31,syndication-aol-hp-test,Home,Storage & Organization,Marcy McKenna,...,770,$68.00,65,55.0,65.0
2958.0,2016-12-31,syndication-AOL-CPV,Home,Storage & Organization,Marcy McKenna,...,594,$20.00,58,54.0,58.0
3399.0,2016-12-31,syndication-AOL-CPV,Beauty,Skincare,Jenny Patinkin,...,3376,$64.31,56,57.0,56.0
3398.0,2016-12-31,syndication-aol-hp-test,Home,Storage & Organization,Marcy McKenna,...,4017,$128.00,67,98.0,67.0
...,...,...,...,...,...,...,...,...,...,...,...
1814.0,2015-01-01,syndication-aol-hp-test,Beauty,Makeup,Mikaela South,...,5015,$20.00,76,78.0,76.0
1816.0,2015-01-01,syndication-people-style-watch,Apparel,Dresses,Zoë Ruderman,...,43166,$68.00,55,68.0,45.0
1808.0,2015-01-01,syndication-people-style-watch,Apparel,Tops,Tracy O'Connor,...,121250,$128.00,43,43.0,
1818.0,2015-01-01,syndication-aol-hp-test,Beauty,Hair,Tracy O'Connor,...,39963,$44.59,95,92.0,96.0


###  Description of my data

In [20]:
df.shape

(4400, 18)

In [59]:
df.columns

Index([u' Date', u'Campaign Campaign Source', u'Video Category',
       u'Video Subcategory', u'Video Presenter', u'RPV', u'Total Views',
       u'Views Offsite', u'Views Onsite', u'Gross Revenue',
       u'Total Units Sold', u'Total Seconds Viewed', u'Seconds Viewed Offsite',
       u'Seconds Viewed Onsite', u'Sale Price', u'Seconds Per View',
       u'Seconds Per View Onsite', u'Seconds Per View Offsite'],
      dtype='object')

In [34]:
df = df.rename(columns={'Campaign Campaign Source': 'CampaignSource', 'Video Video ID': 'VideoID',
               'Video Category': 'Category','Video Subcategory': 'Subcategory',
               'Video Presenter': 'Presenter','Total Views': 'TotalViews',
                'Views Offsite': 'ViewsOffsite','ViewsOnsite': 'ViewsOnsite','Gross Revenue': 'GrossRevenue',
                'Total Units Sold': 'TotalUnitsSold','Total Seconds Viewed': 'TotalSecondsViewed',
                'Seconds Viewed Offsite': 'SecondsViewedOffsite','Seconds Viewed Onsite': 'SecondsViewedOnsite',
                'Sale Price': 'SalePrice','Seconds Per View': 'SecondsPerView','Seconds Per View Onsite':'SecondsPerViewOnsite'
                ,'Seconds Per View Offsite':'SecondsPerViewOffsite'})

df.columns

Index([u' Date', u'CampaignSource', u'Category', u'Subcategory', u'Presenter',
       u'RPV', u'TotalViews', u'ViewsOffsite', u'Views Onsite',
       u'GrossRevenue', u'TotalUnitsSold', u'TotalSeconsViewed',
       u'SecondsViewedOffsite', u'SecondsViewedOnsite', u'SalePrice',
       u'SecondsPerView', u'SecondsPerViewOnsite', u'SecondsPerViewOffsite'],
      dtype='object')

### Take out null values

In [31]:
df.isnull().sum()

 Date                      0
CampaignSource            48
Category                   0
Subcategory                0
Presenter                  0
                        ... 
SecondsViewedOnsite        0
SalePrice                423
SecondsPerView             0
SecondsPerViewOnsite      93
SecondsPerViewOffsite    185
dtype: int64

In [42]:
df.isnull().sum().sum()

188

In [48]:
df.dropna(subset=['SalePrice','SecondsPerViewOnsite','SecondsPerViewOffsite'], inplace=True)

In [49]:
df.isnull().sum().sum()

0

## PART A | Linear Regression

In [50]:
smf.ols(formula = 'GrossRevenue ~ TotalViews + Presenter', data = df).fit().summary()




ValueError: shapes (3744,1807) and (3744,1807) not aligned: 1807 (dim 1) != 3744 (dim 0)