##**Step-1: Install/ Import the required Python Packages/ Libraries, Mount the Google Drive and read and check the Data and Customer files**

**1) Install/ Import the required Python Packages/ Libraries**

In [None]:
#Import required python packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
pip install category_encoders



**2) Mounting the Google Drive**

In [None]:
# Mount the Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**3) Read the Data file and Customer file and check**

In [None]:
# Read the Diabetes Data from .csv file and check the data shape (number of Rows and Columns)
train_df = pd.read_csv('gdrive/My Drive/SRM-Internship-2021-Latest/Marketplace-Features-Creation-Project/05-Output-cum-Input-Files3/Mkt_Features_Training_Data.csv')
cust_df = pd.read_csv('gdrive/My Drive/SRM-Internship-2021-Latest/Marketplace-Features-Creation-Project/05-Output-cum-Input-Files3/Mkt_Features_Customer_Data.csv')
print(train_df.shape)
print(cust_df.shape)

(23835, 10)
(10215, 9)


In [None]:
train_df.info()
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23835 entries, 0 to 23834
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   UserID                         23835 non-null  object
 1   No_of_days_Visited_7_Days      23835 non-null  int64 
 2   No_Of_Products_Viewed_15_Days  23835 non-null  int64 
 3   User_Vintage                   23835 non-null  int64 
 4   Most_Viewed_product_15_Days    23835 non-null  object
 5   Most_Active_OS                 23835 non-null  object
 6   Recently_Viewed_Product        23835 non-null  object
 7   Pageloads_last_7_days          23835 non-null  int64 
 8   Clicks_last_7_days             23835 non-null  int64 
 9   Target_Customers               23835 non-null  int64 
dtypes: int64(6), object(4)
memory usage: 1.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10215 entries, 0 to 10214
Data columns (total 9 columns):
 #   Column              

In [None]:
train_df['Target_Customers'].value_counts()

0    23602
1      233
Name: Target_Customers, dtype: int64

##**Step-2: Combine the Train and Test File**

In [None]:
train_df['train']=1
cust_df['cust'] = 0

In [None]:
print(train_df.shape)
print(cust_df.shape)

(23835, 11)
(10215, 10)


In [None]:
train_df.info()
print()
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23835 entries, 0 to 23834
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   UserID                         23835 non-null  object
 1   No_of_days_Visited_7_Days      23835 non-null  int64 
 2   No_Of_Products_Viewed_15_Days  23835 non-null  int64 
 3   User_Vintage                   23835 non-null  int64 
 4   Most_Viewed_product_15_Days    23835 non-null  object
 5   Most_Active_OS                 23835 non-null  object
 6   Recently_Viewed_Product        23835 non-null  object
 7   Pageloads_last_7_days          23835 non-null  int64 
 8   Clicks_last_7_days             23835 non-null  int64 
 9   Target_Customers               23835 non-null  int64 
 10  train                          23835 non-null  int64 
dtypes: int64(7), object(4)
memory usage: 2.0+ MB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10215 entries, 0 to 102

In [None]:
combined_df  = pd.concat([train_df, cust_df])
combined_df.shape

(34050, 12)

In [None]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34050 entries, 0 to 10214
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   UserID                         34050 non-null  object 
 1   No_of_days_Visited_7_Days      34050 non-null  int64  
 2   No_Of_Products_Viewed_15_Days  34050 non-null  int64  
 3   User_Vintage                   34050 non-null  int64  
 4   Most_Viewed_product_15_Days    34050 non-null  object 
 5   Most_Active_OS                 34050 non-null  object 
 6   Recently_Viewed_Product        34050 non-null  object 
 7   Pageloads_last_7_days          34050 non-null  int64  
 8   Clicks_last_7_days             34050 non-null  int64  
 9   Target_Customers               23835 non-null  float64
 10  train                          23835 non-null  float64
 11  cust                           10215 non-null  float64
dtypes: float64(3), int64(5), object(4)
memory usag

##**Step-3: Check the Data Types of the Columns as well as Missing Data**

**1) Execute the "info()" command and check datatypes of the Columns and Missing Data**

In [None]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34050 entries, 0 to 10214
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   UserID                         34050 non-null  object 
 1   No_of_days_Visited_7_Days      34050 non-null  int64  
 2   No_Of_Products_Viewed_15_Days  34050 non-null  int64  
 3   User_Vintage                   34050 non-null  int64  
 4   Most_Viewed_product_15_Days    34050 non-null  object 
 5   Most_Active_OS                 34050 non-null  object 
 6   Recently_Viewed_Product        34050 non-null  object 
 7   Pageloads_last_7_days          34050 non-null  int64  
 8   Clicks_last_7_days             34050 non-null  int64  
 9   Target_Customers               23835 non-null  float64
 10  train                          23835 non-null  float64
 11  cust                           10215 non-null  float64
dtypes: float64(3), int64(5), object(4)
memory usag

**2) Summarize the columnwise Missing Data**

In [None]:
combined_df.isnull().sum()

UserID                               0
No_of_days_Visited_7_Days            0
No_Of_Products_Viewed_15_Days        0
User_Vintage                         0
Most_Viewed_product_15_Days          0
Most_Active_OS                       0
Recently_Viewed_Product              0
Pageloads_last_7_days                0
Clicks_last_7_days                   0
Target_Customers                 10215
train                            10215
cust                             23835
dtype: int64

**Observations:**
* **a) We have the missing data, hence we need to handle this.**

##**Step-4: Check on Data Preprocessing applicability (Initial)**


###**Column-0: UserID**

In [None]:
combined_df['UserID'].value_counts()

U120966    1
U111174    1
U109623    1
U122181    1
U132578    1
          ..
U102456    1
U118462    1
U116269    1
U115839    1
U114632    1
Name: UserID, Length: 34050, dtype: int64

**Observations:**
* a) Data in this column will not be contributing to the prediction of the Depenedent variable

**Decsion:**

**We will be dropping this column**

**Action:**

In [None]:
combined_df.drop(['UserID'], axis = 1, inplace = True)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34050 entries, 0 to 10214
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   No_of_days_Visited_7_Days      34050 non-null  int64  
 1   No_Of_Products_Viewed_15_Days  34050 non-null  int64  
 2   User_Vintage                   34050 non-null  int64  
 3   Most_Viewed_product_15_Days    34050 non-null  object 
 4   Most_Active_OS                 34050 non-null  object 
 5   Recently_Viewed_Product        34050 non-null  object 
 6   Pageloads_last_7_days          34050 non-null  int64  
 7   Clicks_last_7_days             34050 non-null  int64  
 8   Target_Customers               23835 non-null  float64
 9   train                          23835 non-null  float64
 10  cust                           10215 non-null  float64
dtypes: float64(3), int64(5), object(3)
memory usage: 3.1+ MB


###**Column-3: Most_Viewed_product_15_Days**

In [None]:
combined_df['Most_Viewed_product_15_Days'].value_counts()

Product101    9587
PR100017      1858
PR100102      1097
PR100166       749
PR100145       373
              ... 
PR108099         1
PR102804         1
PR100617         1
PR101119         1
PR107238         1
Name: Most_Viewed_product_15_Days, Length: 1781, dtype: int64

**Observations:**
* a) Data in this column is of "Object" or "String" datatype. Also, the data levels are "Nominal" Type.

**Decsion:**

**We will be converting the data in this column into Numerical values using Nominal Type method "Binary Encoding" as we want to have minimum number of columns.**

**Action:**

In [None]:
#Create object for binary encoding
import category_encoders as ce
encoder1= ce.BinaryEncoder(cols=['Most_Viewed_product_15_Days'],return_df=True)

In [None]:
#Fit and Transform Data 
Most_Viewed_product_15_Days_encoded=encoder1.fit_transform(combined_df['Most_Viewed_product_15_Days']) 
Most_Viewed_product_15_Days_encoded

Unnamed: 0,Most_Viewed_product_15_Days_0,Most_Viewed_product_15_Days_1,Most_Viewed_product_15_Days_2,Most_Viewed_product_15_Days_3,Most_Viewed_product_15_Days_4,Most_Viewed_product_15_Days_5,Most_Viewed_product_15_Days_6,Most_Viewed_product_15_Days_7,Most_Viewed_product_15_Days_8,Most_Viewed_product_15_Days_9,Most_Viewed_product_15_Days_10
0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,1,1
3,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
10210,0,0,0,0,0,0,0,0,0,1,0
10211,0,0,0,0,0,0,0,0,0,1,0
10212,0,1,1,1,0,0,0,1,0,1,1
10213,0,0,0,0,1,1,1,1,0,0,1


###**Column-3: Married**

###**Column-4: Most_Active_OS**

In [None]:
combined_df['Most_Active_OS'].value_counts()

WINDOWS      20044
ANDROID       9847
MAC OS X      2422
LINUX          924
IOS            492
UBUNTU         295
CHROME OS       14
FEDORA          12
Name: Most_Active_OS, dtype: int64

**Observations:**
* a) Data in this column is of "Object" or "String" datatype. Also, the data levels are "Nominal" Type.

**Decsion:**

**We will be converting the data in this column into Numerical values using Nominal Type method "Binary Encoding" as we want to have minimum number of columns.**

**Action:**

In [None]:
#Create object for binary encoding
import category_encoders as ce
encoder2= ce.BinaryEncoder(cols=['Most_Active_OS'],return_df=True)

In [None]:
#Fit and Transform Data 
Most_Active_OS_encoded=encoder2.fit_transform(combined_df['Most_Active_OS']) 
Most_Active_OS_encoded

Unnamed: 0,Most_Active_OS_0,Most_Active_OS_1,Most_Active_OS_2,Most_Active_OS_3
0,0,0,0,1
1,0,0,1,0
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
10210,0,0,0,1
10211,0,0,1,0
10212,0,0,0,1
10213,0,0,0,1


###**Column-5: Recently_Viewed_Product**


In [None]:
combined_df['Recently_Viewed_Product'].value_counts()

Product101    4603
PR100017      1387
PR100102      1256
PR100166       736
PR100390       590
              ... 
PR103113         1
PR100914         1
PR111017         1
PR105599         1
PR100389         1
Name: Recently_Viewed_Product, Length: 2163, dtype: int64

In [None]:
#Create object for binary encoding
import category_encoders as ce
encoder3= ce.BinaryEncoder(cols=['Recently_Viewed_Product'],return_df=True)

In [None]:
#Fit and Transform Data 
Recently_Viewed_Product_encoded=encoder3.fit_transform(combined_df['Recently_Viewed_Product']) 
Recently_Viewed_Product_encoded

Unnamed: 0,Recently_Viewed_Product_0,Recently_Viewed_Product_1,Recently_Viewed_Product_2,Recently_Viewed_Product_3,Recently_Viewed_Product_4,Recently_Viewed_Product_5,Recently_Viewed_Product_6,Recently_Viewed_Product_7,Recently_Viewed_Product_8,Recently_Viewed_Product_9,Recently_Viewed_Product_10,Recently_Viewed_Product_11
0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,1,1
3,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10210,0,0,1,1,0,0,0,1,1,0,1,0
10211,0,0,0,0,1,1,0,0,1,1,0,0
10212,0,0,0,1,1,0,1,0,1,0,0,0
10213,0,0,0,0,0,1,1,1,1,0,1,0


In [None]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34050 entries, 0 to 10214
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   No_of_days_Visited_7_Days      34050 non-null  int64  
 1   No_Of_Products_Viewed_15_Days  34050 non-null  int64  
 2   User_Vintage                   34050 non-null  int64  
 3   Most_Viewed_product_15_Days    34050 non-null  object 
 4   Most_Active_OS                 34050 non-null  object 
 5   Recently_Viewed_Product        34050 non-null  object 
 6   Pageloads_last_7_days          34050 non-null  int64  
 7   Clicks_last_7_days             34050 non-null  int64  
 8   Target_Customers               23835 non-null  float64
 9   train                          23835 non-null  float64
 10  cust                           10215 non-null  float64
dtypes: float64(3), int64(5), object(3)
memory usage: 3.1+ MB


In [None]:
combined_df.drop(['Most_Viewed_product_15_Days', 'Most_Active_OS','Recently_Viewed_Product',], axis = 1, inplace = True)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34050 entries, 0 to 10214
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   No_of_days_Visited_7_Days      34050 non-null  int64  
 1   No_Of_Products_Viewed_15_Days  34050 non-null  int64  
 2   User_Vintage                   34050 non-null  int64  
 3   Pageloads_last_7_days          34050 non-null  int64  
 4   Clicks_last_7_days             34050 non-null  int64  
 5   Target_Customers               23835 non-null  float64
 6   train                          23835 non-null  float64
 7   cust                           10215 non-null  float64
dtypes: float64(3), int64(5)
memory usage: 2.3 MB


In [None]:
combined_df = pd.concat([combined_df,Most_Viewed_product_15_Days_encoded, Most_Active_OS_encoded,Recently_Viewed_Product_encoded], axis=1)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34050 entries, 0 to 10214
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   No_of_days_Visited_7_Days       34050 non-null  int64  
 1   No_Of_Products_Viewed_15_Days   34050 non-null  int64  
 2   User_Vintage                    34050 non-null  int64  
 3   Pageloads_last_7_days           34050 non-null  int64  
 4   Clicks_last_7_days              34050 non-null  int64  
 5   Target_Customers                23835 non-null  float64
 6   train                           23835 non-null  float64
 7   cust                            10215 non-null  float64
 8   Most_Viewed_product_15_Days_0   34050 non-null  int64  
 9   Most_Viewed_product_15_Days_1   34050 non-null  int64  
 10  Most_Viewed_product_15_Days_2   34050 non-null  int64  
 11  Most_Viewed_product_15_Days_3   34050 non-null  int64  
 12  Most_Viewed_product_15_Days_4   

##**Step-7: Seggregate the Train and Test Data**

In [None]:
train_df1 = combined_df[combined_df["train"] == 1]
cust_df1 = combined_df[combined_df["cust"] == 0]
train_df1.drop(["train", "cust"], axis=1, inplace=True)
cust_df1.drop(["cust", "train", "Target_Customers"], axis=1, inplace=True)

In [None]:
train_df1.shape

(23835, 33)

In [None]:
cust_df1.shape

(10215, 32)

##**Step-8: Slice X and y Values**

In [None]:
X = train_df1.drop(['Target_Customers'], axis = 1)
y = train_df1['Target_Customers']
X.head()

Unnamed: 0,No_of_days_Visited_7_Days,No_Of_Products_Viewed_15_Days,User_Vintage,Pageloads_last_7_days,Clicks_last_7_days,Most_Viewed_product_15_Days_0,Most_Viewed_product_15_Days_1,Most_Viewed_product_15_Days_2,Most_Viewed_product_15_Days_3,Most_Viewed_product_15_Days_4,Most_Viewed_product_15_Days_5,Most_Viewed_product_15_Days_6,Most_Viewed_product_15_Days_7,Most_Viewed_product_15_Days_8,Most_Viewed_product_15_Days_9,Most_Viewed_product_15_Days_10,Most_Active_OS_0,Most_Active_OS_1,Most_Active_OS_2,Most_Active_OS_3,Recently_Viewed_Product_0,Recently_Viewed_Product_1,Recently_Viewed_Product_2,Recently_Viewed_Product_3,Recently_Viewed_Product_4,Recently_Viewed_Product_5,Recently_Viewed_Product_6,Recently_Viewed_Product_7,Recently_Viewed_Product_8,Recently_Viewed_Product_9,Recently_Viewed_Product_10,Recently_Viewed_Product_11
0,1,2,776,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,82,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1,1,300,1,2,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
3,0,1,404,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
4,4,7,80,14,20,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1


In [None]:
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Target_Customers, dtype: float64

In [None]:
train_df1.columns

Index(['No_of_days_Visited_7_Days', 'No_Of_Products_Viewed_15_Days',
       'User_Vintage', 'Pageloads_last_7_days', 'Clicks_last_7_days',
       'Target_Customers', 'Most_Viewed_product_15_Days_0',
       'Most_Viewed_product_15_Days_1', 'Most_Viewed_product_15_Days_2',
       'Most_Viewed_product_15_Days_3', 'Most_Viewed_product_15_Days_4',
       'Most_Viewed_product_15_Days_5', 'Most_Viewed_product_15_Days_6',
       'Most_Viewed_product_15_Days_7', 'Most_Viewed_product_15_Days_8',
       'Most_Viewed_product_15_Days_9', 'Most_Viewed_product_15_Days_10',
       'Most_Active_OS_0', 'Most_Active_OS_1', 'Most_Active_OS_2',
       'Most_Active_OS_3', 'Recently_Viewed_Product_0',
       'Recently_Viewed_Product_1', 'Recently_Viewed_Product_2',
       'Recently_Viewed_Product_3', 'Recently_Viewed_Product_4',
       'Recently_Viewed_Product_5', 'Recently_Viewed_Product_6',
       'Recently_Viewed_Product_7', 'Recently_Viewed_Product_8',
       'Recently_Viewed_Product_9', 'Recently_Viewed

In [None]:
columnNames=['No_of_days_Visited_7_Days', 'No_Of_Products_Viewed_15_Days',
       'User_Vintage', 'Pageloads_last_7_days', 'Clicks_last_7_days',
       'Most_Viewed_product_15_Days_0',
       'Most_Viewed_product_15_Days_1', 'Most_Viewed_product_15_Days_2',
       'Most_Viewed_product_15_Days_3', 'Most_Viewed_product_15_Days_4',
       'Most_Viewed_product_15_Days_5', 'Most_Viewed_product_15_Days_6',
       'Most_Viewed_product_15_Days_7', 'Most_Viewed_product_15_Days_8',
       'Most_Viewed_product_15_Days_9', 'Most_Viewed_product_15_Days_10',
       'Most_Active_OS_0', 'Most_Active_OS_1', 'Most_Active_OS_2',
       'Most_Active_OS_3', 'Recently_Viewed_Product_0',
       'Recently_Viewed_Product_1', 'Recently_Viewed_Product_2',
       'Recently_Viewed_Product_3', 'Recently_Viewed_Product_4',
       'Recently_Viewed_Product_5', 'Recently_Viewed_Product_6',
       'Recently_Viewed_Product_7', 'Recently_Viewed_Product_8',
       'Recently_Viewed_Product_9', 'Recently_Viewed_Product_10',
       'Recently_Viewed_Product_11']

In [None]:
from sklearn.preprocessing import StandardScaler
std_scaler_object = preprocessing.StandardScaler()
X1 = std_scaler_object.fit_transform(X)
X1 = pd.DataFrame(X1 , columns = columnNames)
X1.head()

Unnamed: 0,No_of_days_Visited_7_Days,No_Of_Products_Viewed_15_Days,User_Vintage,Pageloads_last_7_days,Clicks_last_7_days,Most_Viewed_product_15_Days_0,Most_Viewed_product_15_Days_1,Most_Viewed_product_15_Days_2,Most_Viewed_product_15_Days_3,Most_Viewed_product_15_Days_4,Most_Viewed_product_15_Days_5,Most_Viewed_product_15_Days_6,Most_Viewed_product_15_Days_7,Most_Viewed_product_15_Days_8,Most_Viewed_product_15_Days_9,Most_Viewed_product_15_Days_10,Most_Active_OS_0,Most_Active_OS_1,Most_Active_OS_2,Most_Active_OS_3,Recently_Viewed_Product_0,Recently_Viewed_Product_1,Recently_Viewed_Product_2,Recently_Viewed_Product_3,Recently_Viewed_Product_4,Recently_Viewed_Product_5,Recently_Viewed_Product_6,Recently_Viewed_Product_7,Recently_Viewed_Product_8,Recently_Viewed_Product_9,Recently_Viewed_Product_10,Recently_Viewed_Product_11
0,0.36416,-0.031822,0.824883,-0.067602,-0.114764,-0.200257,-0.340705,-0.50118,-0.560698,-0.608911,-0.672,-0.654294,-0.818997,-0.785721,-1.327237,1.309345,-0.021488,-0.345821,-0.673644,0.764154,0.0,-0.284607,-0.44434,-0.558758,-0.659202,-0.760164,-0.74192,-0.817425,-0.857893,-1.227211,-1.151541,0.948982
1,-0.667425,-0.600386,-0.690322,-0.262251,-0.114764,-0.200257,-0.340705,-0.50118,-0.560698,-0.608911,-0.672,-0.654294,-0.818997,-0.785721,0.753445,-0.763741,-0.021488,-0.345821,1.484463,-1.308637,0.0,-0.284607,-0.44434,-0.558758,-0.659202,-0.760164,-0.74192,-0.817425,-0.857893,-1.227211,0.868402,-1.053761
2,0.36416,-0.316104,-0.214364,-0.067602,-0.040239,-0.200257,-0.340705,-0.50118,-0.560698,-0.608911,-0.672,-0.654294,-0.818997,-0.785721,0.753445,1.309345,-0.021488,-0.345821,-0.673644,0.764154,0.0,-0.284607,-0.44434,-0.558758,-0.659202,-0.760164,-0.74192,-0.817425,-0.857893,-1.227211,0.868402,0.948982
3,-0.667425,-0.316104,0.012698,-0.262251,-0.114764,-0.200257,-0.340705,-0.50118,-0.560698,-0.608911,-0.672,-0.654294,-0.818997,1.272716,-1.327237,-0.763741,-0.021488,-0.345821,-0.673644,0.764154,0.0,-0.284607,-0.44434,-0.558758,-0.659202,-0.760164,-0.74192,-0.817425,-0.857893,0.814856,-1.151541,-1.053761
4,3.458916,1.389591,-0.694688,2.462827,0.630485,-0.200257,-0.340705,-0.50118,-0.560698,-0.608911,-0.672,-0.654294,-0.818997,-0.785721,-1.327237,1.309345,-0.021488,-0.345821,-0.673644,0.764154,0.0,-0.284607,-0.44434,-0.558758,-0.659202,-0.760164,-0.74192,-0.817425,-0.857893,0.814856,-1.151541,0.948982


##**Step-9: Execute Train-Test-Split Command and Verify**

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size = 0.2, random_state = 66)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(19068, 32)
(19068,)
(4767, 32)
(4767,)


##**Step-10: Learn the Data and Predict the dependent Variable values for the "X_test"data using "SVC()" algorithm**

In [None]:
from sklearn.linear_model import LogisticRegression
#create an instance and fit the model 
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)

LogisticRegression()

In [None]:
#predictions
y_pred = logmodel.predict(X_test)

In [None]:
lr_Train_acc=logmodel.score(X_train,y_train)
lr_Test_acc=logmodel.score(X_test,y_test)

##**Step-11: Calculate the Accuracy of the Model**

In [None]:
print('Accuracy on training set:',lr_Train_acc)
print('Accuracy on test set:',lr_Test_acc)

Accuracy on training set: 0.9942836165303126
Accuracy on test set: 0.9930774071743235


##**Step-12: Display the Confusion Matrix and Classification Report of the Model**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  

[[4711    8]
 [  25   23]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      4719
         1.0       0.74      0.48      0.58        48

    accuracy                           0.99      4767
   macro avg       0.87      0.74      0.79      4767
weighted avg       0.99      0.99      0.99      4767



##**Step-13: SVC Algorithm Parameters Fine Tuning using GridSearch CV Method**

In [None]:
model_params = {
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    },
 }

In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X1, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,logistic_regression,0.993665,{'C': 5}


In [None]:
# Logistic Regression
lr_grid_acc = cross_val_score(LogisticRegression(C=5, solver='liblinear',multi_class='auto'),X1, y, cv=5)
print("lr_grid_acc (CV_based) :", lr_grid_acc)
lr_grid_acc_avg=np.average(lr_grid_acc)
print()
print("lr_grid_acc_avg : ", lr_grid_acc_avg)

lr_grid_acc (CV_based) : [0.99496539 0.99265786 0.99517516 0.99286763 0.99265786]

lr_grid_acc_avg :  0.993664778686805


In [None]:
from sklearn.preprocessing import StandardScaler
std_scaler_object = preprocessing.StandardScaler()
cust_df2 = std_scaler_object.fit_transform(cust_df1)
cust_df3 = pd.DataFrame(cust_df2 , columns = columnNames)
cust_df3.head()

Unnamed: 0,No_of_days_Visited_7_Days,No_Of_Products_Viewed_15_Days,User_Vintage,Pageloads_last_7_days,Clicks_last_7_days,Most_Viewed_product_15_Days_0,Most_Viewed_product_15_Days_1,Most_Viewed_product_15_Days_2,Most_Viewed_product_15_Days_3,Most_Viewed_product_15_Days_4,Most_Viewed_product_15_Days_5,Most_Viewed_product_15_Days_6,Most_Viewed_product_15_Days_7,Most_Viewed_product_15_Days_8,Most_Viewed_product_15_Days_9,Most_Viewed_product_15_Days_10,Most_Active_OS_0,Most_Active_OS_1,Most_Active_OS_2,Most_Active_OS_3,Recently_Viewed_Product_0,Recently_Viewed_Product_1,Recently_Viewed_Product_2,Recently_Viewed_Product_3,Recently_Viewed_Product_4,Recently_Viewed_Product_5,Recently_Viewed_Product_6,Recently_Viewed_Product_7,Recently_Viewed_Product_8,Recently_Viewed_Product_9,Recently_Viewed_Product_10,Recently_Viewed_Product_11
0,-0.658482,-0.308746,-0.504879,-0.24523,-0.06047,-0.251349,-0.364672,-0.492186,1.753915,1.658312,1.478249,1.512877,-0.816663,1.278581,0.737553,1.334581,-0.01714,-0.348628,-0.676322,0.7682,-0.108567,-0.312725,-0.444611,-0.559297,-0.676937,1.332031,-0.740244,-0.810681,-0.866396,-1.229253,0.841882,0.953971
1,-0.658482,-0.308746,-0.68877,-0.24523,-0.06047,-0.251349,-0.364672,-0.492186,-0.570153,1.658312,1.478249,-0.660992,-0.816663,1.278581,-1.355835,1.334581,-0.01714,-0.348628,-0.676322,0.7682,-0.108567,-0.312725,-0.444611,-0.559297,-0.676937,-0.750733,1.350905,-0.810681,1.154206,0.813502,0.841882,-1.04825
2,-0.658482,-0.308746,1.439111,-0.24523,-0.06047,-0.251349,2.742189,-0.492186,-0.570153,1.658312,-0.676476,-0.660992,1.224495,1.278581,0.737553,1.334581,-0.01714,-0.348628,-0.676322,0.7682,-0.108567,-0.312725,-0.444611,-0.559297,-0.676937,-0.750733,-0.740244,-0.810681,-0.866396,0.813502,0.841882,0.953971
3,0.408014,-0.308746,3.860341,-0.056456,-0.042954,-0.251349,-0.364672,2.031751,1.753915,1.658312,-0.676476,1.512877,1.224495,-0.782117,0.737553,1.334581,-0.01714,-0.348628,1.478585,-1.301745,-0.108567,-0.312725,-0.444611,-0.559297,-0.676937,-0.750733,1.350905,1.233531,1.154206,0.813502,-1.187815,0.953971
4,-0.658482,-0.5969,-0.66031,-0.24523,-0.06047,-0.251349,-0.364672,-0.492186,-0.570153,-0.603023,-0.676476,-0.660992,-0.816663,-0.782117,0.737553,-0.749299,-0.01714,-0.348628,1.478585,-1.301745,-0.108567,-0.312725,-0.444611,1.787958,-0.676937,1.332031,-0.740244,-0.810681,-0.866396,-1.229253,0.841882,-1.04825


In [None]:
#predictions for Customer Data
cust_data_pred = logmodel.predict(cust_df3)

In [None]:
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10215 entries, 0 to 10214
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   UserID                         10215 non-null  object
 1   No_of_days_Visited_7_Days      10215 non-null  int64 
 2   No_Of_Products_Viewed_15_Days  10215 non-null  int64 
 3   User_Vintage                   10215 non-null  int64 
 4   Most_Viewed_product_15_Days    10215 non-null  object
 5   Most_Active_OS                 10215 non-null  object
 6   Recently_Viewed_Product        10215 non-null  object
 7   Pageloads_last_7_days          10215 non-null  int64 
 8   Clicks_last_7_days             10215 non-null  int64 
 9   cust                           10215 non-null  int64 
dtypes: int64(6), object(4)
memory usage: 798.2+ KB


In [None]:
cust_df.drop(["cust"], axis=1, inplace=True)
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10215 entries, 0 to 10214
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   UserID                         10215 non-null  object
 1   No_of_days_Visited_7_Days      10215 non-null  int64 
 2   No_Of_Products_Viewed_15_Days  10215 non-null  int64 
 3   User_Vintage                   10215 non-null  int64 
 4   Most_Viewed_product_15_Days    10215 non-null  object
 5   Most_Active_OS                 10215 non-null  object
 6   Recently_Viewed_Product        10215 non-null  object
 7   Pageloads_last_7_days          10215 non-null  int64 
 8   Clicks_last_7_days             10215 non-null  int64 
dtypes: int64(5), object(4)
memory usage: 718.4+ KB


In [None]:
cust_df["Predicted_Target_Customers"]=cust_data_pred
print(cust_df.shape)
cust_df.head()

(10215, 10)


Unnamed: 0,UserID,No_of_days_Visited_7_Days,No_Of_Products_Viewed_15_Days,User_Vintage,Most_Viewed_product_15_Days,Most_Active_OS,Recently_Viewed_Product,Pageloads_last_7_days,Clicks_last_7_days,Predicted_Target_Customers
0,U124954,0,1,169,PR100047,WINDOWS,PR100047,0,0,0.0
1,U124892,0,1,85,PR100279,WINDOWS,PR100279,0,0,0.0
2,U123462,0,1,1057,PR100980,WINDOWS,Product101,0,0,0.0
3,U107624,1,1,2163,PR100498,ANDROID,PR100498,1,1,0.0
4,U122026,0,0,98,Product101,ANDROID,PR100023,0,0,0.0


In [None]:
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10215 entries, 0 to 10214
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   UserID                         10215 non-null  object 
 1   No_of_days_Visited_7_Days      10215 non-null  int64  
 2   No_Of_Products_Viewed_15_Days  10215 non-null  int64  
 3   User_Vintage                   10215 non-null  int64  
 4   Most_Viewed_product_15_Days    10215 non-null  object 
 5   Most_Active_OS                 10215 non-null  object 
 6   Recently_Viewed_Product        10215 non-null  object 
 7   Pageloads_last_7_days          10215 non-null  int64  
 8   Clicks_last_7_days             10215 non-null  int64  
 9   Predicted_Target_Customers     10215 non-null  float64
dtypes: float64(1), int64(5), object(4)
memory usage: 798.2+ KB


In [None]:
#df['Weight'] = df['Weight'].astype(int)
cust_df['Predicted_Target_Customers']=cust_df['Predicted_Target_Customers'].astype(int)
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10215 entries, 0 to 10214
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   UserID                         10215 non-null  object
 1   No_of_days_Visited_7_Days      10215 non-null  int64 
 2   No_Of_Products_Viewed_15_Days  10215 non-null  int64 
 3   User_Vintage                   10215 non-null  int64 
 4   Most_Viewed_product_15_Days    10215 non-null  object
 5   Most_Active_OS                 10215 non-null  object
 6   Recently_Viewed_Product        10215 non-null  object
 7   Pageloads_last_7_days          10215 non-null  int64 
 8   Clicks_last_7_days             10215 non-null  int64 
 9   Predicted_Target_Customers     10215 non-null  int64 
dtypes: int64(6), object(4)
memory usage: 798.2+ KB


In [None]:
cust_df.head()

Unnamed: 0,UserID,No_of_days_Visited_7_Days,No_Of_Products_Viewed_15_Days,User_Vintage,Most_Viewed_product_15_Days,Most_Active_OS,Recently_Viewed_Product,Pageloads_last_7_days,Clicks_last_7_days,Predicted_Target_Customers
0,U124954,0,1,169,PR100047,WINDOWS,PR100047,0,0,0
1,U124892,0,1,85,PR100279,WINDOWS,PR100279,0,0,0
2,U123462,0,1,1057,PR100980,WINDOWS,Product101,0,0,0
3,U107624,1,1,2163,PR100498,ANDROID,PR100498,1,1,0
4,U122026,0,0,98,Product101,ANDROID,PR100023,0,0,0


In [None]:
cust_df['Predicted_Target_Customers'].value_counts()

0    10154
1       61
Name: Predicted_Target_Customers, dtype: int64

In [None]:
from google.colab import files
cust_df.to_csv("gdrive/My Drive/SRM-Internship-2021-Latest/Marketplace-Features-Creation-Project/07-Final-Output-File/Mkt_Features_Customer_Data_with_Predicted_Values.csv", index = False)