In [1]:
## Import numpy, pandas, matplotlib, seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Import Logistic regression and SVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

## Import Confusion matrix, plot confusion matrix, classification report, accuracy score, auc-roc score and roc curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

## Import Label encoder, label binarizer and Standard Scaler
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
lb = preprocessing.LabelBinarizer()
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

## ================================ ##
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
df = pd.read_csv('Train_data.csv')

In [3]:
df.shape

(4571, 9)

In [4]:
df.describe()


Unnamed: 0,impressions,clicks,cost,conversions,revenue
count,4571.0,4571.0,4571.0,4571.0,4571.0
mean,92.635747,43.340844,2.800834,1.339094,231.86534
std,198.349737,100.164913,14.361618,5.223922,1091.742763
min,1.0,0.0,0.0,0.0,0.0
25%,6.0,2.0,0.04,0.0,0.0
50%,16.0,7.0,0.21,0.0,0.0
75%,84.0,38.0,1.35,1.0,92.6
max,3239.0,1771.0,531.25,94.0,20515.41


In [5]:
df.isnull().sum()


date           0
campaign       0
adgroup        0
ad             0
impressions    0
clicks         0
cost           0
conversions    0
revenue        0
dtype: int64

In [6]:
df.columns

Index(['date', 'campaign', 'adgroup', 'ad', 'impressions', 'clicks', 'cost',
       'conversions', 'revenue'],
      dtype='object')

In [7]:
df.head()

Unnamed: 0,date,campaign,adgroup,ad,impressions,clicks,cost,conversions,revenue
0,01-08-2020,campaign 1,adgroup 1,ad 1,24,6,0.08,0,0.0
1,01-08-2020,campaign 1,adgroup 2,ad 1,1,0,0.0,0,0.0
2,01-08-2020,campaign 1,adgroup 3,ad 1,13,4,0.04,0,0.0
3,01-08-2020,campaign 1,adgroup 4,ad 1,5,4,0.08,0,0.0
4,01-08-2020,campaign 1,adgroup 1,ad 2,247,126,1.29,4,925.71


In [8]:
s = (df.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['date', 'campaign', 'adgroup', 'ad']


In [9]:
object_nunique = list(map(lambda col: df[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

[('campaign', 1), ('adgroup', 4), ('ad', 70), ('date', 212)]

In [10]:
# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if df[col].nunique() < 10]

# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)

Categorical columns that will be one-hot encoded: ['campaign', 'adgroup']

Categorical columns that will be dropped from the dataset: ['date', 'ad']


In [11]:
from sklearn.preprocessing import OneHotEncoder
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(df[low_cardinality_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = df.index


# Remove categorical columns (will replace with one-hot encoding)
num_X_train = df.drop(object_cols, axis=1)


# Use as many lines of code as you need!

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)


In [12]:
OH_X_train.head()

Unnamed: 0,impressions,clicks,cost,conversions,revenue,0,1,2,3,4
0,24,6,0.08,0,0.0,1.0,1.0,0.0,0.0,0.0
1,1,0,0.0,0,0.0,1.0,0.0,1.0,0.0,0.0
2,13,4,0.04,0,0.0,1.0,0.0,0.0,1.0,0.0
3,5,4,0.08,0,0.0,1.0,0.0,0.0,0.0,1.0
4,247,126,1.29,4,925.71,1.0,1.0,0.0,0.0,0.0


In [13]:
OH_X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4571 entries, 0 to 4570
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   impressions  4571 non-null   int64  
 1   clicks       4571 non-null   int64  
 2   cost         4571 non-null   float64
 3   conversions  4571 non-null   int64  
 4   revenue      4571 non-null   float64
 5   0            4571 non-null   float64
 6   1            4571 non-null   float64
 7   2            4571 non-null   float64
 8   3            4571 non-null   float64
 9   4            4571 non-null   float64
dtypes: float64(7), int64(3)
memory usage: 357.2 KB


In [14]:
y = OH_X_train.revenue

In [15]:
X = OH_X_train.drop('revenue',axis=1)

In [16]:
X.head()

Unnamed: 0,impressions,clicks,cost,conversions,0,1,2,3,4
0,24,6,0.08,0,1.0,1.0,0.0,0.0,0.0
1,1,0,0.0,0,1.0,0.0,1.0,0.0,0.0
2,13,4,0.04,0,1.0,0.0,0.0,1.0,0.0
3,5,4,0.08,0,1.0,0.0,0.0,0.0,1.0
4,247,126,1.29,4,1.0,1.0,0.0,0.0,0.0


In [17]:
y.head()

0      0.00
1      0.00
2      0.00
3      0.00
4    925.71
Name: revenue, dtype: float64

In [18]:
#X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      #train_size=0.8, test_size=0.2,
                                                      #random_state=0)

In [19]:
#eval_set = [(X_valid, y_valid)]

In [20]:
#import xgboost
#classifier=xgboost.XGBRegressor()

In [21]:
#print("X_train:",X_train.shape) 
#print("X_valid:",X_valid.shape) 
#print("y_train:",y_train.shape) 
#print("y_valid:",y_valid.shape) 

In [22]:
import xgboost
regressor=xgboost.XGBRegressor()


In [23]:
regressor=xgboost.XGBRegressor(learning_rate=0.004, max_depth=4,
             n_estimators=600, min_child_weight=4)

In [24]:
regressor.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.004, max_delta_step=0, max_depth=4,
             min_child_weight=4, missing=nan, monotone_constraints='()',
             n_estimators=600, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
dfn = pd.read_csv('Test_Data.csv')

In [26]:
dfn.shape

(318, 8)

In [27]:
dfn.describe()

Unnamed: 0,cost,impressions,clicks,conversions
count,318.0,318.0,318.0,318.0
mean,0.381352,67.144654,29.720126,0.805031
std,0.59148,101.07887,45.200014,1.80248
min,0.0,1.0,0.0,0.0
25%,0.02,4.0,2.0,0.0
50%,0.105,20.0,9.0,0.0
75%,0.49,103.0,40.75,1.0
max,3.53,589.0,276.0,15.0


In [28]:
dfn.isnull().sum()

date           0
campaign       0
adgroup        0
ad             0
cost           0
impressions    0
clicks         0
conversions    0
dtype: int64

In [29]:
dfn.columns

Index(['date', 'campaign', 'adgroup', 'ad', 'cost', 'impressions', 'clicks',
       'conversions'],
      dtype='object')

In [30]:
dfn.head()

Unnamed: 0,date,campaign,adgroup,ad,cost,impressions,clicks,conversions
0,01-03-2021,campaign 1,adgroup 1,ad 1,0.58,121,49,1
1,01-03-2021,campaign 1,adgroup 3,ad 1,0.17,22,12,0
2,01-03-2021,campaign 1,adgroup 4,ad 1,0.05,5,3,0
3,01-03-2021,campaign 1,adgroup 2,ad 1,0.01,2,1,0
4,01-03-2021,campaign 1,adgroup 2,ad 2,0.01,3,1,0


In [31]:
sn = (dfn.dtypes == 'object')
object_colsn = list(s[s].index)

print("Categorical variables:")
print(object_colsn)

Categorical variables:
['date', 'campaign', 'adgroup', 'ad']


In [32]:
object_nunique = list(map(lambda col: dfn[col].nunique(), object_colsn))
dn = dict(zip(object_colsn, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(dn.items(), key=lambda x: x[1])

[('campaign', 1), ('adgroup', 4), ('date', 15), ('ad', 16)]

In [33]:
# Columns that will be one-hot encoded
low_cardinality_colsn = [col for col in object_colsn if dfn[col].nunique() < 10]

# Columns that will be dropped from the dataset
high_cardinality_colsn = list(set(object_colsn)-set(low_cardinality_colsn))

print('Categorical columns that will be one-hot encoded:', low_cardinality_colsn)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_colsn)

Categorical columns that will be one-hot encoded: ['campaign', 'adgroup']

Categorical columns that will be dropped from the dataset: ['date', 'ad']


In [34]:
from sklearn.preprocessing import OneHotEncoder
OH_encodern = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_test = pd.DataFrame(OH_encodern.fit_transform(dfn[low_cardinality_colsn]))

# One-hot encoding removed index; put it back
OH_cols_test.index = dfn.index


# Remove categorical columns (will replace with one-hot encoding)
num_X_test = dfn.drop(object_colsn, axis=1)


# Use as many lines of code as you need!

OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

In [35]:
OH_X_test.head()

Unnamed: 0,cost,impressions,clicks,conversions,0,1,2,3,4
0,0.58,121,49,1,1.0,1.0,0.0,0.0,0.0
1,0.17,22,12,0,1.0,0.0,0.0,1.0,0.0
2,0.05,5,3,0,1.0,0.0,0.0,0.0,1.0
3,0.01,2,1,0,1.0,0.0,1.0,0.0,0.0
4,0.01,3,1,0,1.0,0.0,1.0,0.0,0.0


In [36]:
OH_X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   cost         318 non-null    float64
 1   impressions  318 non-null    int64  
 2   clicks       318 non-null    int64  
 3   conversions  318 non-null    int64  
 4   0            318 non-null    float64
 5   1            318 non-null    float64
 6   2            318 non-null    float64
 7   3            318 non-null    float64
 8   4            318 non-null    float64
dtypes: float64(6), int64(3)
memory usage: 22.5 KB


In [37]:
predn=regressor.predict(OH_X_test)

In [38]:
dfans = pd.DataFrame(predn)
dfans.columns = ["revenue"]

In [39]:
dfans.to_csv('revenue9.csv',index=False)

In [40]:
from sklearn.metrics import mean_squared_error
pred_train= regressor.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train)))

NameError: name 'X_train' is not defined

In [None]:
pred_test= regressor.predict(X_valid)
print(np.sqrt(mean_squared_error(y_valid,pred_test)))

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(regressor,X_train,y_train,cv=5).mean()