In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


df = pd.read_csv("train.csv")
campaign_data = pd.read_csv('campaign_data.csv')

In [2]:
df['date'] = pd.to_datetime(df['send_date'], format = "%d-%m-%Y %H:%M")
df.drop('send_date', axis = 1, inplace = True)
df.index = df['user_id']

# Train

In [3]:
split_date = min(df['date']) + pd.Timedelta(days = 115)
train_df = df.loc[df['date'] < split_date]

In [4]:
train_df['no_of_emails'] =train_df.groupby('user_id').size()
train_df['cumulative_count'] = train_df.groupby('user_id').cumcount()

# returns timestamp object day of week and hour of day
def weekday(a):
    return a.dayofweek

def hourofday(a):
    return a.hour

train_df['day_of_week']= train_df['date'].apply(weekday)
train_df['hour_of_day']= train_df['date'].apply(hourofday)

train_df.fillna(0, inplace = True)
train_df.reset_index(drop = True, inplace=True)

# merge campagin data with emails
train_df = campaign_data.merge(train_df, on = 'campaign_id')
train_df['temp'] = 1

# Calculates the count received of each type of campaign - essentially whether or not they received that campaign
pivot_df = pd.pivot_table(train_df, values="temp", index="user_id", columns="campaign_id", aggfunc="count", fill_value=0).reset_index()
pivot_df.columns = ['user_id'] + ['campaign_' + str(col) for col in range(29,52)]
train_df = train_df.merge(pivot_df, on = 'user_id')

# Calculates the count received of each type of communication
pivot_df = pd.pivot_table(train_df, values="temp", index="user_id", columns="communication_type", aggfunc="count", fill_value=0).reset_index()
pivot_df.columns = ['user_id', 'conference_count', 'corporate_count','hackathon_count','newsletter_count','others_count','upcoming_events_count','webinar_count']
train_df = train_df.merge(pivot_df, on = 'user_id')

# calculate percentage of communcation type received for each user
train_df['conference_percent']=train_df['conference_count'] / train_df['no_of_emails']
train_df['corporate_percent']=train_df['corporate_count'] / train_df['no_of_emails']
train_df['hackathon_percent']=train_df['hackathon_count'] / train_df['no_of_emails']
train_df['newsletter_percent']=train_df['newsletter_count'] / train_df['no_of_emails']
train_df['others_percent']=train_df['others_count'] / train_df['no_of_emails']
train_df['upcoming_events_percent']=train_df['upcoming_events_count'] / train_df['no_of_emails']
train_df['webinar_percent']=train_df['webinar_count'] / train_df['no_of_emails']

# drop unneccesary columns
train_df.drop(['temp','conference_count', 'corporate_count','hackathon_count','newsletter_count','others_count','upcoming_events_count','webinar_count'], axis = 1, inplace = True)

# NLP
vectorizer = CountVectorizer(min_df = 1)

corpus = train_df['subject']
x = vectorizer.fit_transform(corpus)
x.toarray()
matrix1 = x.toarray()
vectorizer.vocabulary_.get('harvest')

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(matrix1)

asd = tfidf.toarray()
asd1 = pd.DataFrame(asd) 
features = vectorizer.get_feature_names() 
asd1.columns = features
train_df = pd.concat((train_df,asd1), axis = 1) 


Defaulting to column but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
Defaulting to column but this will raise an ambiguity error in a future version
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-c

# Test

In [5]:
test_df = df.loc[df['date'] >= split_date]

In [7]:
test_df.head()

Unnamed: 0,campaign_id,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,email_body,subject,email_url,id,user_id,is_open,is_click,date,train,campaign_29,campaign_30,campaign_31,campaign_32,campaign_33,campaign_34,campaign_35,campaign_36,campaign_37,campaign_38,campaign_39,campaign_40,campaign_41,campaign_42,campaign_43,campaign_44,campaign_45,campaign_46,campaign_47,campaign_48,campaign_49,campaign_50,campaign_51,campaign_52,campaign_53,campaign_54,conference_count,corporate_count,hackathon_count,newsletter_count,others_count,upcoming_events_count,webinar_count
0,29,Newsletter,67,61,12,3,"Dear AVians,\r\n \r\nWe are shaping up a super...",Sneak Peek: A look at the emerging data scienc...,http://r.newsletters.analyticsvidhya.com/7um44...,29_185580,185580,0,0,2017-07-01 18:01:00,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,2,1,1,0
1,30,Upcoming Events,18,14,7,1,"Dear AVians,\r\n \r\nAre your eager to know wh...",[July] Data Science Expert Meetups & Competiti...,http://r.newsletters.analyticsvidhya.com/7up0e...,30_185580,185580,0,0,2017-07-05 14:01:00,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,2,1,1,0
2,33,Others,7,3,1,1,Fireside Chat with DJ Patil - the master is he...,"[Delhi NCR] Fireside Chat with DJ Patil, Forme...",http://r.newsletters.analyticsvidhya.com/7uvlg...,33_185580,185580,0,0,2017-07-24 14:51:00,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,2,1,1,0
3,49,Conference,119,117,16,1,"Dear Patron,\r\n \r\nAnalytics Vidhya is on a ...",Register @ DataHack Summit 2017 - India's Larg...,http://r.newsletters.analyticsvidhya.com/o7ohw...,49_185580,185580,0,0,2017-09-28 15:02:00,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,2,1,1,0
4,52,Newsletter,67,62,10,4,"November Newsletter\r\n \r\nDear AVians,\r\n \...",[Newsletter] Stage for DataHack Summit 2017 is...,http://r.newsletters.analyticsvidhya.com/7vtb2...,52_185580,185580,0,0,2017-11-02 12:34:00,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,2,1,1,0


In [8]:
a = test_df.groupby('user_id').size()
a.head()

user_id
5     2
6     6
7    12
8    10
9     7
dtype: int64

In [6]:
test_df['no_of_emails'] = test_df.groupby('user_id').size()
test_df['cumulative_count'] = test_df.groupby('user_id').cumcount()

# returns timestamp object day of week and hour of day
def weekday(a):
    return a.dayofweek

def hourofday(a):
    return a.hour

test_df['day_of_week']= test_df['date'].apply(weekday)
test_df['hour_of_day']= test_df['date'].apply(hourofday)

test_df.fillna(0, inplace = True)
test_df.reset_index(drop = True, inplace=True)

# merge campagin data with emails
test_df  = campaign_data.merge(df, on = 'campaign_id')
test_df['train'] = 1

# Calculates the count received of each type of campaign - essentially whether or not they received that campaign
pivot_df = pd.pivot_table(test_df , values="train", index="user_id", columns="campaign_id", aggfunc="count", fill_value=0).reset_index()
pivot_df.columns = ['user_id'] + ['campaign_' + str(col) for col in range(29,55)]
test_df  = test_df.merge(pivot_df, on = 'user_id')

# Calculates the count received of each type of communication
pivot_df = pd.pivot_table(test_df, values="train", index="user_id", columns="communication_type", aggfunc="count", fill_value=0).reset_index()
pivot_df.columns = ['user_id', 'conference_count', 'corporate_count','hackathon_count','newsletter_count','others_count','upcoming_events_count','webinar_count']
test_df = test_df.merge(pivot_df, on = 'user_id')

# calculate percentage of communcation type received for each user
test_df['conference_percent']=test_df['conference_count'] / test_df['no_of_emails']
test_df['corporate_percent']=test_df['corporate_count'] / test_df['no_of_emails']
test_df['hackathon_percent']=test_df['hackathon_count'] / test_df['no_of_emails']
test_df['newsletter_percent']=test_df['newsletter_count'] / test_df['no_of_emails']
test_df['others_percent']=test_df['others_count'] / test_df['no_of_emails']
test_df['upcoming_events_percent']=test_df['upcoming_events_count'] / test_df['no_of_emails']
test_df['webinar_percent']=test_df['webinar_count'] / test_df['no_of_emails']

# drop unneccesary columns
test_df .drop(['train','conference_count', 'corporate_count','hackathon_count','newsletter_count','others_count','upcoming_events_count','webinar_count'], axis = 1, inplace = True)

# NLP
vectorizer = CountVectorizer(min_df = 1)

corpus = test_df['subject']
x = vectorizer.fit_transform(corpus)
x.toarray()
matrix1 = x.toarray()
vectorizer.vocabulary_.get('harvest')

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(matrix1)

asd = tfidf.toarray()
asd1 = pd.DataFrame(asd) 
features = vectorizer.get_feature_names() 
asd1.columns = features
test_df  = pd.concat((test_df ,asd1), axis = 1) 


Defaulting to column but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
Defaulting to column but this will raise an ambiguity error in a future version
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-c

KeyError: 'no_of_emails'

In [None]:
train_df.drop(['communication_type','email_body','subject','email_url'], axis = 1, inplace = True)
test_df.drop(['communication_type','email_body','subject','email_url'], axis = 1, inplace = True)

In [None]:
train_df.drop('date', axis = 1, inplace = True)
test_df.drop('date', axis = 1, inplace = True)

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
df.shape


In [None]:
761657 + 261534

In [None]:
to_drop = list(set(test.columns) - set(train_df.columns))
test_df.drop(to_drop, axis = 1, inplace = True)

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
test_df.head()

In [None]:
train_df.to_csv("train_export.csv", index = False)
test_df.to_csv("test_export.csv", index = False)