In [124]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


df = pd.read_csv("train.csv")
campaign_data = pd.read_csv('campaign_data.csv')

In [125]:
df['date'] = pd.to_datetime(df['send_date'], format = "%d-%m-%Y %H:%M")
df.drop('send_date', axis = 1, inplace = True)
df.index = df['user_id']

# Train

In [126]:
split_date = min(df['date']) + pd.Timedelta(days = 115)
train_df = df.loc[df['date'] < split_date]
test_df = df.loc[df['date'] >= split_date]

In [127]:
train_df['no_of_emails'] =train_df.groupby('user_id').size()
train_df['cumulative_count'] = train_df.groupby('user_id').cumcount()

# returns timestamp object day of week and hour of day
def weekday(a):
    return a.dayofweek

def hourofday(a):
    return a.hour

train_df['day_of_week']= train_df['date'].apply(weekday)
train_df['hour_of_day']= train_df['date'].apply(hourofday)

train_df.fillna(0, inplace = True)
train_df.reset_index(drop = True, inplace=True)

# merge campagin data with emails
train_df = campaign_data.merge(train_df, on = 'campaign_id')
train_df['temp'] = 1

# Calculates the count received of each type of campaign - essentially whether or not they received that campaign
pivot_df = pd.pivot_table(train_df, values="temp", index="user_id", columns="campaign_id", aggfunc="count", fill_value=0).reset_index()
pivot_df.columns = ['user_id'] + ['campaign_' + str(col) for col in range(29,52)]
train_df = train_df.merge(pivot_df, on = 'user_id')

# Calculates the count received of each type of communication
pivot_df = pd.pivot_table(train_df, values="temp", index="user_id", columns="communication_type", aggfunc="count", fill_value=0).reset_index()
pivot_df.columns = ['user_id', 'conference_count', 'corporate_count','hackathon_count','newsletter_count','others_count','upcoming_events_count','webinar_count']
train_df = train_df.merge(pivot_df, on = 'user_id')

# calculate percentage of communcation type received for each user
train_df['conference_percent']=train_df['conference_count'] / train_df['no_of_emails']
train_df['corporate_percent']=train_df['corporate_count'] / train_df['no_of_emails']
train_df['hackathon_percent']=train_df['hackathon_count'] / train_df['no_of_emails']
train_df['newsletter_percent']=train_df['newsletter_count'] / train_df['no_of_emails']
train_df['others_percent']=train_df['others_count'] / train_df['no_of_emails']
train_df['upcoming_events_percent']=train_df['upcoming_events_count'] / train_df['no_of_emails']
train_df['webinar_percent']=train_df['webinar_count'] / train_df['no_of_emails']

# drop unneccesary columns
train_df.drop(['temp','conference_count', 'corporate_count','hackathon_count','newsletter_count','others_count','upcoming_events_count','webinar_count'], axis = 1, inplace = True)

# NLP
vectorizer = CountVectorizer(min_df = 1)

corpus = train_df['subject']
x = vectorizer.fit_transform(corpus)
x.toarray()
matrix1 = x.toarray()
vectorizer.vocabulary_.get('harvest')

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(matrix1)

asd = tfidf.toarray()
asd1 = pd.DataFrame(asd) 
features = vectorizer.get_feature_names() 
asd1.columns = features
train_df = pd.concat((train_df,asd1), axis = 1) 


Defaulting to column but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
Defaulting to column but this will raise an ambiguity error in a future version
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-c

# Test

In [128]:
test_df.head()

Unnamed: 0_level_0,id,user_id,campaign_id,is_open,is_click,date
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
134438,52_134438,134438,52,0,0,2017-11-02 12:53:00
231024,54_231024,231024,54,0,0,2017-12-01 20:15:00
65386,52_65386,65386,52,0,0,2017-11-02 12:36:00
224491,53_224491,224491,53,0,0,2017-11-06 22:33:00
109939,53_109939,109939,53,0,0,2017-11-06 22:39:00


In [129]:
test_df['no_of_emails'] = test_df.groupby('user_id').size()
test_df['cumulative_count'] = test_df.groupby('user_id').cumcount()

Defaulting to column but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
Defaulting to column but this will raise an ambiguity error in a future version
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [130]:
# returns timestamp object day of week and hour of day
def weekday(a):
    return a.dayofweek

def hourofday(a):
    return a.hour

test_df['day_of_week']= test_df['date'].apply(weekday)
test_df['hour_of_day']= test_df['date'].apply(hourofday)

test_df.fillna(0, inplace = True)
test_df.reset_index(drop = True, inplace=True)

# merge campagin data with emails
test_df  = campaign_data.merge(test_df, on = 'campaign_id')
test_df['temp'] = 1


# Calculates the count received of each type of campaign - essentially whether or not they received that campaign
pivot_df = pd.pivot_table(test_df , values="temp", index="user_id", columns="campaign_id", aggfunc="count", fill_value=0).reset_index()
pivot_df.columns = ['user_id'] + ['campaign_' + str(col) for col in range(52,55)]
test_df  = test_df.merge(pivot_df, on = 'user_id')


# Calculates the count received of each type of communication
pivot_df = pd.pivot_table(test_df, values="temp", index="user_id", columns="communication_type", aggfunc="count", fill_value=0).reset_index()
print('here\n\n\n\n',pivot_df.iloc[0])
pivot_df.columns = ['user_id', 'conference_count', 'newsletter_count']
test_df = test_df.merge(pivot_df, on = 'user_id')

# calculate percentage of communcation type received for each user
test_df['conference_percent']=test_df['conference_count'] / test_df['no_of_emails']
#test_df['corporate_percent']=test_df['corporate_count'] / test_df['no_of_emails']
#test_df['hackathon_percent']=test_df['hackathon_count'] / test_df['no_of_emails']
test_df['newsletter_percent']=test_df['newsletter_count'] / test_df['no_of_emails']
#test_df['others_percent']=test_df['others_count'] / test_df['no_of_emails']
#test_df['upcoming_events_percent']=test_df['upcoming_events_count'] / test_df['no_of_emails']
#test_df['webinar_percent']=test_df['webinar_count'] / test_df['no_of_emails']

# drop unneccesary columns
test_df.drop(['temp','conference_count', 'newsletter_count'], axis = 1, inplace = True)

# NLP
vectorizer = CountVectorizer(min_df = 1)

corpus = test_df['subject']
x = vectorizer.fit_transform(corpus)
x.toarray()
matrix1 = x.toarray()
vectorizer.vocabulary_.get('harvest')

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(matrix1)

asd = tfidf.toarray()
asd1 = pd.DataFrame(asd) 
features = vectorizer.get_feature_names() 
asd1.columns = features
test_df  = pd.concat((test_df ,asd1), axis = 1) 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


here



 communication_type
user_id       5
Conference    1
Newsletter    0
Name: 0, dtype: int64


In [131]:
train_df.drop(['communication_type','email_body','subject','email_url'], axis = 1, inplace = True)
test_df.drop(['communication_type','email_body','subject','email_url'], axis = 1, inplace = True)

In [132]:
train_df.drop('date', axis = 1, inplace = True)
test_df.drop('date', axis = 1, inplace = True)

In [133]:
train_df.shape

(761657, 168)

In [134]:
test_df.shape

(261534, 52)

In [135]:
df.shape


(1023191, 6)

In [136]:
761657 + 261534

1023191

In [137]:
to_drop = list(set(test_df.columns) - set(train_df.columns))
test_df.drop(to_drop, axis = 1, inplace = True)

In [138]:
to_drop = list(set(train_df.columns) - set(test_df.columns))
train_df.drop(to_drop, axis = 1, inplace = True)

In [139]:
train_df.shape

(761657, 34)

In [140]:
test_df.shape

(261534, 34)

In [141]:
test_df.head()

Unnamed: 0,campaign_id,total_links,no_of_internal_links,no_of_images,no_of_sections,id,user_id,is_open,is_click,no_of_emails,cumulative_count,day_of_week,hour_of_day,conference_percent,newsletter_percent,2017,and,artificial,datahack,days,go,hackathons,in,india,intelligence,just,learning,machine,new,newsletter,now,register,summit,to
0,52,67,62,10,4,52_134438,134438,0,0,3,0,3,12,0.333333,0.666667,0.268543,0.268543,0.0,0.268543,0.0,0.0,0.268543,0.0,0.0,0.0,0.0,0.0,0.0,0.268543,0.268543,0.0,0.0,0.268543,0.0
1,53,104,100,13,1,53_134438,134438,0,0,3,1,0,23,0.333333,0.666667,0.0,0.0,0.262929,0.0,0.262929,0.262929,0.0,0.262929,0.262929,0.262929,0.262929,0.262929,0.262929,0.0,0.0,0.262929,0.262929,0.0,0.262929
2,54,63,58,8,4,54_134438,134438,0,0,3,2,4,20,0.333333,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,52,67,62,10,4,52_65386,65386,0,0,1,0,3,12,0.0,1.0,0.268543,0.268543,0.0,0.268543,0.0,0.0,0.268543,0.0,0.0,0.0,0.0,0.0,0.0,0.268543,0.268543,0.0,0.0,0.268543,0.0
4,52,67,62,10,4,52_32415,32415,0,0,2,0,3,13,0.5,0.5,0.268543,0.268543,0.0,0.268543,0.0,0.0,0.268543,0.0,0.0,0.0,0.0,0.0,0.0,0.268543,0.268543,0.0,0.0,0.268543,0.0


In [142]:
train_df.to_csv("train_export.csv", index = False)
test_df.to_csv("test_export.csv", index = False)