In [1]:
import pandas as pd
import re

In [2]:
# The file is '$'-delimited rather than comma-delimited, so pass sep='$'.
df = pd.read_csv('Automatic email content store.csv', sep='$')

In [3]:
# Preview the first five rows of the loaded data.
df.head()

Unnamed: 0,Date,to,raw
0,30/09/17,shubhi.prakash03@gmail.com,will be doing it by tomorrow
1,30/09/17,exampreparationonline@gmail.com,will be doing it by tomorrow
2,30/09/17,exampreparationonline@gmail.com,will be doing it by tomorrow
3,30/09/17,shubhi.prakash03@gmail.com,will be doing it by tomorrow
4,30/09/17,shubhi.prakash03@gmail.com,will be doing it by tomorrow


In [4]:
# List the column names. Note that the recipient column is named 'to ' with a
# trailing space, so it must be referenced exactly that way.
df.columns

Index(['Date', 'to ', 'raw'], dtype='object')

### To get all the unique email addresses

In [5]:
# Collect the distinct recipient addresses; the column name is 'to ' (with a
# trailing space), as reported by df.columns.
email = df['to '].unique()

In [6]:
# Display the array of distinct recipient addresses.
email

array(['shubhi.prakash03@gmail.com', 'exampreparationonline@gmail.com',
       'nidhisaini05@gmail.com', 'chowdhuryshaif@rediffmail.com',
       'abhishekguragol@gmail.com', 'sonuyannam@gmail.com',
       'sachinsn231@gmail.com', 'limithakeerthi@gmail.com',
       'ramthakkar07@gmail.com', 'pavithramurugan22@gmail.com',
       'saha.swapankumar1@gmail.com'], dtype=object)

### To remove the rows whose date is not in the 'dd/mm/yy' format

In [7]:
# Keep only the rows whose 'Date' is exactly in 'dd/mm/yy' form.
# A boolean mask is used instead of dropping rows inside a loop: dropping in
# place shifts the positions behind df.index[i] while df['Date'][i] still looks
# up by label, so after the first removal a later bad row would drop the wrong
# row or raise a KeyError/IndexError.
# astype(str) makes a missing date (NaN) fail the pattern instead of raising,
# and the trailing '$' rejects longer values such as 'dd/mm/yyyy'.
date_ok = df['Date'].astype(str).str.match(r'\d{2}/\d{2}/\d{2}$')
df = df[date_ok].reset_index(drop=True)

In [8]:
# Preview the dataframe after the rows with badly formatted dates were dropped.
df.head()

Unnamed: 0,Date,to,raw
0,30/09/17,shubhi.prakash03@gmail.com,will be doing it by tomorrow
1,30/09/17,exampreparationonline@gmail.com,will be doing it by tomorrow
2,30/09/17,exampreparationonline@gmail.com,will be doing it by tomorrow
3,30/09/17,shubhi.prakash03@gmail.com,will be doing it by tomorrow
4,30/09/17,shubhi.prakash03@gmail.com,will be doing it by tomorrow


### To find common words

In [9]:
# Stores, for each row, the words shared between dummy_words and the 'raw' column.
remark = []

In [10]:
# user-defined set of dummy (keyword) words to look for in each message
dummy_words = {'deadline', 'tell', 'me'}

In [11]:
# clean: strips every '?' from the words of a list and returns the cleaned words
def clean(a):
    """Return a new list with all '?' characters removed from each word of `a`.

    Words that contain no '?' are carried over unchanged, and the input list
    itself is not modified.
    """
    return [
        ''.join(token.split('?')) if '?' in token else token
        for token in a
    ]

In [12]:
# Take the message in row 30 as a sample and split it into words on whitespace.
word = df['raw'][30].split()
print(word)

['can', 'u', 'tell', 'me', 'the', 'details?']


In [13]:
# Example to check that the clean function works on the sample words.
print(clean(word))

['can', 'u', 'tell', 'me', 'the', 'details']


In [14]:
# Find, for every row, the dummy words that occur in its 'raw' message and
# record them in `remark` as a comma-separated string, or as the placeholder
# 'NO MATCHING WORDS' when none occur.
# The list is rebuilt from scratch so that re-running this cell never leaves
# `remark` longer than the dataframe.
remark = []
for message in df['raw']:
    # a non-string cell (e.g. NaN for an empty message) contributes no words
    tokens = clean(message.split()) if isinstance(message, str) else []
    # sorted() keeps the joined order stable; plain set order can differ
    # from one interpreter run to the next
    matched = sorted(dummy_words.intersection(tokens))
    remark.append(','.join(matched) if matched else 'NO MATCHING WORDS')

In [15]:
# Add a new column to the existing dataframe to store the common words.
# Rows with no common words hold the placeholder 'NO MATCHING WORDS'.
df['Same_words'] = remark

In [16]:
# Display the dataframe with the new 'Same_words' column.
df

Unnamed: 0,Date,to,raw,Same_words
0,30/09/17,shubhi.prakash03@gmail.com,will be doing it by tomorrow,NO MATCHING WORDS
1,30/09/17,exampreparationonline@gmail.com,will be doing it by tomorrow,NO MATCHING WORDS
2,30/09/17,exampreparationonline@gmail.com,will be doing it by tomorrow,NO MATCHING WORDS
3,30/09/17,shubhi.prakash03@gmail.com,will be doing it by tomorrow,NO MATCHING WORDS
4,30/09/17,shubhi.prakash03@gmail.com,will be doing it by tomorrow,NO MATCHING WORDS
5,30/09/17,exampreparationonline@gmail.com,try to submit it with details,NO MATCHING WORDS
6,30/09/17,shubhi.prakash03@gmail.com,try to submit it with details,NO MATCHING WORDS
7,30/09/17,nidhisaini05@gmail.com,try to submit it with details,NO MATCHING WORDS
8,30/09/17,chowdhuryshaif@rediffmail.com,try to submit it with details,NO MATCHING WORDS
9,30/09/17,chowdhuryshaif@rediffmail.com,try to submit it with details,NO MATCHING WORDS


### Time delta in days

In [17]:
# Parse the date strings into datetimes; dayfirst=True reads the first number
# as the day rather than the month.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

In [18]:
# Check the first few converted values and the resulting dtype.
df['Date'].head()

0   2017-09-30
1   2017-09-30
2   2017-09-30
3   2017-09-30
4   2017-09-30
Name: Date, dtype: datetime64[ns]

In [19]:
# The user can supply their own date here (written as 'yyyy-mm-dd').
user_date = '2017-12-30'

In [20]:
# Difference between the date in the last row of df and the user-provided date
# (negative when the user-provided date is the later of the two).
diff = df['Date'].iloc[-1] - pd.to_datetime(user_date)

In [21]:
# Print the gap as a non-negative duration, whichever of the two dates is later.
print(abs(diff))

87 days 00:00:00
