In [1]:
import pandas as pd

In [2]:
from dateutil.parser import parse

def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    """
    try: 
        parse(string, fuzzy=fuzzy)
        return True

    except ValueError:
        return False

In [3]:
def convert_messages_to_list(f):
    """
    Convert WhatsApp Messages from File to List
    
    :param f: FileBuffer, FileBuffer with the opened file in read mode
    :return lines: List of WhatsApp Messages in 2 dimensional list
    """
    lines = []
    for line in f.readlines():
        line_list = line.replace("\n","").split(",")
        if is_date(line_list[0]):
            lines.append([line_list[0],("".join(line_list[1:]))])
        else:
            lines[-1][-1] = lines[-1][-1] +' '+ line.replace("\n","")
    return lines

In [4]:
f = open("WhatsApp Chat with Young Data Professionals.txt", encoding = "utf8")
messages = convert_messages_to_list(f)


In [5]:
# change the format to dataframe
whatsapp_df = pd.DataFrame(messages,columns=['date','message'])
whatsapp_df

Unnamed: 0,date,message
0,12/04/2020,00:51 - Olayinka: Yes
1,12/04/2020,00:51 - +234 812 216 8133: Just as I suspecte...
2,12/04/2020,00:53 - Yusfat: P-value right?
3,12/04/2020,00:54 - Yusfat: sorry correlation...
4,12/04/2020,00:58 - +234 818 249 8479 left
...,...,...
40034,16/02/2022,15:07 - Muhammed: Kindly check your DM biko 👏
40035,16/02/2022,16:52 - Muhammed: <Media omitted>
40036,16/02/2022,16:53 - Muhammed: Guys who's up for this 👆? ...
40037,16/02/2022,17:08 - +234 703 806 5681: This is a Masters ...


In [6]:
# Extract the Time and the Message from the DataFrame
time_msg = whatsapp_df["message"].str.split("-", n = 1, expand = True)
whatsapp_df["time"] = time_msg[0]
whatsapp_df["message"] = time_msg[1]


In [7]:
whatsapp_df.head()

Unnamed: 0,date,message,time
0,12/04/2020,Olayinka: Yes,00:51
1,12/04/2020,+234 812 216 8133: Just as I suspected. A pos...,00:51
2,12/04/2020,Yusfat: P-value right?,00:53
3,12/04/2020,Yusfat: sorry correlation...,00:54
4,12/04/2020,+234 818 249 8479 left,00:58


In [8]:
#Extract the User Who Posted the Message and The Actual Message
user_msg = whatsapp_df["message"].str.split(":", n = 1, expand = True)
whatsapp_df["author"] = user_msg[0]
whatsapp_df["message"] = user_msg[1]

In [9]:
whatsapp_df.head()

Unnamed: 0,date,message,time,author
0,12/04/2020,Yes,00:51,Olayinka
1,12/04/2020,Just as I suspected. A positive relationship ...,00:51,+234 812 216 8133
2,12/04/2020,P-value right?,00:53,Yusfat
3,12/04/2020,sorry correlation...,00:54,Yusfat
4,12/04/2020,,00:58,+234 818 249 8479 left


In [10]:
whatsapp_df.describe()

Unnamed: 0,date,message,time,author
count,40039,39762,40039,40012
unique,666,31062,1438,499
top,01/02/2022,<Media omitted>,19:36,Fabiyi Opeyemi
freq,570,2661,83,5991


In [11]:
whatsapp_df['id'] = range(1, 1+len(whatsapp_df))
whatsapp_df.head()

Unnamed: 0,date,message,time,author,id
0,12/04/2020,Yes,00:51,Olayinka,1
1,12/04/2020,Just as I suspected. A positive relationship ...,00:51,+234 812 216 8133,2
2,12/04/2020,P-value right?,00:53,Yusfat,3
3,12/04/2020,sorry correlation...,00:54,Yusfat,4
4,12/04/2020,,00:58,+234 818 249 8479 left,5


In [13]:
whatsapp_df = whatsapp_df[["id","date","time","author","message"]]

whatsapp_df.head()

Unnamed: 0,id,date,time,author,message
0,1,12/04/2020,00:51,Olayinka,Yes
1,2,12/04/2020,00:51,+234 812 216 8133,Just as I suspected. A positive relationship ...
2,3,12/04/2020,00:53,Yusfat,P-value right?
3,4,12/04/2020,00:54,Yusfat,sorry correlation...
4,5,12/04/2020,00:58,+234 818 249 8479 left,


In [None]:
# # Drop Null Values i.e. for Messages which do not have a User e.g. "User Was Added To Group" e.t.c.
# whatsapp_df.dropna(inplace=True)

In [14]:
import re

def extract_url(message):
    """
    Regex to Extract URLs from Messages (Stackoverflow Help :smiley:)
    """
    url = r"(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)"
    urls_extract = whatsapp_df.message.str.extractall(url)
    url_id = []

    for i in range(0,len(urls_extract)):
    
        add_list = urls_extract.index[[i]][0][0] + 1
    
        url_id.append(add_list)
    urls_extract['id'] = url_id

    urls_extract.head()
    return urls_extract

In [15]:
links = extract_url(whatsapp_df["message"])
links.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,0,id
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1
85,0,https://databricks.com/sparkaisummit/north-ame...,86
94,0,https://info.microsoft.com/CE-AzureINFRA-WBNR-...,95
97,0,https://www.myjobmag.com/jobs/microsoft-nigeri...,98
99,0,https://bit.ly/dsnmlhack,100
310,0,https://us04web.zoom.us/j/6350982607,311


In [16]:
data = pd.merge(whatsapp_df, links, on = "id", how = "inner")
data.head()

Unnamed: 0,id,date,time,author,message,0
0,86,12/04/2020,22:43,+234 816 170 8710,Registration is free! This year's summit is v...,https://databricks.com/sparkaisummit/north-ame...
1,95,Tuesday,April 21 2020 13:00,16,00 Join this free virtual session. To creat...,https://info.microsoft.com/CE-AzureINFRA-WBNR-...
2,98,13/04/2020,16:39,+234 813 730 6439,https://www.myjobmag.com/jobs/microsoft-niger...,https://www.myjobmag.com/jobs/microsoft-nigeri...
3,100,13/04/2020,17:17,Fabiyi Opeyemi,Win 20GB data and more at DSN Algorithm Chall...,https://bit.ly/dsnmlhack
4,311,14/04/2020,10:51,Olayinka,Topic: Julius Abudu's Zoom Meeting Time: Apr ...,https://us04web.zoom.us/j/6350982607


In [28]:
data.columns = ['id', 'date', 'time', 'author', 'message', 'url']
data.head()


Unnamed: 0,id,date,time,author,message,url
0,86,12/04/2020,22:43,+234 816 170 8710,Registration is free! This year's summit is v...,https://databricks.com/sparkaisummit/north-ame...
1,95,Tuesday,April 21 2020 13:00,16,00 Join this free virtual session. To creat...,https://info.microsoft.com/CE-AzureINFRA-WBNR-...
2,98,13/04/2020,16:39,+234 813 730 6439,https://www.myjobmag.com/jobs/microsoft-niger...,https://www.myjobmag.com/jobs/microsoft-nigeri...
3,100,13/04/2020,17:17,Fabiyi Opeyemi,Win 20GB data and more at DSN Algorithm Chall...,https://bit.ly/dsnmlhack
4,311,14/04/2020,10:51,Olayinka,Topic: Julius Abudu's Zoom Meeting Time: Apr ...,https://us04web.zoom.us/j/6350982607


In [29]:
data.to_csv("results.csv")