# Dataset Extraction

Importing the required libraries

In [1]:
import re
import pandas as pd

Checking each line if it starts with date and time to identify each unique message in the text file

In [2]:
def rawToDf(file, key):
#     Converts raw .txt file into a Data Frame
    
    split_formats = {
        '12hr' : '\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s',
        '24hr' : '\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s',
        'custom' : ''
    }
    datetime_formats = {
        '12hr' : '%d/%m/%Y, %I:%M %p - ',
        '24hr' : '%d/%m/%Y, %H:%M - ',
        'custom': ''
    }
    
    with open(file, 'r', encoding='utf-8') as raw_data:
        # print(raw_data.read())
        raw_string = ' '.join(raw_data.read().split('\n')) # converting the list split by newline char. as one whole string as there can be multi-line messages
        user_msg = re.split(split_formats[key], raw_string) [1:] # splits at all the date-time pattern, resulting in list of all the messages with user names
        date_time = re.findall(split_formats[key], raw_string) # finds all the date-time patterns
        
        df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg}) # exporting it to a df
        
    # converting date-time pattern which is of type String to type datetime,
    # format is to be specified for the whole string where the placeholders are extracted by the method 
    df['date_time'] = pd.to_datetime(df['date_time'], format=datetime_formats[key])
    
    # split user and msg 
    usernames = []
    msgs = []
    for i in df['user_msg']:
        a = re.split('([\w\W]+?):\s', i) # lazy pattern match to first {user_name}: pattern and spliting it aka each msg from a user
        if(a[1:]): # user typed messages
            usernames.append(a[1])
            msgs.append(a[2])
        else: # other notifications in the group(eg: someone was added, some left ...)
            usernames.append("group_notification")
            msgs.append(a[0])
            
    
    # creating new columns         
    df['user'] = usernames
    df['message'] = msgs

    # dropping the old user_msg col.
    df.drop('user_msg', axis=1, inplace=True)
    
    return df

In [3]:
df = rawToDf('chat.txt', '12hr')

In [4]:
df.head()

Unnamed: 0,date_time,user,message
0,2020-01-26 16:19:00,group_notification,Messages and calls are end-to-end encrypted. N...
1,2020-01-24 20:25:00,group_notification,"Tanay Kamath (TSEC, CS) created group ""CODERS👨..."
2,2020-01-26 16:19:00,group_notification,You joined using this group's invite link
3,2020-01-26 16:20:00,group_notification,+91 99871 38558 joined using this group's invi...
4,2020-01-26 16:20:00,group_notification,+91 91680 38866 joined using this group's invi...


In [5]:
# checking out number of unique authors of the messages
df['user'].unique()

array(['group_notification', '+91 96536 93868',
       'Dheeraj Lalwani (TSEC, CS)', '+91 99201 75875', '+91 95949 08570',
       '+91 79778 76844', '+91 90499 38860', 'Tanay Kamath (TSEC, CS)',
       'Saket (TSEC, CS)', '+91 77568 95072', 'Rohit Pathak (TSEC, CS)',
       '+91 75078 05454', 'Darshan Rander (TSEC, IT)', '+91 79774 68083',
       '+91 70394 60876', '+91 96191 55044', '+91 90678 93300',
       'Mohit Varma (TSEC, CS)', '+91 79770 56210',
       'Chirag Sharma (TSEC, CS)', 'Vivek Iyer (TSEC, Biomed)',
       'Tushar Nankani', '+91 81696 22410', '+91 89764 07509',
       '+91 78758 66747', 'Ankit (TSEC, CS)', '+91 86556 33169',
       '+91 76663 28147', '+91 88284 70904', '+91 97698 67348',
       'Vivek (TSEC, CS)', 'Hardik Raheja (TSEC, CS)', '+91 91680 38866',
       'Pranay Thakur (TSEC, CS)', 'Mittul Dasani (TSEC, CS)',
       'Kartik Soneji (TSEC, CS)', '+91 77180 43697', '+91 99676 84479',
       'Shreya (TSEC, IT)', '+91 96190 16721', '+91 89833 85127',
       '+9

In [6]:
# checking out random 10 samples from the dataset
df.sample(10)

Unnamed: 0,date_time,user,message
2884,2020-03-08 21:09:00,"Tanay Kamath (TSEC, CS)",Oh
9768,2020-08-11 22:12:00,"Chirag Sharma (TSEC, CS)",Wait I am confused. What I have seen mostly ar...
1735,2020-02-22 20:22:00,"Saurav Upoor (TSEC CS, SE)",
11225,2020-09-03 15:00:00,"Harsh Kapadia (TSEC IT, SE)",This message was deleted
10159,2020-08-20 16:59:00,"Pratik K (TSEC CS, SE)",🤣🤣🤣
782,2020-02-13 12:50:00,group_notification,"Saket (TSEC, CS) added +91 90821 58843"
10406,2020-08-24 15:14:00,+91 84335 18102,<Media omitted>
226,2020-01-28 10:53:00,+91 79770 56210,This'll also do for that hollow diamond prog i...
403,2020-01-29 22:54:00,"Dheeraj Lalwani (TSEC, CS)",😅
7257,2020-06-10 18:22:00,"Darshan Rander (TSEC, IT)",I'm not really good md😅


In [7]:
# checking for null data
df.isna().sum()

date_time    0
user         0
message      0
dtype: int64

## loading the cleaned dataset into the csv file

In [8]:
df.to_csv('Whatsapp_Chat_Table.csv')

In [9]:
df.head()

Unnamed: 0,date_time,user,message
0,2020-01-26 16:19:00,group_notification,Messages and calls are end-to-end encrypted. N...
1,2020-01-24 20:25:00,group_notification,"Tanay Kamath (TSEC, CS) created group ""CODERS👨..."
2,2020-01-26 16:19:00,group_notification,You joined using this group's invite link
3,2020-01-26 16:20:00,group_notification,+91 99871 38558 joined using this group's invi...
4,2020-01-26 16:20:00,group_notification,+91 91680 38866 joined using this group's invi...


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13655 entries, 0 to 13654
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date_time  13655 non-null  datetime64[ns]
 1   user       13655 non-null  object        
 2   message    13655 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 320.2+ KB


In [11]:
row, col = df.shape

In [12]:
f"No of Rows :{row} and Columns = {col}"

'No of Rows :13655 and Columns = 3'

In [13]:
df['date_time'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 13655 entries, 0 to 13654
Series name: date_time
Non-Null Count  Dtype         
--------------  -----         
13655 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 106.8 KB
