# Imports

In [109]:
import numpy as np
import pandas as pd 
import re
import dateutil
import matplotlib.pyplot as plt
import seaborn as sb
from collections import Counter

# Preparing The Data
* Read group chat text file
* Filter the data with regular expression
* Organize data into rows and columns
* Remove Irrelevant data e.g &lt;Media omitted&gt;
* Save data as a csv file

## Read group chat text file

In [None]:
with open('WhatsApp Chat with YABATECH 201819 ASPIRANT.txt','r',encoding='utf8') as file:
    data_file=file.read()
    data_file
    print(data_file)

## Filter the data with regular expression
#### The regular expression will filter the chats based on the four different patterns below
    
* Pattern 1 - (Three Names)
   * 7/26/18, 5:46 PM - **YTC David Kenny**: Hello house
* Pattern 2 - (Two Names)
   * 7/26/18, 10:17 PM - **YTC Sunshine**: U cam. Just call me Stephanie
* Pattern 3 - (Two Names preceded by ~)
   * 7/26/18, 10:16 PM - **~El Nuru**: Okay we hear... Madam Stephanie
* Pattern 3 - (Single Name)
   * 7/31/18, 4:43 PM - **Thuolu**: I chosed Yabatech as my 2nd Choice 

In [90]:

data_found = re.findall('(\d+/\d+/\d+),\s+(\d+\:\d+\s+\w+)\s+\-\s(\w+\s\w+\s\w+|\w+\s\w+|\~\w+\s\w+|\w+)\:(.*)', data_file)
data_found

[('7/24/18', '7:53 AM', 'YTC Olushayo', ' No'),
 ('7/24/18',
  '8:46 AM',
  'YTC Prince Oluwafe',
  ' A student failed JAMB 5 tyms. 1 day, she travelled 2 visit her frnd in UNILORIN, she fell sick & was admitted to a hospital there. She later called her mum & said... '),
 ('7/24/18',
  '10:04 AM',
  'YTC Firsiryour',
  ' This gives the same as dividing ur score by 8'),
 ('7/24/18', '10:19 AM', 'YTC Firsiryour', ' U do maths for school'),
 ('7/24/18', '10:20 AM', 'YTC Firsiryour', ' <Media omitted>'),
 ('7/24/18', '10:20 AM', 'YTC Firsiryour', ' <Media omitted>'),
 ('7/24/18',
  '12:43 PM',
  'YTC Daniel',
  ' Pls add this number 09021517548 to the group.'),
 ('7/24/18', '12:46 PM', 'YTC Daniel', ' Admins abeg!!!'),
 ('7/24/18',
  '12:50 PM',
  'YabaTech Informat',
  ' https://chat.whatsapp.com/2XsUCz4fvuhINIHpSq4vXc'),
 ('7/24/18', '1:19 PM', 'YTC Daniel', ' Gracias amigos.👍🏿'),
 ('7/24/18', '1:47 PM', 'YTC Temitope', ' Waiting for this message'),
 ('7/24/18', '2:20 PM', 'YTC Emmanuel'

In [103]:
#Trying to also filter unsaved numbers
num_data_found = re.findall('(\d+/\d+/\d+),\s+(\d+\:\d+\s+\w+)\s+\-\s(\+\d+\s\d+\s\d+\s\d+)\:(.*)', data_file)
num_data_found
#7/27/18, 11:42 AM - ‪+234 809 131 8973‬: Yes

[]

## Organize data into rows and columns

In [107]:
#Converting data_found to pandas dataframe
df=pd.DataFrame(data_found,columns=['Date','Time','Sender','Message'])
df.head()

Unnamed: 0,Date,Time,Sender,Message
0,7/24/18,7:53 AM,YTC Olushayo,No
1,7/24/18,8:46 AM,YTC Prince Oluwafe,"A student failed JAMB 5 tyms. 1 day, she trav..."
2,7/24/18,10:04 AM,YTC Firsiryour,This gives the same as dividing ur score by 8
3,7/24/18,10:19 AM,YTC Firsiryour,U do maths for school
4,7/24/18,10:20 AM,YTC Firsiryour,<Media omitted>


## Remove Irrelevant data e.g <Media omitted>

In [106]:
df.describe()

Unnamed: 0,Date,Time,Sender,Message
count,3696,3696,3696,3696
unique,29,981,87,3072
top,8/9/18,10:28 PM,YTC Ifeanyi,Lol
freq,368,17,254,96


In [108]:
#removing all columns where Media was omitted '*space*<Media omitted>'
df=df[df['Message']!=' <Media omitted>']
#save chat data to csv file
df.describe()

Unnamed: 0,Date,Time,Sender,Message
count,3696,3696,3696,3696
unique,29,981,87,3072
top,8/9/18,10:28 PM,YTC Ifeanyi,Lol
freq,368,17,254,96


## Save data as a csv file

In [None]:
df.to_csv('yabatech_whatsapp_group_chat_data.csv',index=False)

# Data Analysis and Visualization

* Analysis
* Visualization

## Analysis
* Find Most Active members
* Find Messages sent on the most active day (8/2/18)

In [95]:
#Most active people in the group
df['Sender'].value_counts().head(10)

YTC Ifeanyi          254
YabaTech Informat    245
YTC Icekay           195
YTC Emmanuel         168
YTC Furzito          163
YTC Kef              160
YTC Bobby Fresh      153
YTC Ayoola           140
YTC b_b              114
YTC Sunshine         113
Name: Sender, dtype: int64

In [96]:
#Display all the messages sent at this Date 8/2/18
same_date=df[df['Date']=='8/2/18']
same_date.sample(15)

Unnamed: 0,Date,Time,Sender,Message
694,8/2/18,12:21 PM,YTC Ayoola,I don vote for her if she enter Make she no r...
675,8/2/18,11:58 AM,YTC Jeff,Okay
687,8/2/18,12:17 PM,YTC Emmanuel2,Can Sumone check the portal
727,8/2/18,11:39 PM,YTC Bobby Fresh,Any news?
689,8/2/18,12:18 PM,YTC Firsiryour,How sure
674,8/2/18,11:57 AM,YabaTech Informat,D form z not out yet o
707,8/2/18,3:47 PM,YTC Khinde,Hi
704,8/2/18,12:27 PM,YTC Ifeanyi,Its not out yet madam
692,8/2/18,12:19 PM,YTC Ifeanyi,Nothing is out stop spreading nonsense na😡
683,8/2/18,12:06 PM,YTC Slay Designs,where is it


In [102]:
#Display Chat data sent of the user '~El Nuru'
specific_user=df[df['Sender']=='~El Nuru']
specific_user

Unnamed: 0,Date,Time,Sender,Message
90,7/26/18,5:59 PM,~El Nuru,"How can a polytechnic be first choice, the fo..."
112,7/26/18,10:10 PM,~El Nuru,"That means we just buy the form, sit at home ..."
119,7/26/18,10:12 PM,~El Nuru,Who told you he is a girl
135,7/26/18,10:16 PM,~El Nuru,Okay we hear... Madam Stephanie
138,7/26/18,10:17 PM,~El Nuru,"That means after we get the form online, no n..."
140,7/26/18,10:18 PM,~El Nuru,"Okay Madam,i will call you Stephanie"
166,7/27/18,10:43 AM,~El Nuru,😡😡😡
234,7/27/18,9:56 PM,~El Nuru,😭😭😭😭
441,7/28/18,5:20 PM,~El Nuru,How does on apply for direct entry to univers...
527,7/30/18,10:58 PM,~El Nuru,Screening form never still come outside?


In [83]:
#Display members that used Lol abbrevation
lol_sayers=df[df['Message']==' Lol']
lol_sayers

Unnamed: 0,Date,Time,Sender,Message
42,7/25/18,3:14 PM,YTC Firsiryour,Lol
55,7/25/18,9:40 PM,YTC Furzito,Lol
255,7/27/18,10:19 PM,YTC Samuel,Lol
263,7/27/18,10:21 PM,YTC Ifeanyi,Lol
277,7/27/18,10:25 PM,YTC Furzito,Lol
278,7/27/18,10:25 PM,YTC Ifeanyi,Lol
296,7/27/18,10:30 PM,YTC Ifeanyi,Lol
309,7/27/18,10:34 PM,YTC Ifeanyi,Lol
325,7/27/18,10:45 PM,YTC Temitope,Lol
329,7/27/18,10:48 PM,YTC Temitope,Lol


In [18]:
#Members that used 'Lol' the most
lol_sayers['Sender'].value_counts().head()

YTC Icekay         21
YTC Furzito        11
YTC Bobby Fresh    10
YTC Ifeanyi         9
YTC Ayoola          8
Name: Sender, dtype: int64

In [24]:
#Check the day 'Lol' abbrevation was used most
lol_sayers['Date'].value_counts().head()

7/27/18    19
8/7/18     16
8/15/18    11
8/8/18      6
8/19/18     6
Name: Date, dtype: int64

In [32]:
#Display 30 most repeated Messages
df['Message'].value_counts().head(30)

 Lol                                96
 Ok                                 31
 Yes                                25
 Yh                                 23
 No                                 19
 This message was deleted           17
 Okay                               14
 😂                                  14
 *Free 32Gb Internet Data Pack*     13
 Hi                                 12
 🙄                                  12
 Yeah                               11
 *💯WON💯WON💯*                        11
 Hello                              10
 Nop                                10
 Abi na                              9
 😏                                   8
 Thanks                              8
 3500                                8
 OK                                  8
 Yea                                 8
 Good morning                        8
 😂😂😂                                 8
 Really                              7
 Hmmmmm                              7
 Morning                 

In [48]:
#Display people that deleted a message
deleted_msg=df[df['Message']==' This message was deleted']
deleted_msg

Unnamed: 0,Date,Time,Sender,Message
361,7/27/18,10:58 PM,YTC Temitope,This message was deleted
686,8/2/18,12:24 PM,YTC Emmanuel2,This message was deleted
1184,8/7/18,3:32 PM,YTC Emmanuel2,This message was deleted
1185,8/7/18,3:32 PM,YTC Emmanuel2,This message was deleted
1647,8/9/18,9:24 AM,YTC Taymi,This message was deleted
2251,8/11/18,7:53 PM,YTC Kvng Vincent,This message was deleted
2418,8/13/18,6:45 PM,YTC Imma,This message was deleted
2598,8/14/18,10:28 PM,YabaTech Informat,This message was deleted
2768,8/15/18,10:36 AM,YTC Hafuez,This message was deleted
2971,8/16/18,6:36 AM,YTC Ifeanyi,This message was deleted


In [58]:
#Display people that deleted a message
spam_msg=df[df['Message']==' *Free 32Gb Internet Data Pack* ']
spam_msg['Sender']

3152       YTC Dayz
3154    YTC Meeksax
3155    YTC Meeksax
3156    YTC Meeksax
3157    YTC Meeksax
3158    YTC Meeksax
3159    YTC Meeksax
3160    YTC Meeksax
3161    YTC Meeksax
3162    YTC Meeksax
3163    YTC Meeksax
3164    YTC Meeksax
3165    YTC Meeksax
Name: Sender, dtype: object