In [1]:
import pandas as pd
import numpy as np
import re
import csv

## Replacing some words
#### Ex: '@', 'at' and 'after' are all used for the same purpose, e.g. @6:30 / at 6:30 / after 6:30. So 'at' and 'after' are replaced with '@'.

In [2]:
# Normalise the raw broadcast log so that every broadcast line carries the
# same markers: '@' before the time, 'PM,' after it and '~' after the
# broadcaster name. The result is written to a new file; the raw file is
# left untouched.

# Read the whole file

with open('rmk_broadcast.txt', 'r') as f:
    filedata = f.read()

# Replace some words in the text file.
# Only whole words are replaced: plain str.replace() also rewrote the same
# letters inside longer words (' attend' -> ' @tend', 'afternoon' -> ' @noon',
# 'shipment' -> 'shiPM,ent') and corrupted the message text.

# ' at' -> ' @' (matches "at 6:30" and "at6:30", but not " attend")
filedata = re.sub(r' at(?![A-Za-z])', ' @', filedata)

# standalone 'after' -> ' @' (matches "after 6:30", but not "afternoon")
filedata = re.sub(r'(?<![A-Za-z])after(?![A-Za-z])', ' @', filedata)

# standalone 'pm' or 'pm,' -> 'PM,' (an existing comma is not duplicated)
filedata = re.sub(r'(?<![A-Za-z])pm(?![A-Za-z]),?', 'PM,', filedata)

# Every spelling of the broadcaster prefix ends up as 'nusrat ~ ', so '~'
# can later be used to split the header from the message.
for prefix in ('nusrat:', 'nusrat :', 'nusrat - '):
    filedata = filedata.replace(prefix, 'nusrat ~ ')


# Creating a new text file which reflects the changes

with open('rmk_broadcast_edited.txt', 'w') as file:
    file.write(filedata)

## Count frequency of some symbols
#### Ex: These symbols are needed for splitting a line, so we need to ensure that every line has the necessary symbols.

In [3]:
# Count the marker symbols in the edited file and report lines that break
# the expected format (same number of '~' and '@' markers on a line).
tilde_count = 0
at_count = 0
comma_count = 0

with open('rmk_broadcast_edited.txt','r') as f:
    lines = f.readlines() # returns a list where each element is a line of the file

for line in lines:
    words = line.split()

    # Count per line first, then add to the running totals.
    line_tildes = sum(1 for word in words if word == '~')
    line_ats = sum(1 for word in words if word.startswith('@'))    # ex: @ or @6pm
    line_commas = sum(1 for word in words if word.endswith(','))

    tilde_count += line_tildes
    at_count += line_ats
    comma_count += line_commas

    # Check which line breaks the format.
    # The comparison uses this line's own counts: comparing the running
    # totals would also flag every line that follows a single bad line.
    if line_tildes != line_ats:
        print(line_tildes)
        print(line_ats)
        print(line)

print("Tilde counts are "+ str(tilde_count) )
print("@ counts are "+ str(at_count))
print("comma counts are "+ str(comma_count))

Tilde counts are 139
@ counts are 139
comma counts are 399


In [4]:
# Column containers for the dataset. Each name is bound to its own,
# independent empty list.

# Values read from the date line
day, date, year = [], [], []

# Values read from the header of the broadcast line
bc_time, broadcaster = [], []

# Values derived from the broadcast message text
micro_time, is_sharp = [], []

## Load the text file

In [5]:
# Read the edited log as a list of stripped, non-empty lines.
# A plain file read is used instead of np.genfromtxt: genfromtxt treats '#'
# as a comment marker by default and would silently cut a message at the
# first '#', and it returns a 0-d array (no len()) for a one-line file.
with open('rmk_broadcast_edited.txt', 'r') as f:
    list_of_lines = [line.strip() for line in f if line.strip()]
print("Length is " + str(len(list_of_lines))) # If not an even number then wrong format



Length is 278


## Loading the necessary data into separate lists

In [6]:
# Fill the column lists from list_of_lines, which alternates between a date
# line (even index) and a broadcast line (odd index).

# Start from empty lists so that re-running this cell does not append the
# same rows a second time.
for column in (day, date, year, bc_time, broadcaster, micro_time, is_sharp):
    column.clear()

for line_no, line in enumerate(list_of_lines):

    if line_no % 2 == 0:

        # Monday, January 2, 2017

        date_parts = line.split(sep=',')  # [weekday, "month day", year]

        day.append(date_parts[0].strip())
        date.append(date_parts[1].strip())
        year.append(date_parts[2].strip())
        continue

    # (16:59:39) nusrat ~ Broadcast: RMT @6:30PM, In Sha Allah.

    # split into 2 parts [ex: "(16:59:39) nusrat " & " Broadcast: RMT @6:30PM, In Sha Allah."]
    parts = line.split(sep='~', maxsplit=1)
    header = parts[0]
    message = parts[1]

    # 1st part manipulation [ex: (16:59:39) nusrat]
    header_words = header.split(sep=' ')
    timestamp = header_words[0].strip()

    # "(16:59:39)" -> "16:59": drop the opening bracket and the ":39)" tail
    bc_time.append(timestamp[1:-4])
    broadcaster.append(header_words[1].strip())

    # 2nd part: the first one or two numbers in the message form the time.
    # Raw string, because "\d" is an invalid escape in a normal string literal.
    numbers = re.findall(r"\d+", message)

    if len(numbers) == 0:
        micro_time.append(-1)  # no clock time given, ex: after magrib
    elif len(numbers) == 1:
        micro_time.append(numbers[0] + ":00")  # Ex: 6pm
    else:
        micro_time.append(numbers[0] + ":" + numbers[1])  # Ex: 5:30pm

    # Flag messages that contain the word 'sharp' (case-insensitive)
    is_sharp.append(1 if 'sharp' in message.lower() else 0)
    

In [7]:
# Print the length of every column list, one per line, in column order.
# All seven numbers should be equal.
for column in (day, date, year, bc_time, broadcaster, micro_time, is_sharp):
    print(len(column))

139
139
139
139
139
139
139


In [8]:
# Print every parsed record on one line, followed by a blank line.
field_sep = "  ||  "

for idx, day_name in enumerate(day):
    fields = [
        "Day: " + str(day_name),
        "Date: " + str(date[idx]),
        "Year: " + str(year[idx]),
        "Broadcast Time: " + str(bc_time[idx]),
        "Broadcaster : " + str(broadcaster[idx]),
        "Micro Time : " + str(micro_time[idx]),
        "Is Sharp ? : " + str(is_sharp[idx]),
    ]
    print(field_sep.join(fields))
    print()

Day: Monday  ||  Date: January 2  ||  Year: 2017  ||  Broadcast Time: 16:59  ||  Broadcaster : nusrat  ||  Micro Time : 6:30  ||  Is Sharp ? : 0

Day: Tuesday  ||  Date: January 3  ||  Year: 2017  ||  Broadcast Time: 16:44  ||  Broadcaster : nusrat  ||  Micro Time : 5:30  ||  Is Sharp ? : 0

Day: Sunday  ||  Date: January 8  ||  Year: 2017  ||  Broadcast Time: 16:54  ||  Broadcaster : nusrat  ||  Micro Time : 5:30  ||  Is Sharp ? : 1

Day: Monday  ||  Date: January 9  ||  Year: 2017  ||  Broadcast Time: 16:38  ||  Broadcaster : nusrat  ||  Micro Time : 6:00  ||  Is Sharp ? : 0

Day: Wednesday  ||  Date: January 11  ||  Year: 2017  ||  Broadcast Time: 16:26  ||  Broadcaster : nusrat  ||  Micro Time : 5:00  ||  Is Sharp ? : 0

Day: Thursday  ||  Date: January 12  ||  Year: 2017  ||  Broadcast Time: 16:33  ||  Broadcaster : nusrat  ||  Micro Time : 5:00  ||  Is Sharp ? : 0

Day: Sunday  ||  Date: January 15  ||  Year: 2017  ||  Broadcast Time: 16:41  ||  Broadcaster : nusrat  ||  Micro Ti

# Creating a CSV file from these lists

In [9]:
# Write the parsed columns to rmk_dataset.csv: a header row followed by one
# row per entry in `day`.
columns = ['Day', 'Date', 'Year', 'Broadcast_time', 'Broadcaster', 'Micro_time', 'Is_sharp_time?']

# Source list for each header, in the same order as `columns`
column_values = [day, date, year, bc_time, broadcaster, micro_time, is_sharp]

with open('rmk_dataset.csv', 'w', newline='') as file:

    the_writer = csv.DictWriter(file, fieldnames = columns)
    the_writer.writeheader()

    for i in range(len(day)):
        record = {header: values[i] for header, values in zip(columns, column_values)}
        the_writer.writerow(record)
    