In [15]:
import os, re
import glob
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
NET_TYPE = '4G' #To decide which 3GPP files we are working on. Change this to '4G' if you want to process 4G files.

In [17]:
class Preprocessor:
    def __init__(self, output_path):
        self.output_path = output_path
        self.output_file = None
        self.line_count = 0
        self.file_count = 0
        self.nas_end_idx = 0
        
    def find_sections(self, line):
        section_header = re.compile(r'\d(\.\d+)+[A-Za-z]*') #Find if the line is of pattern 3.4 or 5.7.1 or 4.6.7a, etc.
        if section_header.match(line) is not None:
            #print(line)
            return True

        
    def processAll(self, input_path):
        #input_path is a directory containing raw data
        
        self.file_count = 0
        output_file_path = os.path.join(self.output_path,f'conflict_segments_{NET_TYPE}.txt') #these segments will be paired later to make
                                                                                  #the final dataset 
        
        if os.path.exists(output_file_path):
            with open(output_file_path, 'r') as fp:
                self.line_count = len(fp.readlines()) #To get the next sequence number, as we are appending
            
        self.output_file = open(output_file_path,'w')
        f_count = 0
        for file in glob.glob(input_path + '/**/*.txt', recursive=True):
            print(os.path.basename(file))
            #if file.endswith(".txt"):
            file_path = os.path.join(input_path,os.path.basename(file))
            print(file_path)
            self.processIt(file_path, [1])
            if f_count == 0:
                self.output_file.write('---------File Ends-------\n')
            f_count += 1
        self.output_file.close()
    
    def processIt(self, file, task:list):
        
        self.file_count += 1
        file_line_count = 0
        file_section_count = -1
        #file: path to a file
        with open(file) as f:
            text = f.readlines()
            for line in text:
                #print(line)
                if self.find_sections(line):
                    self.output_file.write(".\n------\n")
                    file_section_count += 1
                    #new_section = True
                    continue
                if len(line.split()) < 4 : #skip the line
                    continue
                #if '((' or '))' or ':=' or '::' or '{{' or '}}' or '[[' or ']]' in line:
                #    continue
                if 'Editor\'s Note' in line:
                    continue
                line = re.sub(r' +', ' ', line) #multiple whitespace, keep one
                line = re.sub(r'(\n)+', '', line) #remove extra newlines
                line = re.sub(r'^[\.·\-]', '', line) #starting dot, interpunct, hyphen removal
                line = re.sub(r'^ ', '', line) #starting whitespace removal
                line = re.sub(r'[\.:,;\-]*$', '', line) #one or multiple dot, colon, hyphen, semicolon removal at the end
                line = re.sub(r'(\( )', '\(', line) #whitespace after opening paren.
                line = re.sub(r'( \))', '\)', line) #whitespace before closing paren.
                line = re.sub(r'(\(\))|(\[\])|(\{\})', '', line) #remove empty paren, curly-braces, brackets
                line = re.sub(r'(as shown below|[Ss]ee figure below):*\-*', '', line) #remove certain strings
                line = re.sub(r'([,;:])(\w)', r'\1 \2', line) #insert whitespace after punctuations, except fullstop, underscore
                                                              #hyphen
                line = re.sub(r'\ue000', '', line, re.UNICODE)
                #line = line + '.'
                
                if line != '' and line != '.':
                    line_set = line.split('. ') #Some lines still have multiple sentences
                    for i, sentence in enumerate(line_set):
                        self.line_count += 1
                        file_line_count += 1
                        if i != len(line_set)-1:
                            #self.output_file.write(str(self.line_count) + ', $' + sentence + '.$\n')
                            self.output_file.write(sentence + '.\n')
                        else:
                            #self.output_file.write(str(self.line_count) + ', $'+ sentence+'$' + '\n')
                            self.output_file.write(sentence+ " ")
                    #print('check')
                #end of document
            self.output_file.write('\n')
            print(str(self.file_count)+ " file(s) completed. " + str(file_section_count) + " sections added. " +  str(file_line_count) + " lines added.")
            
            #print("hey", text)
            print("\n---------------------------------------------------------------------------------------------\n")
        #task is a list of preprocessing tasks you want to carry on
        #############add code for getlines###########
    
    def pairUp(self, file, num_sentences_per_article = 3):
        corpus = []
        data_string = ""
        with open(file) as f:
            text = f.readlines()
            print(len(text))
            num_lines = 0
            for line in text:
                if "---------File Ends-------" in line:
                    self.nas_end_idx = len(corpus) #To check where the first file ends. Helps for disjoint dataset building
                    continue
                if line == '.\n' or line == '.':
                    continue
                if line == '\n':
                    continue
                if line == "------\n":
                    #print('yo')
                    if data_string != "":
                        corpus.append(data_string)
                        num_lines = 0
                    data_string = ""
                else:
                    data_string += line
                    num_lines += 1
                if num_lines == num_sentences_per_article or len(data_string.split()) > 50:
                    corpus.append(data_string)
                    data_string = ""
                    num_lines = 0
            if data_string != "": #The last one
                corpus.append(data_string)
                
        #print(f'Corpus Length in number of sequences: {len(corpus)}')    
        vect = TfidfVectorizer(min_df=1, stop_words="english")                                                                                                                                                                                                   
        tfidf = vect.fit_transform(corpus)
        #print("Shape of array: ",tfidf.shape)
        pairwise_similarity = tfidf * tfidf.T
        #print(pairwise_similarity)
        return corpus, pairwise_similarity



In [18]:
input_path = f"./Data/Raw_{NET_TYPE}" #all inputs

output_path = f"./Data/Processed_{NET_TYPE}"
os.makedirs(output_path, exist_ok=True)


In [12]:
dataProcessor = Preprocessor(output_path)
dataProcessor.processAll(input_path)
print(dataProcessor.nas_end_idx)

Security_4G.txt
./Data/Raw_4G/Security_4G.txt
1 file(s) completed. 120 sections added. 6416 lines added.

---------------------------------------------------------------------------------------------

NAS_4G.txt
./Data/Raw_4G/NAS_4G.txt
2 file(s) completed. 890 sections added. 8475 lines added.

---------------------------------------------------------------------------------------------

0


In [13]:
corpus, pair_sim = dataProcessor.pairUp(os.path.join(output_path,f"conflict_segments_{NET_TYPE}.txt"))
print(dataProcessor.nas_end_idx)

5323
720


In [14]:
import pickle
with open(f"./Data/cp_corpus_{NET_TYPE}.txt", "wb") as fp:
   pickle.dump(corpus, fp)