In [1]:
import requests
from urllib.parse import urlparse, unquote
from tqdm.auto import tqdm
import pathlib
import functools
import shutil
import os
from pathlib import Path
import zipfile

### Download Methods


In [2]:
def get_filename(url):
    """Return the file name found in the last path segment of ``url``.

    The query string and fragment are ignored because only the parsed
    ``path`` component is inspected, and percent-escapes are decoded
    (for example ``%20`` becomes a space).

    Args:
        url: URL string to inspect.

    Returns:
        The decoded final path component, or an empty string when the
        URL path has no final component (e.g. it is empty or just ``/``).
    """
    url_parsed = urlparse(url)
    return unquote(Path(url_parsed.path).name)

def download_file(url, filename):
    """Stream ``url`` to ``filename`` with a progress bar and return the saved path.

    The body is first written to a sibling ``<name>.part`` file and only
    renamed to the final name once the copy has finished, so an interrupted
    download never leaves a truncated file at ``filename``.

    Args:
        url: Address to download from; redirects are followed.
        filename: Destination path. ``~`` is expanded and missing parent
            directories are created.

    Returns:
        pathlib.Path: The resolved absolute path of the downloaded file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx or 5xx status.
        RuntimeError: If the server answers with any other non-200 status.
    """
    path = Path(filename).expanduser().resolve()
    partial_path = path.with_name(path.name + ".part")

    # Using the response as a context manager releases the connection even
    # when the status check or the copy below raises.
    with requests.get(url, stream=True, allow_redirects=True) as r:
        if r.status_code != 200:
            r.raise_for_status()  # Raises for 4xx/5xx codes only, so...
            raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
        file_size = int(r.headers.get('Content-Length', 0))

        path.parent.mkdir(parents=True, exist_ok=True)

        desc = "(Unknown total file size)" if file_size == 0 else ""
        r.raw.read = functools.partial(r.raw.read, decode_content=True)  # Decompress if needed
        try:
            with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
                with partial_path.open("wb") as f:
                    shutil.copyfileobj(r_raw, f)
        except BaseException:
            # Also covers KeyboardInterrupt (kernel interrupt): drop the
            # incomplete file so it cannot be mistaken for a finished download.
            if partial_path.exists():
                partial_path.unlink()
            raise

    # Rename only after a complete copy; replaces any existing file at `path`.
    partial_path.replace(path)
    return path

### Extraction Method

In [3]:
def extract(zip_file_path, extract_folder):
    """Unpack every member of a zip archive, reporting progress per member.

    Args:
        zip_file_path: Path of the zip archive to open for reading.
        extract_folder: Directory the members are extracted into; it is
            formatted to a string before being handed to ``zipfile``.
    """
    destination = f"{extract_folder}"
    with zipfile.ZipFile(zip_file_path, "r") as archive:
        entries = archive.infolist()
        progress = tqdm(total=len(entries), desc="Extracting Files")
        with progress:
            for entry in entries:
                archive.extract(entry, destination)
                # One tick per archive member.
                progress.update(1)

### Check whether WikiText-TL-39 is already downloaded; if not, download and extract it
   

In [4]:
url = "https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/wikitext-tl-39/wikitext-tl-39.zip"

data_dir = "../wiki_tl"
filename = f"{data_dir}/{get_filename(url)}"

# Use existence checks rather than os.listdir(): listdir raises
# FileNotFoundError when the data directory has not been created yet,
# which is exactly the fresh-checkout case this cell has to handle.
has_extracted_folder = os.path.isdir(os.path.join(data_dir, "wikitext-tl-39"))
has_zipfile = os.path.isfile(filename)

if not has_extracted_folder:
    # Reuse an archive that is already on disk; download it only when missing.
    if not has_zipfile:
        download_file(url, filename)
    extract(filename, data_dir)
   

In [5]:
import os
import re

folder_path = "../wiki_tl/wikitext-tl-39"
# Tokens collected from every .txt file in the dataset folder
all_contents = []

# Matches a token made up entirely of non-word, non-whitespace characters.
special_only = re.compile(r'[^\w\s]+')

# Sort the listing: os.listdir() order is arbitrary, and the order decides
# how the files are concatenated in the output.
for txt_name in sorted(os.listdir(folder_path)):
    if not txt_name.endswith(".txt"):
        continue
    # Explicit encoding so the result does not depend on the platform default.
    with open(os.path.join(folder_path, txt_name), encoding="utf-8") as f:
        contents = f.read()
    # Split on whitespace, then drop empty strings (produced by leading or
    # trailing whitespace) and tokens that consist only of special
    # characters. fullmatch is used so that a word which merely starts with
    # a special character is kept.
    tokens = [
        token
        for token in re.split(r'\s+', contents)
        if token and not special_only.fullmatch(token)
    ]
    # Remove periods and commas attached to the ends of a token
    tokens = [token.strip(".,") for token in tokens]
    all_contents.extend(tokens)

# Save the tokens to a single space-separated file
with open("preprocessed_wiki_tl.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(all_contents))