# For IMDb dataset

In [1]:
import os
import urllib.request
import zipfile
import pandas as pd

# Define URLs and target directory
imdb_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
target_dir = "imdb_data"

def download_and_extract(url, target_dir):
    os.makedirs(target_dir, exist_ok=True)
    tar_path = os.path.join(target_dir, "aclImdb_v1.tar.gz")

    print("Downloading IMDb dataset...")
    urllib.request.urlretrieve(url, tar_path)

    print("Extracting IMDb dataset...")
    import tarfile
    with tarfile.open(tar_path, "r:gz") as tar_ref:
        tar_ref.extractall(target_dir)

    print("IMDb dataset downloaded and extracted!")

# Function to convert IMDB reviews to CSV
def convert_imdb_to_csv(input_dir, output_csv):
    data = []
    for label in ["pos", "neg"]:
        folder = os.path.join(input_dir, label)
        for file in os.listdir(folder):
            with open(os.path.join(folder, file), "r", encoding="utf-8") as f:
                review = f.read().strip()
                data.append([review, 1 if label == "pos" else 0])

    df = pd.DataFrame(data, columns=["reviews", "labels"])
    df.to_csv(output_csv, index=False)
    print(f"Saved {output_csv}")

# Download and extract the dataset
download_and_extract(imdb_url, target_dir)

# Convert train and test sets to CSV
convert_imdb_to_csv(os.path.join(target_dir, "aclImdb/train"), os.path.join(target_dir, "train.csv"))
convert_imdb_to_csv(os.path.join(target_dir, "aclImdb/test"), os.path.join(target_dir, "test.csv"))

print("IMDb dataset successfully converted to CSV format!")

Downloading IMDb dataset...
Extracting IMDb dataset...


  tar_ref.extractall(target_dir)


IMDb dataset downloaded and extracted!
Saved imdb_data/train.csv
Saved imdb_data/test.csv
IMDb dataset successfully converted to CSV format!


# For QNLI dataset

In [4]:
import os
import urllib.request
import zipfile

# Define URLs and target directory
qnli_url = "https://dl.fbaipublicfiles.com/glue/data/QNLI.zip"
target_dir = "qnli_data"

# Create target directory if it doesn't exist
os.makedirs(target_dir, exist_ok=True)
zip_path = os.path.join(target_dir, "QNLI.zip")

# Download the QNLI dataset
print("Downloading QNLI dataset...")
urllib.request.urlretrieve(qnli_url, zip_path)
print("Download complete!")

# Extract the dataset
print("Extracting QNLI dataset...")
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(target_dir)
print("Extraction complete!")

# List extracted files
extracted_files = os.listdir(target_dir)
print(f"Extracted files: {extracted_files}")

# Ensure TSV files are saved
tsv_files = [f for f in extracted_files if f.endswith(".tsv")]
if tsv_files:
    print("TSV files successfully saved in:", target_dir)
else:
    print("No TSV files found! Check extraction path.")


Downloading QNLI dataset...
Download complete!
Extracting QNLI dataset...
Extraction complete!
Extracted files: ['QNLI.zip', 'QNLI']
No TSV files found! Check extraction path.


# For SST-2 Dataset

In [3]:
import os
import urllib.request
import zipfile
import pandas as pd

# Define URLs and target directory
sst2_url = "https://dl.fbaipublicfiles.com/glue/data/SST-2.zip"
target_dir = "sst2_data"

# Download the SST-2 dataset
os.makedirs(target_dir, exist_ok=True)
zip_path = os.path.join(target_dir, "SST-2.zip")
urllib.request.urlretrieve(sst2_url, zip_path)

# Extract the dataset
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(target_dir)

print("SST-2 dataset downloaded and extracted!")

# Convert TSV to CSV
def convert_tsv_to_csv(tsv_path, csv_path):
    df = pd.read_csv(tsv_path, delimiter="\t")
    df.columns = ["reviews", "labels"]  # Rename columns
    df.to_csv(csv_path, index=False)
    print(f"Converted {tsv_path} to {csv_path}")

# Convert train, dev, and test sets
convert_tsv_to_csv(os.path.join(target_dir, "SST-2/train.tsv"), os.path.join(target_dir, "train.csv"))
convert_tsv_to_csv(os.path.join(target_dir, "SST-2/dev.tsv"), os.path.join(target_dir, "dev.csv"))
convert_tsv_to_csv(os.path.join(target_dir, "SST-2/test.tsv"), os.path.join(target_dir, "test.csv"))

print("SST-2 dataset successfully converted to CSV format!")


SST-2 dataset downloaded and extracted!
Converted sst2_data/SST-2/train.tsv to sst2_data/train.csv
Converted sst2_data/SST-2/dev.tsv to sst2_data/dev.csv
Converted sst2_data/SST-2/test.tsv to sst2_data/test.csv
SST-2 dataset successfully converted to CSV format!
