In [None]:
%%time

import os
import tarfile
import shutil
import gzip
import json
import pandas as pd

# --- Configuration -----------------------------------------------------------
# Directory containing the .tar archives to process
tar_dir_path = 'tar_files/'

# Root directory under which each archive is temporarily unpacked
extract_path = '/twitter_data/'

# Directory where the per-archive CSV files are written
csv_dir_path = 'csv_files/'

# Case-insensitive substring a tweet's text must contain to be kept
KEYWORD = 'tennis'

# Archives that are deliberately left untouched by the loop below
SKIPPED_TARS = {'twitter-stream-20221106.tar'}

# Columns written to every CSV, in output order
TWEET_COLUMNS = ("created_at", "id_str", "lang", "text", "country")


def parse_tweet_line(line, keyword=KEYWORD):
    """Turn one line of a stream dump into an output row, or None to skip it.

    Parameters
    ----------
    line : str
        One line of a ``.json.gz`` file; expected to hold a single JSON value.
    keyword : str
        Substring that must occur (case-insensitively) in the tweet text.

    Returns
    -------
    dict or None
        A dict with the keys in ``TWEET_COLUMNS`` when the line is a non-retweet
        whose text contains ``keyword``; ``None`` for blank lines, for JSON
        values without a string ``"text"`` field, and for filtered-out tweets.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON (corrupt input is not hidden).
    """
    line = line.strip()
    if not line:
        return None

    record = json.loads(line)
    if not isinstance(record, dict):
        return None

    # Not every line is guaranteed to carry a "text" field; indexing it
    # directly would raise KeyError and abort the whole batch.
    text = record.get("text")
    if not isinstance(text, str):
        return None

    # Keep only tweets that mention the keyword and are not retweets.
    if keyword.lower() not in text.lower() or text.startswith('RT '):
        return None

    place = record.get("place")
    return {
        "created_at": record.get("created_at"),
        "id_str": record.get("id_str"),
        "lang": record.get("lang"),
        "text": text,
        "country": place.get("country") if isinstance(place, dict) else None,
    }


def collect_filtered_tweets(root_dir, keyword=KEYWORD):
    """Walk ``root_dir`` and return the rows kept from every ``.json.gz`` file.

    Directories and files are visited in sorted order so the resulting row
    order is the same on every run.
    """
    rows = []
    for current_dir, subdirs, filenames in os.walk(root_dir):
        subdirs.sort()  # in-place sort steers os.walk's traversal order
        for filename in sorted(filenames):
            if not filename.endswith('.json.gz'):
                continue
            gz_path = os.path.join(current_dir, filename)
            # Stream line by line instead of loading the whole decompressed
            # file; newline='\n' splits on '\n' only, like str.split('\n').
            with gzip.open(gz_path, 'rt', encoding='utf-8', newline='\n') as handle:
                for line in handle:
                    row = parse_tweet_line(line, keyword)
                    if row is not None:
                        rows.append(row)
    return rows


def process_tar_file(tar_path, extract_root, csv_dir, keyword=KEYWORD, remove_tar=True):
    """Filter one .tar archive into ``<csv_dir>/<archive name>.csv``.

    The archive is unpacked into ``<extract_root>/<archive name>``, scanned,
    and that directory (only that one) is removed again even if scanning fails.
    The .tar file is deleted only after its CSV has been written.

    Parameters
    ----------
    tar_path : str
        Path of the .tar archive.
    extract_root : str
        Directory under which the temporary extraction directory is created.
    csv_dir : str
        Directory receiving the CSV; created if missing.
    keyword : str
        Forwarded to ``parse_tweet_line``.
    remove_tar : bool
        Delete ``tar_path`` after a successful run (default, as before).

    Returns
    -------
    pandas.DataFrame
        The filtered rows that were written to the CSV.
    """
    archive_name = os.path.splitext(os.path.basename(tar_path))[0]
    extraction_dir = os.path.join(extract_root, archive_name)

    try:
        os.makedirs(extraction_dir, exist_ok=True)
        with tarfile.open(tar_path, 'r') as archive:
            # Archives come from outside this notebook: use the 'data'
            # extraction filter where the running Python provides it so members
            # cannot escape extraction_dir; older interpreters fall back to the
            # previous unfiltered behaviour.
            if hasattr(tarfile, 'data_filter'):
                archive.extractall(extraction_dir, filter='data')
            else:
                archive.extractall(extraction_dir)
        rows = collect_filtered_tweets(extraction_dir, keyword)
    finally:
        # Remove only this archive's directory, never the whole extract root.
        shutil.rmtree(extraction_dir, ignore_errors=True)

    frame = pd.DataFrame(rows, columns=list(TWEET_COLUMNS))

    os.makedirs(csv_dir, exist_ok=True)
    frame.to_csv(os.path.join(csv_dir, archive_name + '.csv'), index=False)

    if remove_tar:
        os.remove(tar_path)
    return frame


# --- Run ---------------------------------------------------------------------
if os.path.isdir(tar_dir_path):
    # Create the directory for the CSV files if it doesn't exist
    os.makedirs(csv_dir_path, exist_ok=True)

    # Loop through all the .tar files in the directory (sorted: deterministic)
    for tar_filename in sorted(os.listdir(tar_dir_path)):
        if not tar_filename.endswith('.tar') or tar_filename in SKIPPED_TARS:
            continue
        # `df` is left holding the rows of the most recently processed archive
        df = process_tar_file(
            os.path.join(tar_dir_path, tar_filename),
            extract_path,
            csv_dir_path,
            KEYWORD,
        )
else:
    print('Input directory {!r} not found; nothing to process.'.format(tar_dir_path))


In [23]:
# Display the DataFrame currently bound to `df` (bare last expression of the cell).
# NOTE(review): the saved output below includes an "Unnamed: 0" column, which the
# frame built in the extraction cell does not define — presumably `df` was
# re-read from a CSV in a cell not shown here; confirm before trusting this output.
df

Unnamed: 0,created_at,id_str,lang,text,country
0,Tue Nov 15 00:16:24 +0000 2022,1592310513031352321,en,@RTennison3 @DannyDeVito God is pro-choice did...,
1,Tue Nov 15 00:42:47 +0000 2022,1592317152614584321,en,@iga_swiatek @XiaomiPL @GrupaPZU @tecnifibre @...,
2,Tue Nov 15 00:49:53 +0000 2022,1592318939383574528,en,@TennisTV @DjokerNole Amazing!,
3,Tue Nov 15 00:54:19 +0000 2022,1592320055072722944,en,Steamboat Tennis and Pickleball Center invites...,
4,Tue Nov 15 00:55:46 +0000 2022,1592320419985608705,en,"Of oxygen outside means nothing in tennis,But ...",
...,...,...,...,...,...
165,Tue Nov 15 23:28:13 +0000 2022,1592660775155470338,en,#BET #INPLAY #LIVENOW #TABLETENNIS\n🏆 Table Te...,
166,Tue Nov 15 23:30:29 +0000 2022,1592661345597784064,fr,@MadameTennis La fille qui pense que la carriè...,
167,Tue Nov 15 23:36:59 +0000 2022,1592662981367955462,en,There's nothing like a nice tailored suit with...,
168,Tue Nov 15 23:43:37 +0000 2022,1592664650709348353,en,@Big3Tennis those are beauty!,
