In [1]:
import numpy as np
import pandas as pd
from yaml import safe_load
import os
from tqdm import tqdm

In [2]:
filenames = []
for file in os.listdir('archive (1)/t20s/'):
    filenames.append(os.path.join('archive (1)/t20s/',file))

In [3]:

filenames[0:5]

['archive (1)/t20s/1001349.yaml',
 'archive (1)/t20s/1001351.yaml',
 'archive (1)/t20s/1001353.yaml',
 'archive (1)/t20s/1004729.yaml',
 'archive (1)/t20s/1007655.yaml']

In [4]:
import os
import pandas as pd
import yaml
from tqdm import tqdm
import time

data_folder = 'archive (1)/t20s/'

# Collect YAML files only
filenames = [os.path.join(data_folder, f) for f in os.listdir(data_folder) if f.endswith('.yaml')]

print(f"Total files found: {len(filenames)}")
print("First 5:", filenames[:5])

df_list = []
error_files = []
start_time = time.time()

for idx, file in enumerate(tqdm(filenames, desc="Processing files", unit="file")):
    try:
        with open(file, 'r') as f:
            # Load YAML safely
            data = yaml.safe_load(f)
            
            if not isinstance(data, dict):
                continue  # skip non-dict files
            
            # Flatten YAML into DataFrame
            df = pd.json_normalize(data)
            df['match_id'] = idx + 1
            
            df_list.append(df)
            
        # Optional: stop runaway processing
        if time.time() - start_time > 600:  # 10 minutes
            print("⏳ Timeout reached, stopping early...")
            break
            
    except Exception as e:
        error_files.append((file, str(e)))
        continue

# Combine results
if df_list:
    final_df = pd.concat(df_list, ignore_index=True)
else:
    final_df = pd.DataFrame()

# Make a backup copy
backup = final_df.copy()

print("\n✅ Done.")
print(f"✅ Total processed files: {len(df_list)} / {len(filenames)}")
print(f"⚠️ Errors in {len(error_files)} files.")
print("Final DataFrame shape:", final_df.shape)
final_df.head()


Total files found: 4570
First 5: ['archive (1)/t20s/1001349.yaml', 'archive (1)/t20s/1001351.yaml', 'archive (1)/t20s/1001353.yaml', 'archive (1)/t20s/1004729.yaml', 'archive (1)/t20s/1007655.yaml']


Processing files:  47%|████▋     | 2138/4570 [10:00<11:22,  3.56file/s]

⏳ Timeout reached, stopping early...






✅ Done.
✅ Total processed files: 2139 / 4570
⚠️ Errors in 0 files.
Final DataFrame shape: (2139, 4698)


Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.balls_per_over,info.dates,info.gender,info.match_type,info.outcome.by.wickets,info.outcome.winner,...,info.registry.people.Ahmed Baladraf,info.registry.people.Kashif Abbas,info.registry.people.Shakeel Ahmed,info.registry.people.Mohammed Irshad,info.registry.people.Norbert Abii,info.registry.people.D Wabwire,info.registry.people.Shaik Basha,info.registry.people.Maryyam Sheikh,info.registry.people.I Mugisha,info.registry.people.JM Iradukunda
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-02-18,2,6,[2017-02-17],male,T20,5.0,Sri Lanka,...,,,,,,,,,,
1,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-02-19,2,6,[2017-02-19],male,T20,2.0,Sri Lanka,...,,,,,,,,,,
2,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-02-23,1,6,[2017-02-22],male,T20,,Australia,...,,,,,,,,,,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",0.91,2016-09-12,1,6,[2016-09-05],male,T20,,Hong Kong,...,,,,,,,,,,
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",0.91,2016-06-19,1,6,[2016-06-18],male,T20,,Zimbabwe,...,,,,,,,,,,
