In [1]:
import pandas as pd
import requests
import json
import glob
import re

In [2]:
# Competition category used by the feed URLs and output paths in this notebook.
# Supported values: "Men", "Women".
cat = "Men"

# Archive index feed, kept for reference. It used to be a bare string
# expression here, which has no effect beyond echoing the URL:
# https://scores.bcci.tv/feeds-international/internationalarchives.js

In [3]:
# Match-listing endpoints (recent / live / upcoming) for the selected category.
BCCI = {
    "BCCI_RECENT_MATCH_URL": f"https://scores2.bcci.tv/getRecentMatches?platform=international&previousMatchesCount=999&filterType={cat}&loadMore=true&archieves=true",
    "BCCI_LIVE_MATCH_URL": f"https://scores2.bcci.tv/getLiveMatches?platform=international&previousMatchesCount=999&filterType={cat}&loadMore=true&archieves=true",
    "BCCI_UPCOMING_MATCH_URL": f"https://scores2.bcci.tv/getUpcomingMatches?platform=international&previousMatchesCount=999&filterType={cat}&loadMore=true&archieves=true",
}

# CRICKET_AUS_COMMS = "https://apiv2.cricket.com.au/web/views/comments?fixtureId=20755&inningNumber=2&commentType=&overLimit=21&jsconfig=eccn%3Atrue&format=json"
# CRICKET_AUS_MATCH = "https://apiv2.cricket.com.au/web/fixtures/yearfilter?isCompleted=true&year=2025&limit=13&isInningInclude=true&jsconfig=eccn%3Atrue&format=json"

bcci_match_list = []
for feed_url in BCCI.values():
    response = requests.get(feed_url, timeout=100)
    # Stop on an HTTP error rather than treating an error body as a match listing.
    response.raise_for_status()
    data = response.json()
    rows = list(data.keys())
    if not rows:
        # An empty JSON object carries no matches (and would break rows[0] below).
        continue
    if rows[0] == 'recentMatches':
        # The value under the LAST key is iterated as a list of match lists and
        # flattened (for a single-key response first and last key coincide).
        for match_group in data[rows[-1]]:
            bcci_match_list.extend(match_group)
    else:
        bcci_match_list.extend(data[rows[0]])

# NOTE(review): "MatchDate" is compared as-is; if it is a string this is a
# lexicographic order — confirm the feed's date format sorts chronologically.
bcci_match_list = sorted(bcci_match_list, key=lambda match: match["MatchDate"])

with open(f"./bcci_shot_data/{cat}/bcci_match_list.json", 'w') as f:
    json.dump(bcci_match_list, f, indent=4)

In [4]:

def _strip_jsonp(payload, callback):
    """Return the text between ``callback(`` and the last ``)`` of ``payload``.

    ``payload`` is returned unchanged when it is not wrapped in a call to
    ``callback``.
    """
    opener = f"{callback}("
    start = payload.find(opener)
    end = payload.rfind(")")
    if start == -1 or end < start + len(opener):
        return payload
    return payload[start + len(opener):end]


def get_bcci_shot_data(match_id, max_overs):
    """Download a match's per-innings scoring feeds and save them as one JSON file.

    Requests ``{match_id}-Innings{i}.js`` for innings 1..N, where N is 2 when
    ``max_overs`` equals 20 or 50 and 4 otherwise. Every feed is an
    ``onScoring(...)`` JSONP payload; the ``Innings{i}`` object of each feed
    that exists is collected in innings order and the resulting list is
    written to ``./bcci_shot_data/{cat}/json/{match_id}.json``.

    Args:
        match_id: Identifier used in the feed URL and in the output file name.
        max_overs: Overs value of the match; only compared against 20 and 50.

    Raises:
        requests.exceptions.HTTPError: For any HTTP error other than 404.
            A 404 is skipped because that innings simply has no feed; other
            failures are re-raised so an incomplete file is not written.
    """
    innings = 2 if max_overs == 20 or max_overs == 50 else 4

    match_data = []
    for i in range(1, innings + 1):
        BCCI_COMMS_URL = f"https://scores.bcci.tv/feeds-international/scoringfeeds/{match_id}-Innings{i}.js"

        try:
            response = requests.get(BCCI_COMMS_URL, timeout=100)
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            if e.response is not None and e.response.status_code == 404:
                continue
            raise

        # Remove only the JSONP wrapper; a blanket replace of ");" would also
        # delete that sequence wherever it occurs inside the payload text.
        data_json = json.loads(_strip_jsonp(response.text, "onScoring"))
        match_data.append(data_json[f"Innings{i}"])

    with open(f"./bcci_shot_data/{cat}/json/{match_id}.json", 'w') as f:
        json.dump(match_data, f, indent=4)

In [5]:
def csv_to_json(match_id):
    """Flatten a match's saved innings JSON into a single ball-by-ball CSV.

    Despite its name, this reads ``./bcci_shot_data/{cat}/json/{match_id}.json``
    and writes ``./bcci_shot_data/{cat}/csv/{match_id}.csv``. The
    ``OverHistory`` records of all innings are concatenated, identifier and
    commentary columns are dropped, and rows whose ``ActualBallNo`` is blank
    are removed. The DataFrame index is written as the first CSV column.

    Nothing is written (a message is printed instead) when the JSON file
    contains no innings or no ball records.

    Args:
        match_id: Identifier used in both the input and output file names.
    """
    df = pd.read_json(f"./bcci_shot_data/{cat}/json/{match_id}.json")

    # An empty innings list yields a frame without an 'OverHistory' column.
    if 'OverHistory' not in df.columns:
        print(f"No innings data saved for match {match_id}; CSV not written.")
        return

    dfs = [pd.DataFrame(over_history) for over_history in df['OverHistory']]
    final_df = pd.concat(dfs, axis=0, ignore_index=True)

    if final_df.empty:
        print(f"No ball records saved for match {match_id}; CSV not written.")
        return

    # final_df.to_csv(f"./bcci_shot_data/{match_id}.csv")
    # errors='ignore': a feed that lacks one of these columns should not abort the export.
    final_df = final_df.drop(columns=['BallID', 'BallUniqueID', 'StrikerID', 'NonStrikerID', 'BowlerID',
                                      'VideoFile', 'NewCommentry', 'Commentry', 'UPDCommentry', 'OutBatsManID',
                                      'HatCheck', 'CommentStrikers', 'OverName', 'CommentOver', 'RunsText'],
                             errors='ignore')

    # Keep only rows that carry a ball number.
    final_df = final_df[final_df['ActualBallNo'].str.strip() != '']
    final_df.to_csv(f"./bcci_shot_data/{cat}/csv/{match_id}.csv")

In [6]:
# One row per match; the first occurrence wins when a match appears in several listings.
temp_df = pd.DataFrame(bcci_match_list)
temp_df = temp_df.drop_duplicates(subset=['MatchID'], keep='first', inplace=False)

# Ids of matches that were live on a previous run, stored one per line as text.
live_data_file_name = f"./bcci_shot_data/{cat}/live_data_file_name.txt"
try:
    with open(live_data_file_name, 'r') as file:
        existing_ids = {line.strip() for line in file if line.strip()}
except FileNotFoundError:
    existing_ids = set()

data_temp_df = temp_df[temp_df['MatchStatus'] == 'Post']
live_data_temp_df = temp_df[temp_df['MatchStatus'] == 'Live']

# Finished matches: download when no CSV exists yet, or once more when the
# match was live last time (the stored copy may be incomplete).
for match_id, max_overs in zip(data_temp_df['MatchID'], data_temp_df['MATCH_NO_OF_OVERS']):
    match_key = str(match_id)
    temp_file_str = f"./bcci_shot_data/{cat}/csv/{match_id}.csv"

    was_live = match_key in existing_ids
    # glob.glob() on a literal path returns [] when the file is missing, which
    # avoids re-listing the whole csv directory for every match.
    if was_live or not glob.glob(temp_file_str):
        get_bcci_shot_data(match_id, max_overs)
        csv_to_json(match_id)

    if was_live:
        existing_ids.remove(match_key)

# Live matches: always refresh, and remember them so they are re-downloaded
# once after they finish. Ids are kept as str to match what is read from the file.
for match_id, max_overs in zip(live_data_temp_df['MatchID'], live_data_temp_df['MATCH_NO_OF_OVERS']):
    existing_ids.add(str(match_id))

    get_bcci_shot_data(match_id, max_overs)
    csv_to_json(match_id)

with open(live_data_file_name, 'w') as file:
    file.write('\n'.join(sorted(existing_ids)))

In [7]:
# Blank out missing values, discard the commentary/innings columns when they
# are present, then export the de-duplicated match list.
temp_df = temp_df.fillna('').drop(
    columns=['PreMatchCommentary', 'PostMatchCommentary', 'innings'],
    errors='ignore',
)
temp_df.to_csv(f'./bcci_shot_data/{cat}/bcci_match_list.csv', index=False)

  temp_df.fillna('', inplace=True)


In [8]:
import os
import pandas as pd

# Directory that holds one CSV per match
folder_path = f'./bcci_shot_data/{cat}/csv'

# Match ids, in the order their CSVs should appear in the combined file
temp_list = temp_df['MatchID'].tolist()

# Expected file name for each match id (the file may not exist on disk)
csv_files = [f"{num}.csv" for num in temp_list]

# Load every expected file that is actually present, keeping the match order
candidate_paths = (os.path.join(folder_path, name) for name in csv_files)
df_list = [pd.read_csv(path) for path in candidate_paths if os.path.exists(path)]

if not df_list:
    print('No CSV files were found to combine.')
else:
    # Stack all per-match frames and save them as a single CSV
    final_df = pd.concat(df_list, ignore_index=True)
    final_df.to_csv(f'./bcci_shot_data/{cat}/combined_shot_data.csv', index=False)
    print('CSV files have been concatenated successfully!')


CSV files have been concatenated successfully!


In [9]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [10]:
# Example match-centre page:
# https://www.bcci.tv/events/183/border-gavaskar-trophy-2024-25/match/1652/4th-test

# temp_df = pd.DataFrame(bcci_match_list)
# Previously collected pairs, stored one "<match id>, <hawk-eye id>" per line.
hawkeye_file_name = f"./bcci_shot_data/{cat}/hawkeyeid_matchid.txt"
try:
    with open(hawkeye_file_name, 'r') as file:
        # Skip blank lines (e.g. a trailing newline) instead of failing on int('').
        hawkeye_ids = [tuple(map(int, line.strip().split(', '))) for line in file if line.strip()]
except FileNotFoundError:
    hawkeye_ids = []

# Set mirror of the list for constant-time duplicate checks; the list keeps the file order.
known_pairs = set(hawkeye_ids)

temp_df = pd.read_json(f"./bcci_shot_data/{cat}/bcci_match_list.json")
india_match_df = temp_df[temp_df['HomeTeamName'].isin(['India', 'India (Women)'])]
for c_id, c_name, m_id, m_order in zip(india_match_df['CompetitionID'], india_match_df['CompetitionName'], india_match_df['MatchID'], india_match_df['MatchOrder']):

    # Build the page slug: lower-case, with whitespace runs replaced by '-'.
    c_name_new = re.sub(r'\s+', '-', c_name.lower())
    m_order_new = re.sub(r'\s+', '-', m_order.lower())

    match_center_str = f"https://www.bcci.tv/events/{c_id}/{c_name_new}/match/{m_id}/{m_order_new}"

    response = requests.get(match_center_str, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.find_all('embed')
    if len(text) == 0:
        continue

    hawkurl = text[0].get('src', '')
    # Take the digits after the last "matchId=", so a src without that
    # parameter, or with further query parameters after it, no longer breaks int().
    found_ids = re.findall(r'matchId=(\d+)', hawkurl)
    if not found_ids:
        continue
    hawkid = int(found_ids[-1])

    if (m_id, hawkid) not in known_pairs:
        known_pairs.add((m_id, hawkid))
        hawkeye_ids.append((m_id, hawkid))

# print(hawkeye_ids)
with open(hawkeye_file_name, 'w') as file:
    for i, j in hawkeye_ids:
        file.write(f"{i}, {j}\n")

In [8]:
import requests
import json
import pandas as pd

In [9]:
# Download the archive index: a JSONP payload whose callback is spelled "oncomptetion".
BCCI_ARCHIVES_URL = "https://scores.bcci.tv/feeds-international/internationalarchives.js"
response = requests.get(BCCI_ARCHIVES_URL, timeout=30)
# Stop on an HTTP error instead of failing later inside json.loads.
response.raise_for_status()
data = response.text

# Remove only the JSONP wrapper; a blanket replace of ");" would also delete
# that sequence wherever it occurs inside the payload text.
wrapper_start = data.find("oncomptetion(")
wrapper_end = data.rfind(")")
if wrapper_start != -1 and wrapper_end > wrapper_start:
    result = data[wrapper_start + len("oncomptetion("):wrapper_end]
else:
    result = data

final = json.loads(result)
# One row per entry of the feed's 'competition' list.
archive_df = pd.DataFrame(final['competition'])
# archive_df
# archive_df

In [10]:
# Keep only the competitions whose 'completed' value equals 1.0.
archive_shot_data_match_list = archive_df.loc[archive_df['completed'] == 1.0]

In [11]:
# Collect the 'Matchsummary' entries of every completed competition from its
# schedule feed. Competitions whose feed cannot be fetched or parsed are
# reported by name and skipped.
bcci_archive_data = []
for comp_id in archive_shot_data_match_list['CompetitionID']:
    BCCI_SCORING_FEEDS_URL = f"https://scores.bcci.tv/feeds-international/scoringfeeds/{comp_id}-matchschedule.js"
    try:
        response = requests.get(BCCI_SCORING_FEEDS_URL, timeout=30)
        response.raise_for_status()
        data = response.text

        # Remove only the JSONP wrapper; a blanket replace of ");" would also
        # delete that sequence wherever it occurs inside the payload text.
        wrapper_start = data.find("MatchSchedule(")
        wrapper_end = data.rfind(")")
        if wrapper_start != -1 and wrapper_end > wrapper_start:
            result = data[wrapper_start + len("MatchSchedule("):wrapper_end]
        else:
            result = data

        final = json.loads(result)
        bcci_archive_data.extend(final['Matchsummary'])
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
        # Named exceptions instead of a bare except, so Ctrl-C still interrupts
        # the loop. ValueError covers JSON decoding errors; KeyError/TypeError
        # cover a payload without a usable 'Matchsummary' list.
        print(archive_df[archive_df['CompetitionID'] == comp_id]['CompetitionName'])
        # A fallback feed location was tried here earlier and left disabled:
        # https://scores.bcci.tv/feeds-international/{comp_id}-matchschedule.js?callback=MatchSchedule&_=1738899249298

8    ENGLAND LIONS TOUR OF INDIA
Name: CompetitionName, dtype: object
11    ENGLAND LIONS TOUR OF INDIA MULTIDAY WARM UP GAME
Name: CompetitionName, dtype: object
21    ENGLAND A WOMENS TOUR OF INDIA T20 SERIES
Name: CompetitionName, dtype: object
24    QUADRANGULAR MENS U19 ONE DAY SERIES
Name: CompetitionName, dtype: object
51    INDIA A IN BANGLADESH MULTI DAY SERIES
Name: CompetitionName, dtype: object
52    NEW ZEALAND WM U19 IN INDIA T20 SERIES
Name: CompetitionName, dtype: object
55    WEST INDIES WM U19 VS NEW ZEALAND WM U19 IN INDIA
Name: CompetitionName, dtype: object
62    NEW ZEALAND A TOUR OF INDIA
Name: CompetitionName, dtype: object
63    NEW ZEALAND A TOUR OF INDIA
Name: CompetitionName, dtype: object
92    AUSTRALIA WOMEN V INDIA WOMEN 2021
Name: CompetitionName, dtype: object
95    ENGLAND WOMEN V INDIA WOMEN 2021
Name: CompetitionName, dtype: object
98    INDIA WOMEN V SOUTH AFRICA WOMEN 2021
Name: CompetitionName, dtype: object


In [None]:
# Tabulate the collected summaries: one row per entry of bcci_archive_data.
bcci_archive_df = pd.DataFrame(data=bcci_archive_data)
bcci_archive_df

0      5
1      1
2      2
3      1
4      9
      ..
365    2
366    2
367    2
368    2
369    2
Name: HomeTeamID, Length: 370, dtype: object

In [13]:
# def preprocess(data):
#     df = pd.json_normalize(data['inning'], 'overs', ['id', 'fixtureId', 'inningNumber', 'battingTeamId', 'bowlingTeamId'], record_prefix='over_')
#     # df1 = pd.json_normalize(df['balls'])
#     # df = df.iloc[::-1].reset_index(drop=True)
#     if(len(df) > 0):
#         df = df.drop(index=df.index[-1])
#         df = df.explode('over_balls', ignore_index=True)
#         df1 = pd.json_normalize(df['over_balls'])
#         df.drop(columns=['over_balls'], inplace=True)
#         df2 = pd.concat([df, df1], axis=1)
#         # df2['ball_comments'] = df2['comments'].apply(lambda x: ', '.join([comment['comments'] for comment in x]) if isinstance(x, list) else '')
#         # df2

#         # df3 = pd.json_normalize(df2)
#         df2 = df2.iloc[::-1].reset_index(drop=True)
#         df2.drop(columns=['comments'], inplace=True)
#     else:
#         df2 = pd.DataFrame()

#     return df2
#     # df2.to_csv('./temp.csv', index=False)

In [14]:
# def main_func(matchid):
#     comms_df = pd.DataFrame()
#     for i in range(1, 3):
#         CRICKET_AUS_COMMS = f"https://apiv2.cricket.com.au/web/views/comments?fixtureId={matchid}&inningNumber={i}&commentType=&overLimit=499&jsconfig=eccn%3Atrue&format=json"
#         response = requests.get(CRICKET_AUS_COMMS, timeout=100)
#         data = response.json()
#         df = preprocess(data)
#         comms_df = pd.concat([comms_df, df], ignore_index=True)

#     comms_df.to_csv(f"./{matchid}.csv", index=False)
#     # with open("./temp.json", "w") as f:
#     #     json.dump(comms_data, f, indent=4)
#     # comms_data.extend(data['inning'])
#     # df = pd.read_json(data)

In [15]:
# main_func(11291)

In [15]:
# Load the saved id pairs: each line becomes a list of its comma-separated
# fields, stripped of surrounding whitespace and kept as strings.
hawk_pair = []
with open("./bcci_shot_data/Men/hawkeyeid_matchid.txt") as pairs_file:
    hawk_pair.extend(
        [field.strip() for field in raw_line.strip().split(',')]
        for raw_line in pairs_file
    )

hawk_pair

[['416', '13288607242335'],
 ['417', '13288866796418'],
 ['418', '13289039898867'],
 ['419', '13289490448693'],
 ['420', '13289663853284'],
 ['421', '13289837179102'],
 ['424', '13290181311355'],
 ['425', '13290354143841'],
 ['426', '13290440625168'],
 ['422', '13290838587046'],
 ['427', '13299253690057'],
 ['428', '13299512638233'],
 ['429', '13299685274523'],
 ['430', '13299944840163'],
 ['431', '13300117631732'],
 ['636', '13308153194853'],
 ['637', '13308421662088'],
 ['638', '13308584619431'],
 ['639', '13308844210044'],
 ['640', '13309189373728'],
 ['641', '13309362654636'],
 ['642', '13309524050930'],
 ['643', '13309774466666'],
 ['644', '13309949672842'],
 ['726', '13317224945965'],
 ['727', '13317397841340'],
 ['728', '13317568537021'],
 ['729', '13317810030358'],
 ['730', '13317980527734'],
 ['731', '13318241938233'],
 ['732', '13318499127340'],
 ['744', '13318760641787'],
 ['733', '13319017446902'],
 ['734', '13319298330833'],
 ['735', '13319470867371'],
 ['736', '1331973020

In [17]:
import csv

# Read the two-column CSV into a list of [first_value, second_value] integer rows.
# NOTE(review): the doubled ".csv.csv" suffix looks accidental; it is kept so
# the output location does not change — confirm before renaming.
sorted_output_path = './hawkid_espnid_new.csv.csv'
data = []

with open('./hawkid_espnid.csv', 'r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        if not row:
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing on row[0].
            continue
        # Convert values to integers (assuming the file has two columns)
        data.append([int(row[0]), int(row[1])])

# Sort the rows by their first value (index 0 of each row)
sorted_data = sorted(data, key=lambda x: x[0])

# Write the sorted rows to the output CSV file
with open(sorted_output_path, 'w', newline='') as file:
    csv.writer(file).writerows(sorted_data)

# Report the file that was actually written.
print(f"Sorted data has been saved to '{sorted_output_path}'")


Sorted data has been saved to 'sorted_file.csv'


In [16]:
# import requests
# from datetime import datetime, timedelta

# # Base URL without matchId
# base_url = "https://polls.iplt20.com/?entity_matchId=74795&matchId="

# # Define your start and end times
# start_date = datetime(2024, 1, 1, 12, 0)  # Example start time
# end_date = datetime(2024, 1, 1, 12, 30)  # Example end time (half an hour later)

# # Generate timestamps for the range
# current_time = start_date
# time_increment = timedelta(seconds=1)  # Increment by 1 second


# while current_time <= end_date:
#     # Convert to Unix timestamp (seconds since epoch)
#     match_id = int(current_time.timestamp())
#     url = f"{base_url}{match_id}"

#     try:
#         # Send a GET request
#         response = requests.get(url, timeout=10)

#         # Log the response
#         print(f"URL: {url}")
#         print(f"Status Code: {response.status_code}")

#         if response.status_code == 200:
#             print(f"Response: {response.text}\n")
#         else:
#             print("Non-200 response received.\n")

#     except requests.exceptions.RequestException as e:
#         print(f"Request failed for URL {url}: {e}\n")

#     # Increment time
#     current_time += time_increment
