In [97]:
import pandas as pd

# Step 1: Base URL for the JSON files
base_url = 'https://github.com/SunilGorantla/QT_Batch_0011/raw/main/Machine_Learning/datasets/chess/train'

# Step 2: List of JSON filenames to be read
file_names = [
    'tournament_1.json', 'tournament_1.json', 'tournament_10.json', 'tournament_10.json', 'tournament_100.json', 'tournament_100.json', 
    'tournament_101.json', 'tournament_101.json', 'tournament_102.json', 'tournament_102.json', 'tournament_103.json', 'tournament_103.json',
]

# Step 3: Initialize an empty list to store DataFrames
dfs = []

# Step 4: Loop through the file names, read each file, and append to the list
for file_name in file_names:
    file_url = f"{base_url}/{file_name}"
    try:
        df = pd.read_json(file_url)
        dfs.append(df)
    except Exception as e:
        print(f"Error reading {file_url}: {e}")

# Step 5: Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Step 6: Reset index to ensure unique index values
combined_df.reset_index(drop=True, inplace=True)

# Step 7: Check the structure of the DataFrame
print("Columns in the DataFrame:", combined_df.columns)
print("Sample data in the DataFrame:", combined_df.head())

# Step 8: Handle the 'games' column
if 'games' in combined_df.columns:
    # Ensure 'games' column contains lists
    if combined_df['games'].apply(lambda x: isinstance(x, list)).all():
        # Explode the 'games' column to separate rows for each game
        games_df = combined_df.explode('games')
        
        # Reset index after exploding
        games_df.reset_index(drop=True, inplace=True)
        
        # Convert the exploded 'games' column to a DataFrame
        games_details_df = pd.json_normalize(games_df['games'])

        # Concatenate the new games details DataFrame with the original DataFrame (excluding the old 'games' column)
        final_df = pd.concat([games_df.drop(columns='games'), games_details_df], axis=1)

        # Reset index again after concatenation
        final_df.reset_index(drop=True, inplace=True)

        # Display the new DataFrame structure
        print("Final DataFrame Structure:")
        print(final_df.head())

        # Optional: Save the new DataFrame to a CSV file for further analysis
        final_df.to_csv('simplified_chess_data.csv', index=False)
    else:
        print("The 'games' column does not contain lists.")
else:
    print("The 'games' column is missing.")


Columns in the DataFrame: Index(['name', 'start_date', 'end_date', 'games', 'tours', 'time_control'], dtype='object')
Sample data in the DataFrame:            name  start_date    end_date  \
0  tournament_1  2014-01-08  2014-01-17   
1  tournament_1  2014-01-08  2014-01-17   
2  tournament_1  2014-01-08  2014-01-17   
3  tournament_1  2014-01-08  2014-01-17   
4  tournament_1  2014-01-08  2014-01-17   

                                               games  tours time_control  
0  [{'white': '贾叶珍', 'black': '范辰妮', 'date': '201...      9        rapid  
1  [{'white': '曹灵缨', 'black': '陆桂姐', 'date': '201...      9        rapid  
2  [{'white': '刘家坚', 'black': '沈岚鸿', 'date': '201...      9        rapid  
3  [{'white': '沈岚鸿', 'black': '李文子', 'date': '201...      9        rapid  
4  [{'white': '范存妞', 'black': '沈岚鸿', 'date': '201...      9        rapid  
Final DataFrame Structure:
           name  start_date    end_date  tours time_control white black  \
0  tournament_1  2014-01-08  2014-01-17  

In [98]:
final_df = final_df.drop_duplicates()

In [101]:

# Create the 'Pattern' column by splitting the 'id' column after the first "_"
final_df['Pattern'] = final_df['id'].str.split('_', n=1).str[1]
#final_df['Pattern'] = final_df['Pattern'].str.replace('_', '.', regex=False)
final_df['series_no'] = final_df['id'].str.split('_', n=2).str[2]
#final_df['Groupwise_Rank'] = final_df['series_no'].rank(method='dense', ascending=True).astype(int)
final_df['Tournment_serial'] = final_df['name'].str.split('_', n=1).str[1]
# Concatenate the columns and declare the result as a float
final_df['concatenate'] = final_df['Tournment_serial'].astype(str) + '.' + final_df['series_no'].astype(str)
final_df['dense_rank11'] = final_df['series_no'].rank(method='dense', ascending=True).astype(int)
# Convert the concatenated string to float
#final_df['concatenate'] = final_df['concatenate'].astype(float)

# Rank the 'Pattern' column in ascending order and create a new column 'Rank'

final_df.head( )


Unnamed: 0,name,start_date,end_date,tours,time_control,white,black,date,result,id,Pattern,series_no,Tournment_serial,concatenate,dense_rank11
0,tournament_1,2014-01-08,2014-01-17,9,rapid,贾叶珍,范辰妮,2014-01-08,0.5,tournament_1_1,1_1,1,1,1.1,1
1,tournament_1,2014-01-08,2014-01-17,9,rapid,吕亚光,李嘉爵,2014-01-08,0.5,tournament_1_2,1_2,2,1,1.2,112
2,tournament_1,2014-01-08,2014-01-17,9,rapid,刘奇喜,刘晓鹏,2014-01-08,0.5,tournament_1_3,1_3,3,1,1.3,223
3,tournament_1,2014-01-08,2014-01-17,9,rapid,陆桂姐,郑新聪,2014-01-08,1.0,tournament_1_4,1_4,4,1,1.4,334
4,tournament_1,2014-01-08,2014-01-17,9,rapid,李汶玲,叶天英,2014-01-08,0.5,tournament_1_5,1_5,5,1,1.5,378


In [91]:
final_df.head(10000 )

Unnamed: 0,name,start_date,end_date,tours,time_control,white,black,date,result,id,Pattern,series_no,Tournment_serial,concatenate,dense_rank11
0,tournament_1,2014-01-08,2014-01-17,9,rapid,贾叶珍,范辰妮,2014-01-08,0.5,tournament_1_1,1_1,1,1,1.1,1
1,tournament_1,2014-01-08,2014-01-17,9,rapid,吕亚光,李嘉爵,2014-01-08,0.5,tournament_1_2,1_2,2,1,1.2,112
2,tournament_1,2014-01-08,2014-01-17,9,rapid,刘奇喜,刘晓鹏,2014-01-08,0.5,tournament_1_3,1_3,3,1,1.3,223
3,tournament_1,2014-01-08,2014-01-17,9,rapid,陆桂姐,郑新聪,2014-01-08,1.0,tournament_1_4,1_4,4,1,1.4,334
4,tournament_1,2014-01-08,2014-01-17,9,rapid,李汶玲,叶天英,2014-01-08,0.5,tournament_1_5,1_5,5,1,1.5,378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257,tournament_103,2014-09-08,2014-09-08,12,classic,杨燕波,李海,2014-09-08,0.5,tournament_103_320,103_320,320,103,103.320,247
3258,tournament_103,2014-09-08,2014-09-08,12,classic,秦常明,沈健琴,2014-09-08,0.5,tournament_103_321,103_321,321,103,103.321,248
3259,tournament_103,2014-09-08,2014-09-08,12,classic,张跃,李苏瑾,2014-09-08,1.0,tournament_103_322,103_322,322,103,103.322,249
3260,tournament_103,2014-09-08,2014-09-08,12,classic,程浩婷,杨俊瑞,2014-09-08,0.5,tournament_103_323,103_323,323,103,103.323,250


In [93]:

final_df['dense_rank22'] = final_df['series_no'].rank(method='dense', ascending=True).astype(int)


In [103]:
import pandas as pd

# Assuming final_df is already created and contains data

# Define the file path where you want to save the CSV file
file_path = r'F:\jupyter\final_AAAA.csv'

# Save the DataFrame to a CSV file
final_df.to_csv(file_path, index=False)

print(f"DataFrame has been saved to {file_path}")


DataFrame has been saved to F:\jupyter\final_AAAA.csv


In [107]:
import pandas as pd
T1=pd.read_csv("final_AAAA.csv")
T1.head(1000)

Unnamed: 0,name,start_date,end_date,tours,time_control,white,black,date,result,id,Pattern,series_no,Tournment_serial,concatenate,dense_rank11
0,tournament_1,2014-01-08,2014-01-17,9,rapid,贾叶珍,范辰妮,2014-01-08,0.5,tournament_1_1,1_1,1,1,1.100,1
1,tournament_1,2014-01-08,2014-01-17,9,rapid,吕亚光,李嘉爵,2014-01-08,0.5,tournament_1_2,1_2,2,1,1.200,112
2,tournament_1,2014-01-08,2014-01-17,9,rapid,刘奇喜,刘晓鹏,2014-01-08,0.5,tournament_1_3,1_3,3,1,1.300,223
3,tournament_1,2014-01-08,2014-01-17,9,rapid,陆桂姐,郑新聪,2014-01-08,1.0,tournament_1_4,1_4,4,1,1.400,334
4,tournament_1,2014-01-08,2014-01-17,9,rapid,李汶玲,叶天英,2014-01-08,0.5,tournament_1_5,1_5,5,1,1.500,378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,tournament_100,2014-08-29,2014-09-22,12,classic,金心峰,贾烨广,2014-09-20,0.0,tournament_100_399,100_399,399,100,100.399,333
996,tournament_100,2014-08-29,2014-09-22,12,classic,傅建,李舒初,2014-09-20,0.0,tournament_100_400,100_400,400,100,100.400,336
997,tournament_100,2014-08-29,2014-09-22,12,classic,李利平,刘碧艳,2014-09-20,1.0,tournament_100_401,100_401,401,100,100.401,337
998,tournament_100,2014-08-29,2014-09-22,12,classic,冯三立,王真直,2014-09-20,0.0,tournament_100_402,100_402,402,100,100.402,338


In [111]:
T1['dense_rank11'] = T1['series_no'].rank(method='dense', ascending=True).astype(int)

In [113]:
T1.head(1000)

Unnamed: 0,name,start_date,end_date,tours,time_control,white,black,date,result,id,Pattern,series_no,Tournment_serial,concatenate,dense_rank11
0,tournament_1,2014-01-08,2014-01-17,9,rapid,贾叶珍,范辰妮,2014-01-08,0.5,tournament_1_1,1_1,1,1,1.100,1
1,tournament_1,2014-01-08,2014-01-17,9,rapid,吕亚光,李嘉爵,2014-01-08,0.5,tournament_1_2,1_2,2,1,1.200,2
2,tournament_1,2014-01-08,2014-01-17,9,rapid,刘奇喜,刘晓鹏,2014-01-08,0.5,tournament_1_3,1_3,3,1,1.300,3
3,tournament_1,2014-01-08,2014-01-17,9,rapid,陆桂姐,郑新聪,2014-01-08,1.0,tournament_1_4,1_4,4,1,1.400,4
4,tournament_1,2014-01-08,2014-01-17,9,rapid,李汶玲,叶天英,2014-01-08,0.5,tournament_1_5,1_5,5,1,1.500,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,tournament_100,2014-08-29,2014-09-22,12,classic,金心峰,贾烨广,2014-09-20,0.0,tournament_100_399,100_399,399,100,100.399,399
996,tournament_100,2014-08-29,2014-09-22,12,classic,傅建,李舒初,2014-09-20,0.0,tournament_100_400,100_400,400,100,100.400,400
997,tournament_100,2014-08-29,2014-09-22,12,classic,李利平,刘碧艳,2014-09-20,1.0,tournament_100_401,100_401,401,100,100.401,401
998,tournament_100,2014-08-29,2014-09-22,12,classic,冯三立,王真直,2014-09-20,0.0,tournament_100_402,100_402,402,100,100.402,402


In [115]:
import pandas as pd

# Example initial ELO score
initial_elo = 1200

# Create a dictionary to store ELO ratings of players
elo_ratings = {}

# Function to calculate the expected score
def expected_score(elo_a, elo_b):
    return 1 / (1 + 10**((elo_b - elo_a) / 400))

# Function to update the ELO rating
def update_elo(elo, expected, actual, k=32):
    return elo + k * (actual - expected)

# Sort the DataFrame by 'Tournment_serial' and 'series_no'
T1 = T1.sort_values(by=['Tournment_serial', 'series_no']).reset_index(drop=True)

# Initialize lists to store ELO scores for each match
white_elo_scores = []
black_elo_scores = []

# Iterate over each match in the DataFrame and update ELO scores
for _, row in T1.iterrows():
    white_player = row['white']
    black_player = row['black']
    result = row['result']
    
    # Initialize ELO scores if players are not in the dictionary
    if white_player not in elo_ratings:
        elo_ratings[white_player] = initial_elo
    if black_player not in elo_ratings:
        elo_ratings[black_player] = initial_elo
    
    # Get current ELO ratings
    elo_white = elo_ratings[white_player]
    elo_black = elo_ratings[black_player]
    
    # Calculate expected scores
    expected_white = expected_score(elo_white, elo_black)
    expected_black = 1 - expected_white
    
    # Store current ELO scores
    white_elo_scores.append(elo_white)
    black_elo_scores.append(elo_black)
    
    # Determine actual scores based on match result
    if result == 1.0:  # White wins
        actual_white = 1.0
        actual_black = 0.0
    elif result == 0.0:  # Black wins
        actual_white = 0.0
        actual_black = 1.0
    else:  # Draw
        actual_white = 0.5
        actual_black = 0.5
    
    # Update ELO ratings
    elo_ratings[white_player] = update_elo(elo_white, expected_white, actual_white)
    elo_ratings[black_player] = update_elo(elo_black, expected_black, actual_black)

# Add the ELO scores to the DataFrame
T1['white_ELO_scores'] = white_elo_scores
T1['black_ELO_scores'] = black_elo_scores

# Display the updated DataFrame
print(T1.head())


           name  start_date    end_date  tours time_control white black  \
0  tournament_1  2014-01-08  2014-01-17      9        rapid   贾叶珍   范辰妮   
1  tournament_1  2014-01-08  2014-01-17      9        rapid   吕亚光   李嘉爵   
2  tournament_1  2014-01-08  2014-01-17      9        rapid   刘奇喜   刘晓鹏   
3  tournament_1  2014-01-08  2014-01-17      9        rapid   陆桂姐   郑新聪   
4  tournament_1  2014-01-08  2014-01-17      9        rapid   李汶玲   叶天英   

         date  result              id Pattern  series_no  Tournment_serial  \
0  2014-01-08     0.5  tournament_1_1     1_1          1                 1   
1  2014-01-08     0.5  tournament_1_2     1_2          2                 1   
2  2014-01-08     0.5  tournament_1_3     1_3          3                 1   
3  2014-01-08     1.0  tournament_1_4     1_4          4                 1   
4  2014-01-08     0.5  tournament_1_5     1_5          5                 1   

   concatenate  dense_rank11  white_ELO_scores  black_ELO_scores  
0          1.

In [117]:
import pandas as pd

# Assuming final_df is already created and contains data

# Define the file path where you want to save the CSV file
file_path = r'F:\jupyter\final_ELO.csv'

# Save the DataFrame to a CSV file
T1.to_csv(file_path, index=False)

print(f"DataFrame has been saved to {file_path}")


DataFrame has been saved to F:\jupyter\final_ELO.csv
