In [2]:
import os
import pandas as pd

# Step 1: Display current working directory
cwd = os.getcwd()
print(f"Current working directory: {cwd}")

# Step 2: Navigate two folders up and into the desired folder
base_path = os.path.abspath(os.path.join(cwd, "../../folder_04_extracted_players/folder_01_training_data"))

# Step 3: Define the list of training subfolders (can be extended in the future)
list_training_data = ['data_03_players_kaggle']

# Step 4: Define the list of color subfolders
list_colors = ['black', 'blue', 'green', 'orange', 'red', 'white', 'yellow']

# Step 5: Initialize an empty list to store the data
data = []

# Step 6: Process each training folder
for training_folder in list_training_data:
    # Construct the full path to the training folder
    training_folder_path = os.path.join(base_path, training_folder)
    
    # Check if the folder exists
    if not os.path.exists(training_folder_path):
        print(f"Folder not found: {training_folder_path}")
        continue
    
    # Process each color folder inside the current training folder
    for color in list_colors:
        # Construct the path to the color folder
        color_folder_path = os.path.join(training_folder_path, color)
        
        # Check if the folder exists
        if not os.path.exists(color_folder_path):
            print(f"Color folder not found: {color_folder_path}")
            continue
        
        # Iterate through all files in the color folder
        for file_name in os.listdir(color_folder_path):
            # Only process files with valid image extensions
            if file_name.lower().endswith(('.png', '.jpg', '.jpeg')):
                # Extract the game ID from the file name
                game_id = f"game_{file_name.split('_')[0]}"
                
                # Construct the full path to the image
                file_path = os.path.join(color_folder_path, file_name)
                
                # Append the data to the list
                data.append({
                    'path': file_path,
                    'color': color,
                    'game': game_id
                })


# Step 7: Create a DataFrame from the collected data
df = pd.DataFrame(data)
df['color_and_game'] = df['color'] + '_' + df['game']

# Step 8: Display the final DataFrame
print(f"Final DataFrame created with {len(df)} entries.")
print(df)

Current working directory: C:\Users\elias\OneDrive\Desktop\Artificial-Intelligence\FH-Technikum\Master_Thesis\folder_01_modules\folder_02_training_models
Final DataFrame created with 25586 entries.
                                                    path   color      game  \
0      C:\Users\elias\OneDrive\Desktop\Artificial-Int...   black  game_140   
1      C:\Users\elias\OneDrive\Desktop\Artificial-Int...   black  game_140   
2      C:\Users\elias\OneDrive\Desktop\Artificial-Int...   black  game_140   
3      C:\Users\elias\OneDrive\Desktop\Artificial-Int...   black  game_140   
4      C:\Users\elias\OneDrive\Desktop\Artificial-Int...   black  game_140   
...                                                  ...     ...       ...   
25581  C:\Users\elias\OneDrive\Desktop\Artificial-Int...  yellow   game_84   
25582  C:\Users\elias\OneDrive\Desktop\Artificial-Int...  yellow   game_84   
25583  C:\Users\elias\OneDrive\Desktop\Artificial-Int...  yellow   game_84   
25584  C:\Users\elias\

In [3]:
df.head()

Unnamed: 0,path,color,game,color_and_game
0,C:\Users\elias\OneDrive\Desktop\Artificial-Int...,black,game_140,black_game_140
1,C:\Users\elias\OneDrive\Desktop\Artificial-Int...,black,game_140,black_game_140
2,C:\Users\elias\OneDrive\Desktop\Artificial-Int...,black,game_140,black_game_140
3,C:\Users\elias\OneDrive\Desktop\Artificial-Int...,black,game_140,black_game_140
4,C:\Users\elias\OneDrive\Desktop\Artificial-Int...,black,game_140,black_game_140


In [4]:
df["color_and_game"].unique()

array(['black_game_140', 'black_game_154', 'black_game_75',
       'black_game_76', 'black_game_82', 'black_game_97', 'blue_game_16',
       'blue_game_21', 'blue_game_26', 'blue_game_46', 'blue_game_50',
       'blue_game_63', 'green_game_115', 'green_game_172',
       'green_game_173', 'green_game_174', 'green_game_20',
       'green_game_53', 'orange_game_151', 'orange_game_169',
       'orange_game_170', 'orange_game_171', 'orange_game_53',
       'orange_game_74', 'red_game_102', 'red_game_104', 'red_game_15',
       'red_game_51', 'red_game_60', 'red_game_6', 'white_game_15',
       'white_game_5', 'white_game_65', 'white_game_68', 'white_game_73',
       'white_game_75', 'yellow_game_102', 'yellow_game_145',
       'yellow_game_14', 'yellow_game_18', 'yellow_game_19',
       'yellow_game_84'], dtype=object)

### For master thesis:
Missing from supervisor:
* black: 2 ( wegen tulln-06) done
* blue: 1 done
* green: 1 
* orange: 0
* red: 0
* white: 1
* yellow: 2
  



In [5]:
# Count unique occurrences in the 'game_and_color' column
unique_counts = df['color_and_game'].value_counts()

# Sort the result alphabetically by index
unique_counts_sorted = unique_counts.sort_index()

# Display the result
print("Unique occurrences in 'game_and_color' (alphabetically sorted):")
print(unique_counts_sorted)


Unique occurrences in 'game_and_color' (alphabetically sorted):
color_and_game
black_game_140     545
black_game_154     583
black_game_75      567
black_game_76      350
black_game_82      485
black_game_97      546
blue_game_16       652
blue_game_21       447
blue_game_26       483
blue_game_46       594
blue_game_50       750
blue_game_63       549
green_game_115     653
green_game_172     644
green_game_173     621
green_game_174     530
green_game_20      660
green_game_53      486
orange_game_151    545
orange_game_169    689
orange_game_170    728
orange_game_171    610
orange_game_53     566
orange_game_74     628
red_game_102       625
red_game_104       542
red_game_15        594
red_game_51        572
red_game_6         705
red_game_60        554
white_game_15      671
white_game_5       697
white_game_65      717
white_game_68      616
white_game_73      739
white_game_75      717
yellow_game_102    636
yellow_game_14     633
yellow_game_145    668
yellow_game_18     634
y

In [6]:
# Count unique occurrences in the 'game_and_color' column
unique_counts = df['color_and_game'].value_counts()

# Sort the result alphabetically by index
unique_counts_sorted = unique_counts.sort_index()

# Display the result
print("Unique occurrences in 'game_and_color' (alphabetically sorted):")
print(unique_counts_sorted)

# Calculate the minimum count
min_count = unique_counts.min()

# Calculate the maximum count
max_count = unique_counts.max()

# Calculate the average count
average_count = unique_counts.mean()

# Display the results
print("\nStatistics for 'color_and_game' occurrences:")
print(f"Minimum count: {min_count}")
print(f"Maximum count: {max_count}")
print(f"Average count: {average_count:.2f}")


Unique occurrences in 'game_and_color' (alphabetically sorted):
color_and_game
black_game_140     545
black_game_154     583
black_game_75      567
black_game_76      350
black_game_82      485
black_game_97      546
blue_game_16       652
blue_game_21       447
blue_game_26       483
blue_game_46       594
blue_game_50       750
blue_game_63       549
green_game_115     653
green_game_172     644
green_game_173     621
green_game_174     530
green_game_20      660
green_game_53      486
orange_game_151    545
orange_game_169    689
orange_game_170    728
orange_game_171    610
orange_game_53     566
orange_game_74     628
red_game_102       625
red_game_104       542
red_game_15        594
red_game_51        572
red_game_6         705
red_game_60        554
white_game_15      671
white_game_5       697
white_game_65      717
white_game_68      616
white_game_73      739
white_game_75      717
yellow_game_102    636
yellow_game_14     633
yellow_game_145    668
yellow_game_18     634
y

In [7]:
# Target number of samples per group
TARGET_COUNT = 300

# Random state for reproducibility
RANDOM_STATE = 19

def resample_group(group, target, random_state):
    """
    Resample a DataFrame group to the target number of samples.
    
    Parameters:
    - group (DataFrame): The group to resample.
    - target (int): The target number of samples.
    - random_state (int): Seed for random number generator.
    
    Returns:
    - DataFrame: Resampled group.
    """
    current_count = len(group)
    
    if current_count > target:
        # Downsample without replacement
        resampled = group.sample(n=target, replace=False, random_state=random_state)
    elif current_count < target:
        # Upsample with replacement
        resampled = group.sample(n=target, replace=True, random_state=random_state)
    else:
        # No resampling needed
        resampled = group
    
    return resampled


In [8]:
# Apply the resampling function to each group
df_resampled = df.groupby('color_and_game').apply(
    lambda group: resample_group(group, TARGET_COUNT, RANDOM_STATE)
).reset_index(drop=True)

# Display the first few rows of the resampled DataFrame
print("\nResampled Data:")
print(df_resampled.head())



Resampled Data:
                                                path  color      game  \
0  C:\Users\elias\OneDrive\Desktop\Artificial-Int...  black  game_140   
1  C:\Users\elias\OneDrive\Desktop\Artificial-Int...  black  game_140   
2  C:\Users\elias\OneDrive\Desktop\Artificial-Int...  black  game_140   
3  C:\Users\elias\OneDrive\Desktop\Artificial-Int...  black  game_140   
4  C:\Users\elias\OneDrive\Desktop\Artificial-Int...  black  game_140   

   color_and_game  
0  black_game_140  
1  black_game_140  
2  black_game_140  
3  black_game_140  
4  black_game_140  


  df_resampled = df.groupby('color_and_game').apply(


In [9]:
# Count unique occurrences in the resampled 'color_and_game' column
new_unique_counts = df_resampled['color_and_game'].value_counts()

# Sort the result alphabetically by index
new_unique_counts_sorted = new_unique_counts.sort_index()

# Display the new distribution
print("\nNew unique occurrences in 'color_and_game' (alphabetically sorted):")
print(new_unique_counts_sorted)



New unique occurrences in 'color_and_game' (alphabetically sorted):
color_and_game
black_game_140     300
black_game_154     300
black_game_75      300
black_game_76      300
black_game_82      300
black_game_97      300
blue_game_16       300
blue_game_21       300
blue_game_26       300
blue_game_46       300
blue_game_50       300
blue_game_63       300
green_game_115     300
green_game_172     300
green_game_173     300
green_game_174     300
green_game_20      300
green_game_53      300
orange_game_151    300
orange_game_169    300
orange_game_170    300
orange_game_171    300
orange_game_53     300
orange_game_74     300
red_game_102       300
red_game_104       300
red_game_15        300
red_game_51        300
red_game_6         300
red_game_60        300
white_game_15      300
white_game_5       300
white_game_65      300
white_game_68      300
white_game_73      300
white_game_75      300
yellow_game_102    300
yellow_game_14     300
yellow_game_145    300
yellow_game_18     

In [10]:
new_unique_counts = df_resampled['color'].value_counts()
# Display the new distribution
print("\nNew unique occurrences in 'color_and_game' (alphabetically sorted):")
print(new_unique_counts)


New unique occurrences in 'color_and_game' (alphabetically sorted):
color
black     1800
blue      1800
green     1800
orange    1800
red       1800
white     1800
yellow    1800
Name: count, dtype: int64


In [11]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold


unique_values = sorted(df_resampled["color_and_game"].unique())

# Create a new DataFrame based on unique-values
df_fold = pd.DataFrame({
    "color_and_game": unique_values,
    "color": [x.split('_')[0] for x in unique_values]}) 



# Create Stratified KFold based on 'color'
df_fold['color_group'] = df_fold['color'].astype("category").cat.codes
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=19)

# Assign folds
df_fold['fold'] = -1
for fold, (_, test_idx) in enumerate(skf.split(df_fold, df_fold['color_group'])):
    df_fold.loc[test_idx, 'fold'] = fold

# Display the resulting folds
for i in range(3):
    print(f"\nFold {i + 1}:\n", df_fold[df_fold['fold'] == i][['color_and_game', 'color']])




Fold 1:
     color_and_game   color
0   black_game_140   black
3    black_game_76   black
7     blue_game_21    blue
10    blue_game_50    blue
13  green_game_172   green
17   green_game_53   green
22  orange_game_53  orange
23  orange_game_74  orange
25    red_game_104     red
27     red_game_51     red
32   white_game_65   white
33   white_game_68   white
39  yellow_game_18  yellow
40  yellow_game_19  yellow

Fold 2:
      color_and_game   color
2     black_game_75   black
4     black_game_82   black
8      blue_game_26    blue
11     blue_game_63    blue
14   green_game_173   green
16    green_game_20   green
18  orange_game_151  orange
21  orange_game_171  orange
24     red_game_102     red
26      red_game_15     red
31     white_game_5   white
34    white_game_73   white
36  yellow_game_102  yellow
37   yellow_game_14  yellow

Fold 3:
      color_and_game   color
1    black_game_154   black
5     black_game_97   black
6      blue_game_16    blue
9      blue_game_46    blue
12   

In [12]:
df_fold

Unnamed: 0,color_and_game,color,color_group,fold
0,black_game_140,black,0,0
1,black_game_154,black,0,2
2,black_game_75,black,0,1
3,black_game_76,black,0,0
4,black_game_82,black,0,1
5,black_game_97,black,0,2
6,blue_game_16,blue,1,2
7,blue_game_21,blue,1,0
8,blue_game_26,blue,1,1
9,blue_game_46,blue,1,2


In [13]:
# Merge df_resampled with df_fold based on 'game_and_color'
df_train = pd.merge(df_resampled, df_fold[['color_and_game', 'fold']], on='color_and_game', how='left')

# Show results
print("Merged DataFrame:")
print(df_train.head())


Merged DataFrame:
                                                path  color      game  \
0  C:\Users\elias\OneDrive\Desktop\Artificial-Int...  black  game_140   
1  C:\Users\elias\OneDrive\Desktop\Artificial-Int...  black  game_140   
2  C:\Users\elias\OneDrive\Desktop\Artificial-Int...  black  game_140   
3  C:\Users\elias\OneDrive\Desktop\Artificial-Int...  black  game_140   
4  C:\Users\elias\OneDrive\Desktop\Artificial-Int...  black  game_140   

   color_and_game  fold  
0  black_game_140     0  
1  black_game_140     0  
2  black_game_140     0  
3  black_game_140     0  
4  black_game_140     0  


In [14]:
df_train.head()

Unnamed: 0,path,color,game,color_and_game,fold
0,C:\Users\elias\OneDrive\Desktop\Artificial-Int...,black,game_140,black_game_140,0
1,C:\Users\elias\OneDrive\Desktop\Artificial-Int...,black,game_140,black_game_140,0
2,C:\Users\elias\OneDrive\Desktop\Artificial-Int...,black,game_140,black_game_140,0
3,C:\Users\elias\OneDrive\Desktop\Artificial-Int...,black,game_140,black_game_140,0
4,C:\Users\elias\OneDrive\Desktop\Artificial-Int...,black,game_140,black_game_140,0


In [15]:
# Group by 'color_and_game' and 'fold', and count occurrences
game_fold_counts = df_train.groupby(['color_and_game', 'fold']).size().reset_index(name='count')

# Sort by 'fold' and optionally by 'color_and_game'
game_fold_counts = game_fold_counts.sort_values(by=['fold', 'color_and_game'])

# Display the results
print("Count of each game in the folds (sorted by fold):")
print(game_fold_counts)


Count of each game in the folds (sorted by fold):
     color_and_game  fold  count
0    black_game_140     0    300
3     black_game_76     0    300
7      blue_game_21     0    300
10     blue_game_50     0    300
13   green_game_172     0    300
17    green_game_53     0    300
22   orange_game_53     0    300
23   orange_game_74     0    300
25     red_game_104     0    300
27      red_game_51     0    300
32    white_game_65     0    300
33    white_game_68     0    300
39   yellow_game_18     0    300
40   yellow_game_19     0    300
2     black_game_75     1    300
4     black_game_82     1    300
8      blue_game_26     1    300
11     blue_game_63     1    300
14   green_game_173     1    300
16    green_game_20     1    300
18  orange_game_151     1    300
21  orange_game_171     1    300
24     red_game_102     1    300
26      red_game_15     1    300
31     white_game_5     1    300
34    white_game_73     1    300
36  yellow_game_102     1    300
37   yellow_game_14     1 

In [16]:
df_train.shape

(12600, 5)