In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
chess_data = pd.read_csv('Openings.csv')

# Select the features and target variable
features = ['Opening', 'Num Games', 'Perf Rating', 'Player Win %']
X = chess_data[features]
y = chess_data['Player Win %']

# One-hot encode the 'Opening' column
encoder = OneHotEncoder()
X_encoded = pd.get_dummies(X, columns=['Opening'])
X_encoded = encoder.fit_transform(X_encoded)

# Split the data into training and testing sets
X_train_encoded, X_test_encoded, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train_encoded, y_train)

# Make predictions
predictions = model.predict(X_test_encoded)

# Calculate the prediction error for each sample
prediction_error = abs(predictions - y_test)

# Combine prediction error with original data
error_df = pd.DataFrame({'Prediction Error': prediction_error, 'Predicted Win %': predictions, 'Actual Win %': y_test})
error_df['Opening'] = X['Opening']  # Add 'Opening' column from original data

# Select the top 5 samples with the largest prediction errors
top_5_errors = error_df.nlargest(5, 'Prediction Error')

# Display the top 5 samples
print("Top 5 samples where the model's predictions were the furthest from the actual win rates:")
print(top_5_errors)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)


Top 5 samples where the model's predictions were the furthest from the actual win rates:
      Prediction Error  Predicted Win %  Actual Win %  \
1090            34.556           34.744          69.3   
382             22.162           35.762          13.6   
1774            22.067           35.533          57.6   
598             19.982           35.718          55.7   
543             19.961           35.161          15.2   

                                                Opening  
1090        Queen's Gambit Accepted, Saduleto Variation  
382   Four Knights Game, Spanish Variation, Rubinste...  
1774          Spanish Game, Morphy Defense, Wing Attack  
598          Italian Game, Classical Variation, General  
543                  Gruenfeld Defense, Smyslov Defense  
Mean Squared Error: 31.284756519893897


In [3]:
import pandas as pd

# Load the dataset
chess_data = pd.read_csv('Openings.csv')

# Calculate the average win percentage for each opening
opening_win_rates = chess_data.groupby('Opening')['Player Win %'].mean()

# Find the top 5 openings with the highest win percentage
top_openings = opening_win_rates.nlargest(5)

print("Top 5 Chess Openings by Win Percentage:")
print(top_openings)


Top 5 Chess Openings by Win Percentage:
Opening
Italian Game, Two Knights Defense, Fried Liver Attack           77.6
Italian Game, Classical Variation, Center Atttack               71.5
Queen's Gambit Refused, Chigorin Defense, Janowski Variation    70.1
Caro-Kann Defense, Maroczy Variation, Maroczy Gambit            69.5
Queen's Gambit Accepted, Saduleto Variation                     69.3
Name: Player Win %, dtype: float64
