In [1]:
import numpy as np 
import pandas as pd
import os  
import warnings
warnings.filterwarnings('ignore') 
import ast 
from datetime import timedelta

In [2]:
# Input tables for feature building. Each one is passed to exactly one helper in the
# feature-building loop further down.

# Head-to-head records; consumed by get_player_vs_player_stats_ordered.
player_vs_player_stats=pd.read_csv('dt/player_vs_player_stats.csv') 
# Per-match player stats keyed by Date/Venue; consumed by get_player_venue_stats.
player_stats_with_date_venue=pd.read_csv('dt/odi_player_stats_with_date_venue.csv') 
# Per-match player stats including 'EWMA Fantasy Points'; consumed by get_player_matchwise_stats.
all_matches_player=pd.read_csv('dt/matches_all_players.csv') 
# Target values looked up by get_fantasy_points.
# NOTE(review): this is the only table read from the working directory instead of 'dt/' - confirm the path.
fantasy_points=pd.read_csv('fantasy_points_data.csv') 
# Weather per (start_date, venue); consumed by get_weather_data.
venues_with_dates_with_locations_with_weather=pd.read_csv('dt/odis_venues_with_dates_with_locations_with_weather.csv')

In [5]:
def get_player_venue_stats(df, player_name, venue, date, delta): 
    """Summarise one player's record at one venue over a look-back window.

    The window is ``[date - delta days, date)``: matches played on ``date``
    itself are excluded.

    Parameters
    ----------
    df : DataFrame with at least the columns ``Date``, ``Venue``,
        ``player_name``, ``player_Id`` and the stat columns summed below.
        Side effect: ``df['Date']`` is converted to datetime in place.
    player_name, venue : values matched exactly against ``player_name`` / ``Venue``.
    date : end of the window (anything ``pd.to_datetime`` accepts).
    delta : window length in days.

    Returns
    -------
    One-row DataFrame with 13 columns (eight summed stats, ``strike_rate``,
    ``economy``, ``batting_average``, ``fantasy_points``,
    ``number_of_matches_played``). All values are 0 when no match qualifies.
    """
    import numpy as np 
    import pandas as pd 
    from datetime import timedelta

    df['Date'] = pd.to_datetime(df['Date'])
    window_end = pd.to_datetime(date)
    window_start = window_end - timedelta(days=delta)

    is_target = (df['player_name'] == player_name) & (df['Venue'] == venue)
    in_window = (df['Date'] >= window_start) & (df['Date'] < window_end)
    history = df[is_target & in_window]

    summed_columns = (
        'runs_scored', 'balls_faced', 'wickets_taken', 'runs_given',
        'balls_thrown', 'boundaries_scored', 'boundaries_given',
        'number_of_dismissals',
    )
    derived_columns = (
        'strike_rate', 'economy', 'batting_average',
        'fantasy_points', 'number_of_matches_played',
    )

    # No history in the window: hand back an all-zero row with the same layout.
    if history.empty:
        return pd.DataFrame([{name: 0 for name in summed_columns + derived_columns}])

    def _ratio(numerator, denominator):
        # A zero denominator is replaced by 1 so the ratio falls back to the numerator.
        return numerator / (denominator if denominator != 0 else 1)

    # Identification fields are part of the intermediate record and dropped at the end.
    record = {
        'Date': date,
        'Venue': venue,
        'player_Id': history['player_Id'].iloc[0],
        'player_name': player_name,
    }
    totals = {name: history[name].sum() for name in summed_columns}
    record.update(totals)
    record['strike_rate'] = _ratio(totals['runs_scored'], totals['balls_faced'])
    record['economy'] = _ratio(totals['runs_given'] * 6, totals['balls_thrown'])
    record['batting_average'] = _ratio(totals['runs_scored'], totals['number_of_dismissals'])
    record['fantasy_points'] = history['fantasy_points'].sum()
    record['number_of_matches_played'] = history.shape[0]

    summary = pd.DataFrame([record])
    return summary.drop(['Date', 'Venue', 'player_Id', 'player_name'], axis=1)
def get_player_vs_player_stats_ordered(df, player1_name, player2_names, date, delta): 
    """Aggregate head-to-head stats of ``player1_name`` against each opponent.

    Only rows with ``match_date`` in ``[date - delta days, date)`` are used.
    Rows where the player appears as ``player2`` are mirrored (the ``*_b1_b2``
    and ``*_b2_b1`` columns are swapped) so that every row is expressed from
    ``player1_name``'s point of view before aggregation.

    Parameters
    ----------
    df : head-to-head DataFrame with ``match_date``, ``player1_id``,
        ``player1_name``, ``player2_id``, ``player2_name`` and the
        ``*_b1_b2`` / ``*_b2_b1`` / ``fantasy_point_*`` columns.
        Side effect: ``df['match_date']`` is converted to datetime in place.
    player1_name : the player the features are built for.
    player2_names : opponents; the result has one row per entry, in this order.
    date : end of the window (exclusive).
    delta : window length in days.

    Returns
    -------
    DataFrame with one row per opponent and ALWAYS these columns, in this
    order: ``runs_b1_b2, balls_b1_b2, boundaries_b1_b2, dismissals_b1_b2,
    strike_rate_b1_b2, economy_b1_b2, fantasy_point_p1_p2,
    number_of_matches_played``. Opponents without history get an all-zero row.

    Notes
    -----
    The column order used to depend on whether the first opponent had any
    history (the aggregated row and the zero row listed their keys in
    different orders). Callers flatten this frame positionally, so the order
    is now fixed explicitly.
    ``economy_b1_b2`` is derived from the same runs/balls totals as
    ``strike_rate_b1_b2`` (runs * 6 / balls).
    """
    import pandas as pd 
    from datetime import timedelta

    # Single source of truth for the output layout.
    feature_columns = [
        'runs_b1_b2', 'balls_b1_b2', 'boundaries_b1_b2', 'dismissals_b1_b2',
        'strike_rate_b1_b2', 'economy_b1_b2', 'fantasy_point_p1_p2',
        'number_of_matches_played',
    ]
    id_columns = ['player1_id', 'player1_name', 'player2_id', 'player2_name']

    df['match_date'] = pd.to_datetime(df['match_date'])
    end_date = pd.to_datetime(date)
    start_date = end_date - timedelta(days=delta)
    filtered_df = df[
        (df['match_date'] >= start_date) & 
        (df['match_date'] < end_date)
    ]
    direct_matches = filtered_df[
        (filtered_df['player1_name'] == player1_name) & 
        (filtered_df['player2_name'].isin(player2_names))
    ]
    reverse_matches = filtered_df[
        (filtered_df['player2_name'] == player1_name) & 
        (filtered_df['player1_name'].isin(player2_names))
    ]
    # Mirror the reverse rows: rename() maps every label through the dict at
    # once, so the pairwise swaps below do not clobber each other.
    reverse_matches = reverse_matches.rename(columns={
        'player1_id': 'player2_id', 'player1_name': 'player2_name',
        'player2_id': 'player1_id', 'player2_name': 'player1_name',
        'runs_b1_b2': 'runs_b2_b1', 'balls_b1_b2': 'balls_b2_b1',
        'boundaries_b1_b2': 'boundaries_b2_b1', 'dismissals_b1_b2': 'dismissals_b2_b1',
        'runs_b2_b1': 'runs_b1_b2', 'balls_b2_b1': 'balls_b1_b2',
        'boundaries_b2_b1': 'boundaries_b1_b2', 'dismissals_b2_b1': 'dismissals_b1_b2',
        'strike_rate_b1_b2': 'strike_rate_b2_b1', 'strike_rate_b2_b1': 'strike_rate_b1_b2',
        'economy_b1_b2': 'economy_b2_b1', 'economy_b2_b1': 'economy_b1_b2',
        'fantasy_point_p1_p2': 'fantasy_point_p2_p1', 'fantasy_point_p2_p1': 'fantasy_point_p1_p2'
    })
    combined_df = pd.concat([direct_matches, reverse_matches], ignore_index=True)
    aggregated_df = (
        combined_df.groupby(id_columns)
        .agg({
            'runs_b1_b2': 'sum',
            'balls_b1_b2': 'sum',
            'boundaries_b1_b2': 'sum',
            'dismissals_b1_b2': 'sum',
            'fantasy_point_p1_p2': 'sum',
            'match_date': 'count',  # one row per match -> number of matches
        })
        .reset_index()
        .rename(columns={'match_date': 'number_of_matches_played'})
    )
    # Zero balls -> divide by 1 so the ratios fall back to the (zero) numerator.
    safe_balls = aggregated_df['balls_b1_b2'].replace(0, 1)
    aggregated_df['strike_rate_b1_b2'] = aggregated_df['runs_b1_b2'] / safe_balls
    aggregated_df['economy_b1_b2'] = (aggregated_df['runs_b1_b2'] * 6) / safe_balls

    zero_row = dict.fromkeys(feature_columns, 0)
    result_rows = []
    for player2_name in player2_names:
        row = aggregated_df[aggregated_df['player2_name'] == player2_name]
        if row.empty:
            result_rows.append(dict(zero_row))
        else:
            # Select by name so the dict is already in canonical order.
            result_rows.append(row[feature_columns].iloc[0].to_dict())

    # `columns=` pins the order (and the schema when there are no opponents).
    return pd.DataFrame(result_rows, columns=feature_columns)
def get_player_matchwise_stats(df, player, date, delta): 
    """Summarise a player's recent form over a look-back window.

    Uses the rows of ``df`` for ``player`` with ``Date`` in
    ``[date - delta days, date)``.

    Parameters
    ----------
    df : per-match DataFrame with ``Player``, ``Date``, ``EWMA Fantasy Points``,
        ``total_points`` and the capitalised stat columns summed below.
        Side effect: ``df['Date']`` is converted to datetime in place.
    player : value matched exactly against ``Player``.
    date : end of the window (exclusive).
    delta : window length in days.

    Returns
    -------
    One-row DataFrame with 14 columns. ``EWMA Fantasy Points`` is the value of
    the most recent match in the window (missing values count as 0); the other
    stats are sums or ratios of sums. All values are 0 when the window is empty.
    """
    import pandas as pd 
    from datetime import timedelta

    df['Date'] = pd.to_datetime(df['Date'])
    end_date = pd.to_datetime(date)
    start_date = end_date - timedelta(days=delta)
    filtered_df = df[
        (df['Player'] == player) &
        (df['Date'] >= start_date) &
        (df['Date'] < end_date)
    ]
    if filtered_df.empty:
        return pd.DataFrame([{
            'EWMA Fantasy Points': 0,
            'total_points': 0,
            'Runs': 0,
            'Wickets': 0,
            'Balls_Faced': 0,
            'Strike_Rate': 0,
            'matches_played': 0, 
            'Runs_Given': 0,  
            'Balls_Thrown': 0,  
            'Boundaries_Scored': 0,  
            'Boundaries_Given': 0,  
            'Number_of_Dismissals': 0,  
            'Economy': 0, 
            'Batting_Average': 0
        }])

    filtered_df = filtered_df.sort_values(by='Date')
    # fillna() returns a new Series. The previous chained
    # `frame[col].fillna(0, inplace=True)` does not update the frame under
    # pandas Copy-on-Write, which would let NaN through to the features.
    ewma_points = filtered_df['EWMA Fantasy Points'].fillna(0)

    runs_scored = filtered_df['Runs_Scored'].sum()
    balls_faced = filtered_df['Balls_Faced'].sum()
    runs_given = filtered_df['Runs_Given'].sum()
    balls_thrown = filtered_df['Balls_Thrown'].sum()
    dismissals = filtered_df['Number_of_Dismissals'].sum()

    def _ratio(numerator, denominator):
        # A zero denominator is replaced by 1 so the ratio equals the numerator.
        return numerator / (denominator if denominator != 0 else 1)

    aggregated_stats = {
        'EWMA Fantasy Points': ewma_points.iloc[-1],  # latest match in the window
        'total_points': filtered_df['total_points'].sum(),
        'Runs': runs_scored,
        'Wickets': filtered_df['Wickets_Taken'].sum(),
        'Balls_Faced': balls_faced,
        'Strike_Rate': _ratio(runs_scored, balls_faced),
        'matches_played': len(filtered_df), 
        'Runs_Given': runs_given, 
        'Balls_Thrown': balls_thrown, 
        'Boundaries_Scored': filtered_df['Boundaries_Scored'].sum(), 
        'Boundaries_Given': filtered_df['Boundaries_Given'].sum(), 
        'Number_of_Dismissals': dismissals,  
        'Economy': _ratio(runs_given * 6, balls_thrown),
        'Batting_Average': _ratio(runs_scored, dismissals),
    }
    return pd.DataFrame([aggregated_stats])
# Index of the next free row in the feature buffer; advanced by stack_and_pad_dataframes.
counter=0
def stack_and_pad_dataframes(df1, df2, df3, df4, x_train=None, required_rows=12):
    """Flatten four feature frames into one vector and store it as the next sample.

    ``df1`` is first forced to exactly ``required_rows`` rows: missing rows are
    appended as zeros, surplus rows are cut off. The four frames are then
    flattened row by row and concatenated in the order df1, df2, df3, df4, and
    the resulting vector is written to ``x_train[counter]``. The module-level
    ``counter`` is incremented afterwards.

    Parameters
    ----------
    df1 : per-opponent frame (one row per opponent) to pad/truncate.
    df2, df3, df4 : frames appended as-is after flattening.
    x_train : 2-D buffer to write into. When omitted, the module-level
        ``x_train`` is looked up at call time. The previous ``x_train=x_train``
        default was evaluated when the function was defined, which fails with
        NameError if the buffer does not exist yet and keeps pointing at a
        stale array if the buffer is re-allocated later.
    required_rows : number of ``df1`` rows kept in the output (default 12).

    Raises
    ------
    NameError : no ``x_train`` was passed and none exists at module level.
    ValueError / IndexError : raised by NumPy when the vector length does not
        match the buffer width or the buffer is full.
    """
    global counter
    if x_train is None:
        x_train = globals().get('x_train')
        if x_train is None:
            raise NameError(
                "stack_and_pad_dataframes needs a feature buffer: pass x_train=... "
                "or define a module-level x_train first"
            )

    current_rows = df1.shape[0]
    if current_rows < required_rows:
        zero_padding = pd.DataFrame(
            0,
            index=range(required_rows - current_rows),
            columns=df1.columns
        )
        padded_df = pd.concat([df1, zero_padding], ignore_index=True)
    else:
        # Also covers the exact-fit case, where the slice is all of df1.
        padded_df = df1.iloc[:required_rows, :]

    feature_vector = np.concatenate([
        padded_df.to_numpy().ravel(),
        df2.to_numpy().ravel(),
        df3.to_numpy().ravel(),
        df4.to_numpy().ravel(),
    ])
    x_train[counter] = feature_vector
    counter += 1 
def get_fantasy_points(df, player_name, match_date):
    """Look up the fantasy points a player earned in one match.

    Rows are matched on exact equality of ``Player Name`` and ``Match Date``.
    Returns the ``Fantasy Points`` value of the first matching row, or 0 when
    no row matches.
    """
    is_player = df['Player Name'] == player_name
    is_match = df['Match Date'] == match_date
    hits = df[is_player & is_match]
    if hits.empty:
        return 0
    return hits['Fantasy Points'].values[0]
def get_weather_data(dataframe, date, venue): 
    """Return the weather features for one match as a single-row DataFrame.

    Rows are matched on exact equality of ``start_date`` and ``venue``. The
    identifying columns (``start_date``, ``venue``, ``latitude``,
    ``longitude``) are dropped from the result.

    Only the first matching row is returned: the caller flattens this frame
    into a fixed-width feature slot, so a duplicated (date, venue) record would
    otherwise change the feature-vector length and fail far from the cause.

    When nothing matches, a zero row with the columns ``temperature``,
    ``precipitation`` and ``wind_speed`` is returned.
    """
    import pandas as pd 

    match = dataframe[(dataframe['start_date'] == date) & (dataframe['venue'] == venue)]
    if match.empty:
        return pd.DataFrame([{
            'temperature': 0,
            'precipitation': 0,
            'wind_speed': 0
        }])
    return match.head(1).drop(['start_date', 'venue', 'latitude', 'longitude'], axis=1)

In [4]:
# Pre-allocated sample buffers: one row of x_train / one entry of y_train per (match, player).
# 126 columns = 12 opponents x 8 head-to-head stats + 13 venue stats
#               + 14 recent-form stats + 3 weather values.
# 70000 is a capacity, not the sample count: slots that are never written stay 0.
x_train=np.zeros((70000,126)) 
y_train=np.zeros(70000)

In [6]:
data1='dt/csv1'  
data2='dt/csv2' 
data3='dt/csv3' 
# os.listdir() returns entries in arbitrary order; sort so that the sample order
# (and therefore any later row-based slicing of the buffers) is reproducible.
data_1 = sorted(f for f in os.listdir(data1) if os.path.isfile(os.path.join(data1, f)))
data_2 = sorted(f for f in os.listdir(data2) if os.path.isfile(os.path.join(data2, f)))
data_3 = sorted(f for f in os.listdir(data3) if os.path.isfile(os.path.join(data3, f)))

# Look-back windows (in days) for the three history-based feature groups.
VENUE_WINDOW_DAYS = 3000
FORM_WINDOW_DAYS = 180
HEAD_TO_HEAD_WINDOW_DAYS = 800


def _append_team_samples(players, opponents, venue, date, weather, row):
    """Write one sample per player of a team and return the next free label row.

    For every player the venue, recent-form and head-to-head features are
    computed for the window ending at ``date``, stacked together with the
    match ``weather`` into the next row of ``x_train`` (via
    stack_and_pad_dataframes), and the player's fantasy points are stored in
    ``y_train[row]``.
    """
    for player in players:
        venue_stats = get_player_venue_stats(
            player_stats_with_date_venue, player, venue, date, VENUE_WINDOW_DAYS)
        recent_form = get_player_matchwise_stats(
            all_matches_player, player, date, FORM_WINDOW_DAYS)
        head_to_head = get_player_vs_player_stats_ordered(
            player_vs_player_stats, player, opponents, date, HEAD_TO_HEAD_WINDOW_DAYS)
        # Pass the buffer explicitly so the write always targets the current x_train.
        stack_and_pad_dataframes(head_to_head, venue_stats, recent_form, weather,
                                 x_train=x_train)
        y_train[row] = get_fantasy_points(fantasy_points, player, date)
        row += 1
    return row


# stack_and_pad_dataframes advances the module-level `counter` while labels are
# indexed by `i`; reset both together so re-running this cell keeps features
# and labels aligned.
counter = 0
i = 0
for folder, filenames in ((data1, data_1), (data2, data_2), (data3, data_3)):
    for filename in filenames:
        match_info = pd.read_csv(os.path.join(folder, filename))
        # Each match file stores its metadata in the first row; the team
        # line-ups are stringified Python lists.
        team1_players = ast.literal_eval(match_info['team1_players'][0])
        team2_players = ast.literal_eval(match_info['team2_players'][0])
        date = match_info['date'][0]
        venue = match_info['venue'][0]
        # Weather depends only on the match, so look it up once per file.
        weather = get_weather_data(venues_with_dates_with_locations_with_weather, date, venue)
        i = _append_team_samples(team1_players, team2_players, venue, date, weather, i)
        i = _append_team_samples(team2_players, team1_players, venue, date, weather, i)

In [7]:
# Export the collected samples. `i` is the number of rows written by the
# feature-building loop; capping the slice at `i` keeps unfilled (all-zero)
# buffer rows out of the export when fewer than 52000 samples were produced.
# The previous upper limit of 52000 rows is kept.
# NOTE(review): a later cell reuses `i` as a loop variable, so run this cell
# directly after the feature-building loop.
n_samples = min(i, 52000)
x_train2 = pd.DataFrame(x_train[:n_samples, :])
y_train2 = pd.DataFrame(y_train[:n_samples])
x_train2.to_csv('x_train.csv', index=False)
y_train2.to_csv('y_train.csv', index=False)

In [8]:
# Display the label buffer; the trailing zeros are presumably slots of the
# 70000-entry pre-allocation that were never filled - TODO confirm.
y_train

array([ -3., 200.,  82., ...,   0.,   0.,   0.])

In [9]:
import dill

# Serialise each feature helper to its own .pkl file with dill
# (file name -> function object, written in the order listed).
_pickle_targets = {
    "get_player_venue_stats.pkl": get_player_venue_stats,
    "get_player_matchwise_stats.pkl": get_player_matchwise_stats,
    "get_player_vs_player_stats_ordered.pkl": get_player_vs_player_stats_ordered,
    "get_weather_data.pkl": get_weather_data,
    "stack_and_pad_dataframes.pkl": stack_and_pad_dataframes,
    "get_fantasy_points.pkl": get_fantasy_points,
}
for _pkl_name, _helper in _pickle_targets.items():
    with open(_pkl_name, "wb") as f:
        dill.dump(_helper, f)

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

In [22]:
# Human-readable names for the 126 columns of x_train, in the order in which
# the four feature frames are stacked: head-to-head, venue, recent form, weather.

# Head-to-head block: repeated once per opponent slot (12 slots x 8 stats = 96).
pvp_features = [
    'runs_b1_b2', 'balls_b1_b2', 'boundaries_b1_b2', 'dismissals_b1_b2',
    'strike_rate_b1_b2', 'economy_b1_b2', 'fantasy_point_p1_p2', 'number_of_matches_played'
]

# Venue block: 13 stats.
venue_features = [
    'runs_scored', 'balls_faced', 'wickets_taken', 'runs_given',
    'balls_thrown', 'boundaries_scored', 'boundaries_given', 'number_of_dismissals',
    'strike_rate', 'economy', 'batting_average', 'fantasy_points', 'number_of_matches_played'
]

# Recent-form block: 14 stats.
matchwise_features = [
    'EWMA Fantasy Points', 'total_points', 'Runs', 'Wickets',
    'Balls_Faced', 'Strike_Rate', 'matches_played', 'Runs_Given',
    'Balls_Thrown', 'Boundaries_Scored', 'Boundaries_Given', 'Number_of_Dismissals',
    'Economy', 'Batting_Average'
]

# Weather block: 3 values.
weather_features = [
    'temperature', 'precipitation', 'wind_speed'
]

# Prefix every name with its block (and, for head-to-head, the opponent slot)
# so that names stay unique across blocks.
all_features = (
    [f"pvp_{slot}_{stat}" for slot in range(12) for stat in pvp_features]
    + [f"venue_{stat}" for stat in venue_features]
    + [f"match_{stat}" for stat in matchwise_features]
    + [f"weather_{stat}" for stat in weather_features]
)

# List every feature next to its column index.
print("All 126 features in x_train:")
for i, feature in enumerate(all_features):
    print(f"Index {i}: {feature}")

# Reload the exported matrices from disk. From here on x_train / y_train are
# DataFrames read from the CSV files rather than the in-memory buffers.
x_train, y_train = (pd.read_csv(csv_name) for csv_name in ('x_train.csv', 'y_train.csv'))

# Label the columns: feature names for the inputs, a single target column.
x_train.columns = all_features
y_train.columns = ['fantasy_points']

# Sanity check of the loaded shapes.
for label, frame in (("\nX", x_train), ("Y", y_train)):
    print(f"{label} shape: {frame.shape}")

# Figures produced below are written into this directory.
import os
os.makedirs('images', exist_ok=True)

All 126 features in x_train:
Index 0: pvp_0_runs_b1_b2
Index 1: pvp_0_balls_b1_b2
Index 2: pvp_0_boundaries_b1_b2
Index 3: pvp_0_dismissals_b1_b2
Index 4: pvp_0_strike_rate_b1_b2
Index 5: pvp_0_economy_b1_b2
Index 6: pvp_0_fantasy_point_p1_p2
Index 7: pvp_0_number_of_matches_played
Index 8: pvp_1_runs_b1_b2
Index 9: pvp_1_balls_b1_b2
Index 10: pvp_1_boundaries_b1_b2
Index 11: pvp_1_dismissals_b1_b2
Index 12: pvp_1_strike_rate_b1_b2
Index 13: pvp_1_economy_b1_b2
Index 14: pvp_1_fantasy_point_p1_p2
Index 15: pvp_1_number_of_matches_played
Index 16: pvp_2_runs_b1_b2
Index 17: pvp_2_balls_b1_b2
Index 18: pvp_2_boundaries_b1_b2
Index 19: pvp_2_dismissals_b1_b2
Index 20: pvp_2_strike_rate_b1_b2
Index 21: pvp_2_economy_b1_b2
Index 22: pvp_2_fantasy_point_p1_p2
Index 23: pvp_2_number_of_matches_played
Index 24: pvp_3_runs_b1_b2
Index 25: pvp_3_balls_b1_b2
Index 26: pvp_3_boundaries_b1_b2
Index 27: pvp_3_dismissals_b1_b2
Index 28: pvp_3_strike_rate_b1_b2
Index 29: pvp_3_economy_b1_b2
Index 30: 

In [24]:
import pickle
import os

# Persist the ordered feature-name list (all_features, built in an earlier cell)
# so the column layout of x_train can be recovered elsewhere.
# NOTE(review): the 'artifacts' directory is created here, but the pickle below is
# written to the current working directory, not into 'artifacts' - confirm which
# location is intended.
os.makedirs('artifacts', exist_ok=True)
with open('all_features.pkl', 'wb') as f:
    pickle.dump(all_features, f)

print("Saved all_features (length =", len(all_features), ") to all_features.pkl")

Saved all_features (length = 126 ) to all_features.pkl


In [None]:
# 1. Distribution of Fantasy Points
# Histogram (50 bins) with a KDE overlay of the target column; the mean and
# median are marked with vertical lines and reported in the legend.
plt.figure(figsize=(10, 6))
sns.histplot(y_train['fantasy_points'], kde=True, bins=50)
plt.title('Distribution of Fantasy Points')
plt.xlabel('Fantasy Points')
plt.ylabel('Frequency')
plt.axvline(y_train['fantasy_points'].mean(), color='r', linestyle='--', label=f'Mean: {y_train["fantasy_points"].mean():.2f}')
plt.axvline(y_train['fantasy_points'].median(), color='g', linestyle='-.', label=f'Median: {y_train["fantasy_points"].median():.2f}')
plt.legend()
# The figure is written to disk and then closed, so nothing is shown inline.
plt.savefig('images/fantasy_points_distribution.png')
plt.close()

# 2. Fantasy Points Moving Average
# Rolling mean over 50 consecutive rows of y_train, plotted against the row index.
# The first 49 positions have no full window and are therefore NaN.
# NOTE(review): each row appears to be one (match, player) sample, so the window
# spans 50 samples rather than 50 matches as the y-label says - confirm the wording.
plt.figure(figsize=(12, 6))
sns.lineplot(x=range(len(y_train)), y=y_train['fantasy_points'].rolling(window=50).mean())
plt.title('Moving Average of Fantasy Points (Window Size = 50)')
plt.xlabel('Sample Index')
plt.ylabel('Fantasy Points (50-match Moving Average)')
plt.savefig('images/fantasy_points_moving_avg.png')
plt.close()

# 3. Correlations between features and target
# Only the venue and recent-form feature blocks are included (27 columns); the
# head-to-head and weather columns are left out of this plot.
# `important_features` holds the unprefixed names and is not used again in this cell.
important_features = venue_features + matchwise_features
# Positions of the prefixed names ("venue_*", "match_*") inside all_features.
important_feature_indices = [all_features.index(f"venue_{f}") for f in venue_features] + \
                           [all_features.index(f"match_{f}") for f in matchwise_features]

X_subset = x_train.iloc[:, important_feature_indices]
feature_names = [all_features[i] for i in important_feature_indices]
X_subset.columns = feature_names

# Add fantasy points to this subset
# (the target index is reset so the two frames are joined on a fresh 0..n-1 index)
correlation_df = pd.concat([X_subset, y_train.reset_index(drop=True)], axis=1)

# Calculate and visualize correlations
plt.figure(figsize=(16, 14))
corr = correlation_df.corr()
# Boolean mask that is True on and above the diagonal; passed to heatmap() so
# that only the lower triangle of the symmetric matrix is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))
# NOTE(review): with annot=False no cell labels are drawn, so fmt presumably has no effect.
sns.heatmap(corr, mask=mask, cmap='coolwarm', vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, annot=False, fmt='.2f')
plt.title('Correlation Heatmap between Key Features and Fantasy Points')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('images/correlation_heatmap.png')
plt.close()