In [14]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [15]:
# Location of the input text file and the field separator it uses.
file_path = './data/round-2014-small.txt'
file_delimiter = ';'
# Positional (0-based) indices of the columns to keep, and the short names
# assigned to them, listed in the same order.
target_column_ids = [15, 79, 122]
target_column_names = ['Score', 'GIR', 'Putt']

# Load the whole delimited file into a DataFrame and preview the first five rows.
data = pd.read_csv(file_path, delimiter=file_delimiter)
data.head(5)

Unnamed: 0,Tour,Tournament Year,Tournament #,Permanent Tournament #,Course #,Team ID,Team Number,Player Number,Player Name,Round Number,...,Putting 20'-25'(putts made),Putting 20'-25'(rank),Putting >25'(attempts),Putting >25'(putts made),Putting >25'(rank),Putting > 10' (putts made),Putting > 10' (attempts),Putting > 10' (rank),Total Putts Gained,Unnamed: 173
0,R,2014,10,464,775,,0,1381,"Funk, Fred",1,...,0,24.0,5,0,26.0,1.0,14.0,96.0,.524,
1,R,2014,10,464,775,,0,1706,"Love III, Davis",1,...,0,24.0,3,0,26.0,2.0,11.0,43.0,.618,
2,R,2014,10,464,775,,0,1724,"Maggert, Jeff",1,...,0,24.0,7,0,26.0,2.0,13.0,53.0,1.403,
3,R,2014,10,464,775,,0,1797,"Mediate, Rocco",1,...,0,,9,2,12.0,4.0,14.0,12.0,.904-,
4,R,2014,10,464,775,,0,2239,"Verplank, Scott",1,...,0,24.0,9,0,26.0,1.0,14.0,96.0,.263-,


In [16]:
# Pick the target columns by position, then replace their original headers
# with the short names.
selected_columns = data.iloc[:, target_column_ids]
selected_columns.columns = target_column_names
selected_columns

Unnamed: 0,Score,GIR,Putt
0,75,9,31
1,69,12,28
2,73,11,31
3,73,13,34
4,72,14,33
...,...,...,...
1435,72,10,28
1436,69,14,29
1437,67,11,24
1438,73,15,33


In [17]:
# Bind the trimmed frame to the name the analysis cells use (same object, no copy is made).
data_df = selected_columns

In [None]:
def read_data(file_path, target_column_ids, target_column_names, file_delimiter = ';'):
    """
    Load a delimited text file and keep only the requested columns, renamed.

    Parameters:
    - file_path: str, location of the text file
    - target_column_ids: list of int, positional indices of the columns to keep
    - target_column_names: list of str, new names for the kept columns, given
      in the same order as target_column_ids
    - file_delimiter: str, field separator used in the file, ';' by default

    Returns:
    - DataFrame holding just the selected columns under their new names
    """
    # parse the full file, then narrow it down to the wanted columns by position
    subset = pd.read_csv(file_path, delimiter=file_delimiter).iloc[:, target_column_ids]
    # relabel the kept columns
    subset.columns = target_column_names

    return subset

In [18]:
# Design matrix: the two predictors plus the intercept column ('const') added by statsmodels.
X = sm.add_constant(data_df[['GIR', 'Putt']])
X

Unnamed: 0,const,GIR,Putt
0,1.0,9,31
1,1.0,12,28
2,1.0,11,31
3,1.0,13,34
4,1.0,14,33
...,...,...,...
1435,1.0,10,28
1436,1.0,14,29
1437,1.0,11,24
1438,1.0,15,33


In [19]:
# Response variable for the regression: the Score column.
y = data_df['Score']
y

0       75
1       69
2       73
3       73
4       72
        ..
1435    72
1436    69
1437    67
1438    73
1439    72
Name: Score, Length: 1440, dtype: int64

In [20]:
# Fit ordinary least squares of y on the design matrix X and display the fit report.
model = sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,Score,R-squared:,0.826
Model:,OLS,Adj. R-squared:,0.826
Method:,Least Squares,F-statistic:,3422.0
Date:,"Sat, 03 Aug 2024",Prob (F-statistic):,0.0
Time:,17:28:59,Log-Likelihood:,-2503.7
No. Observations:,1440,AIC:,5013.0
Df Residuals:,1437,BIC:,5029.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,58.7469,0.437,134.427,0.000,57.890,59.604
GIR,-1.2705,0.017,-73.425,0.000,-1.304,-1.237
Putt,0.9464,0.016,60.892,0.000,0.916,0.977

0,1,2,3
Omnibus:,145.04,Durbin-Watson:,1.544
Prob(Omnibus):,0.0,Jarque-Bera (JB):,260.332
Skew:,0.672,Prob(JB):,2.95e-57
Kurtosis:,4.591,Cond. No.,388.0


In [None]:
def OLS_regression(data, X_columns, y_column):
    """
    Fit an ordinary least squares model, with an intercept, on columns of a DataFrame

    Parameters:
    - data: DataFrame holding both the predictors and the response
    - X_columns: list of str, names of the independent-variable columns
    - y_column: str, name of the target-variable column

    Returns:
    - Summary of the fitted regression results
    """
    # design matrix: the chosen predictors plus a constant (intercept) column
    design_matrix = sm.add_constant(data[X_columns])
    response = data[y_column]

    # fit and report
    fitted = sm.OLS(response, design_matrix).fit()
    return fitted.summary()

In [21]:
# Seed NumPy's global random generator so the random draws made through
# np.random are reproducible when the notebook is run top to bottom.
np.random.seed(42)

In [23]:
# Extract the Score column as a plain NumPy array to sample from.
scores = data_df['Score'].values
scores

array([75, 69, 73, ..., 67, 73, 72])

In [29]:
# Seed inside the sampling cell itself: re-running this cell (alone or out of
# order) then always reproduces the same draws instead of continuing from
# wherever the global generator happens to be.
np.random.seed(42)
# 10,000 trials; each row holds 4 scores drawn with replacement from the observed scores.
random_samples = np.random.choice(scores, size=(10000, 4), replace=True)
# Fraction of trials whose 4-score total is 270 or lower.
prob_estimated = np.mean(random_samples.sum(axis=1) <= 270)

In [30]:
# Standard error of a proportion estimated from n independent trials: sqrt(p * (1 - p) / n).
# n is read from the simulated array (one row per trial) rather than repeating the
# literal trial count, so this cell stays correct if the number of trials is changed.
standard_error = np.sqrt(prob_estimated * (1 - prob_estimated) / random_samples.shape[0])
standard_error

np.float64(0.0017967871326342471)

In [None]:
def monte_carlo_simulation(data, target_column, n_trials=10000, sample_count=4, threshold=270, random_seed=42):
    """
    Perform a Monte Carlo simulation to estimate the probability that the sum of a given number of
    values drawn with replacement from the target column is less than or equal to the threshold

    The draws come from a private generator seeded with random_seed, so calling this function
    neither reseeds nor advances NumPy's global random state. For a given seed the draws are the
    same ones that np.random.seed(seed) followed by np.random.choice(...) would produce.

    Parameters:
    - data: DataFrame that contains the target column
    - target_column: str, name of the target column
    - n_trials: int, number of simulation trials (must be at least 1)
    - sample_count: int, number of values drawn in each simulation trial
    - threshold: int, summation threshold (inclusive)
    - random_seed: int, random seed for reproducibility

    Returns:
    - Two float numbers: the estimated probability and its standard error,
      sqrt(p * (1 - p) / n_trials)

    Raises:
    - ValueError: if n_trials is less than 1
    """
    # With zero trials the estimate would be a mean over nothing (NaN); fail loudly instead
    if n_trials < 1:
        raise ValueError("n_trials must be at least 1")

    # Private legacy generator: same stream as the global np.random functions for this seed,
    # but the caller's global random state is left untouched
    rng = np.random.RandomState(random_seed)

    # Values to sample from
    scores = data[target_column].values

    # One row per trial, sample_count draws (with replacement) per row
    random_samples = rng.choice(scores, size=(n_trials, sample_count), replace=True)

    # Total of each trial
    sum_of_samples = random_samples.sum(axis=1)

    # Share of trials whose total is less than or equal to the threshold
    prob_estimated = np.mean(sum_of_samples <= threshold)

    # Standard error of the estimated proportion
    standard_error = np.sqrt(prob_estimated * (1 - prob_estimated) / n_trials)

    return prob_estimated, standard_error

In [31]:
# Print the plain-text report: left as a bare expression, the string would be echoed
# as its repr, with literal "\n" escapes instead of line breaks.
print(model.summary().as_text())

