## Project Model

### 1. Import Necessary Libraries

In [2]:
import pandas as pd
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

### 2. Data Importing and Cleaning

In [8]:
directory = '/Users/yunzheyu/Desktop/DS340/Project/DataSets'

# List all CSV files in the directory
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]

cleaned_data = []  # This will store all the cleaned DataFrames to be combined later

for file in csv_files:
    file_path = os.path.join(directory, file)
    
    # Load the dataset
    data = pd.read_csv(file_path)
    
    # Filter to include only rows where the Event column is 'SBD' and Equipment is 'Raw'
    sbd_raw_data = data[(data['Event'] == 'SBD') & (data['Equipment'] == 'Raw')]
    
    # Drop specified columns
    columns_to_drop = [
        'AgeClass', 'BirthYearClass', 'WeightClassKg', 'Squat4Kg', 'Bench4Kg', 'Deadlift4Kg',
        'Wilks', 'Glossbrenner', 'Goodlift', 'Country', 'State', 'MeetCountry', 
        'MeetState', 'MeetTown', 'Federation', 'ParentFederation', 'MeetName'
    ]
    sbd_raw_data = sbd_raw_data.drop(columns=columns_to_drop, errors='ignore')
    
    # Drop rows where the Place column contains 'DQ', 'NS', or 'G'
    sbd_raw_data = sbd_raw_data[~sbd_raw_data['Place'].isin(['DQ', 'NS', 'G'])]

    # Drop rows where the Tested column is empty
    sbd_raw_data = sbd_raw_data.dropna(subset=['Tested'])
    
    # Save the cleaned data to a new CSV file
    cleaned_file_path = os.path.join(directory, f'cleaned_{file}')
    sbd_raw_data.to_csv(cleaned_file_path, index=False)
    
    # Append the cleaned data to the list for later combination
    cleaned_data.append(sbd_raw_data)

# Concatenate all data into a single DataFrame
combined_cleaned_data = pd.concat(cleaned_data, ignore_index=True)
combined_cleaned_data.to_csv('combined_cleaned_data.csv', index=False)

### Dealing with missing age.

Some Atheletes do have provide their ages. So, we will use the last known competition date and last known age to estimate the missing age value

In [9]:
# Ensure the 'Date' column is in datetime format
data['Date'] = pd.to_datetime(data['Date'])

# Sort data by athlete's name and date of competition
data = data.sort_values(by=['Name', 'Date'])

# Function to estimate ages based on previous known ages and competition dates
def estimate_age(row, last_known_age, last_known_date):
    if pd.notnull(row['Age']):
        return row['Age'], row['Date']
    elif pd.notnull(last_known_age) and pd.notnull(last_known_date):
        # Calculate the year difference
        year_difference = row['Date'].year - last_known_date.year
        # Estimate the current age
        estimated_age = last_known_age + year_difference
        return estimated_age, row['Date']
    return None, row['Date']

# Apply a groupby operation to carry forward the age estimation
last_known_age = None
last_known_date = None

def apply_age_estimation(group):
    global last_known_age, last_known_date
    last_known_age, last_known_date = None, None  # Reset for each group
    results = []
    for index, row in group.iterrows():
        estimated_age, last_known_date = estimate_age(row, last_known_age, last_known_date)
        last_known_age = estimated_age if pd.notnull(estimated_age) else last_known_age
        results.append(estimated_age)
    return pd.Series(results, index=group.index)

data['Estimated_Age'] = data.groupby('Name').apply(apply_age_estimation).reset_index(level=0, drop=True)

# Now handling other computations like Competition Frequency and Year-over-Year Performance

# Competition Frequency
competition_frequency = data.groupby('Name')['MeetName'].count().reset_index()
competition_frequency.rename(columns={'MeetName': 'CompetitionCount'}, inplace=True)

# Year-over-Year Performance Improvements
# Calculate the best total weight per year per athlete
data['CompetitionYear'] = data['Date'].dt.year
yearly_performance = data.groupby(['Name', 'CompetitionYear'])['TotalKg'].max().reset_index()
# Calculate year-over-year improvement
yearly_performance['YoY_Improvement'] = yearly_performance.groupby('Name')['TotalKg'].diff()

# Merge these calculations back to the main dataset
data = data.merge(competition_frequency, on='Name', how='left')
data = data.merge(yearly_performance[['Name', 'CompetitionYear', 'YoY_Improvement']], on=['Name', 'CompetitionYear'], how='left')

# Print the head of the dataset to verify results
print(data[['Name', 'Date', 'Age', 'Estimated_Age']].head())

ValueError: Cannot set a DataFrame with multiple columns to the single column Estimated_Age

### 3. Features Engineering