In [None]:
import pandas as pd

# Load the weekly time-series dataset from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/injury-prediction-for-competitive-runners/week_approach_maskedID_timeseries.csv")

# Preview the first few rows of the dataset.
print(df.head())

# Summarise columns, dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()

### Data Preprocessing

Next, we'll preprocess the data by dropping unnecessary columns.

In [None]:
# Candidate columns to remove; names missing from the DataFrame are skipped below.
columns_to_drop = [
    'avg training success',
    'min training success',
    'max training success',
    # More columns to drop...
    'rel total kms week 1_2',
]

# Keep only the candidates that are actually present, so drop() cannot raise KeyError.
present_columns = set(df.columns)
columns_to_drop_existing = [name for name in columns_to_drop if name in present_columns]

# Rebind df only when there is something to remove.
if columns_to_drop_existing:
    df = df.drop(columns=columns_to_drop_existing)

# Preview the first rows of the resulting DataFrame.
print(df.head())


### Data Visualization

We'll visualize individual athlete data to gain insights into the patterns.

In [None]:
import matplotlib.pyplot as plt

def plot_individual_data(id, column):
    """Plot one column of a single athlete's records against 'Date'.

    Reads the module-level DataFrame ``df`` and draws with ``plt``.

    Parameters:
        id: Value matched against the 'Athlete ID' column. (The name shadows
            the builtin ``id`` but is kept so keyword callers keep working.)
        column: Name of the column drawn on the y-axis.

    Raises:
        ValueError: If no row of ``df`` has the requested athlete ID.
        KeyError: If 'Athlete ID', 'Date' or ``column`` is not a column of ``df``.
    """
    df0 = df[df['Athlete ID'] == id]
    if df0.empty:
        # An empty selection would otherwise render a blank, misleading figure.
        raise ValueError(f'No rows found for athlete {id!r}')

    # Order by date so the line is drawn chronologically, not in file order.
    df0 = df0.sort_values('Date', kind='stable')

    # Look up both series before creating the figure so that a bad column
    # name fails without leaving an empty figure behind.
    dates = df0['Date']
    values = df0[column]

    plt.figure(figsize=(10, 6))
    plt.plot(dates, values)
    plt.xlabel('Date')
    plt.ylabel(column)
    plt.title(f'Athlete {id} - {column}')
    plt.show()

# Example usage:
plot_individual_data(1, 'total kms')

### Model Training and Evaluation

We'll split the data into features and target variables, then train a K-Nearest Neighbors classifier.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Separate predictors from the target. 'Athlete ID' is an identifier rather
# than a predictor, so it is removed together with the label.
# NOTE(review): every other remaining column (including 'Date', if it is
# still present) is used as a feature — confirm that is intended.
X = df.drop(['injury', 'Athlete ID'], axis=1)
y = df['injury']

# Hold out 30% of the rows for testing. stratify=y keeps the proportion of
# injury / non-injury rows the same in both splits, so a rare class cannot be
# under-represented (or missing) in the test set by chance.
# NOTE(review): rows of the same athlete can still land in both splits;
# consider a group-aware split if that leakage matters — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Feature scaling: fit on the training split only and reuse those statistics
# for the test split, so no test-set information leaks into the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model training: KNN is distance-based, which is why the features were
# standardised above.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Model evaluation on the held-out split. With an imbalanced target, read the
# per-class rows of the report rather than relying on accuracy alone.
y_pred = knn_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

### Conclusion
In this project, we embarked on an exploration of machine learning's potential to predict running injuries using weekly training-load data. Through comprehensive data preprocessing, visualization, and model training, we gained valuable insights into the patterns and trends associated with running injuries. While our K-Nearest Neighbors (KNN) classifier exhibited high accuracy in identifying non-injury cases, its performance in detecting injuries was limited, suggesting areas for further improvement. Future endeavors could involve delving deeper into feature engineering, addressing class imbalance, and exploring alternative models or ensemble methods. By continuing to refine our approaches and collaborating with domain experts, we aim to develop more robust predictive models that can aid in injury prevention and enhance the overall well-being of athletes.