# STEP1: Data scraping 
The following code will scrap all the player's injury list from season 2003-2023

In [None]:
import csv
import requests
from bs4 import BeautifulSoup

In [5]:
df = pd.DataFrame()
injuries_data = []
for i in range(0,916):
    url_string = "https://www.prosportstransactions.com/baseball/Search/SearchResults.php?Player=&Team=&BeginDate=2003-01-01&EndDate=2023-11-15&InjuriesChkBx=yes&submit=Search&start="+str(25* i )
    req = requests.get(url_string)
    soup = BeautifulSoup(req.content, 'lxml')
    for item in soup.find_all("tr", {"align":"left"}):# Code for each individual page to capture data
        raw_text = item.text.strip().split("\n")
        injuries_data.append(raw_text)

df = pd.DataFrame(injuries_data)
df.head()

Unnamed: 0,0,1,2,3,4
0,2003-03-30,Astros,,• Carlos Hernandez (Enrique),placed on 15-day DL with shoulder injury
1,2003-04-01,Marlins,,• A.J. Burnett,reconstructive surgery on right elbow (date a...
2,2003-05-01,Blue Jays,,• Justin Miller (Mark),shoulder injury (DTD)
3,2003-05-04,Athletics,,• Ramon Hernandez (Jose),thumb injury (DTD)
4,2003-05-06,Athletics,• Ramon Hernandez (Jose),,returned to lineup


In [6]:
df.columns = ['Date','Team','Acquired','Relinquished','Notes']
df.head()

Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
0,2003-03-30,Astros,,• Carlos Hernandez (Enrique),placed on 15-day DL with shoulder injury
1,2003-04-01,Marlins,,• A.J. Burnett,reconstructive surgery on right elbow (date a...
2,2003-05-01,Blue Jays,,• Justin Miller (Mark),shoulder injury (DTD)
3,2003-05-04,Athletics,,• Ramon Hernandez (Jose),thumb injury (DTD)
4,2003-05-06,Athletics,• Ramon Hernandez (Jose),,returned to lineup


In [14]:
df.shape[0]

22862

In [26]:
df

Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
0,2003-03-30,Astros,,• Carlos Hernandez (Enrique),placed on 15-day DL with shoulder injury
1,2003-04-01,Marlins,,• A.J. Burnett,reconstructive surgery on right elbow (date a...
2,2003-05-01,Blue Jays,,• Justin Miller (Mark),shoulder injury (DTD)
3,2003-05-04,Athletics,,• Ramon Hernandez (Jose),thumb injury (DTD)
4,2003-05-06,Athletics,• Ramon Hernandez (Jose),,returned to lineup
...,...,...,...,...,...
22857,2023-10-02,Brewers,,• Brandon Woodruff,right shoulder injury (out indefinitely)
22858,2023-10-02,Yankees,,• Jasson Dominguez,"ligament replacement (""Tommy John"") surgery o..."
22859,2023-10-07,Astros,,• Kendall Graveman,discomfort in right shoulder (out for season)
22860,2023-10-07,Orioles,,• John Means,sore elbow (out indefinitely)


## STEP 2: Data cleaning

In [25]:
df.to_csv('injury.csv', index=False)

In [15]:
# 1. Keep only the columns we want
df_cleaned = df[['Date', 'Relinquished', 'Notes']]

# 2. Replace all content in the 'Notes' column with 1
df_cleaned['Notes'] = 1

# 3. Rename the 'Relinquished' column to 'Name'
df_cleaned = df_cleaned.rename(columns={'Relinquished': 'Name'})

# 4. Convert 'Date' to just the year and rename the column to 'Year'
df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date']).dt.year
df_cleaned = df_cleaned.rename(columns={'Date': 'Year'})

print(df_cleaned)


       Year                                   Name  Notes
0      2003           • Carlos Hernandez (Enrique)      1
1      2003                         • A.J. Burnett      1
2      2003                 • Justin Miller (Mark)      1
3      2003               • Ramon Hernandez (Jose)      1
4      2003                                             1
...     ...                                    ...    ...
22857  2023                     • Brandon Woodruff      1
22858  2023                     • Jasson Dominguez      1
22859  2023                     • Kendall Graveman      1
22860  2023                           • John Means      1
22861  2023   • Jose Garcia / (Jose) Adolis Garcia      1

[22862 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Notes'] = 1


In [22]:
# 1. Remove the leading dot and any spaces after it
df_cleaned['Name'] = df_cleaned['Name'].str.replace(r'^\ • \s*', '', regex=True)

# 2. Remove parentheses with any content inside
df_cleaned['Name'] = df_cleaned['Name'].str.replace(r'\s*\([^)]*\)', '', regex=True)

# 3. For names with a "/", keep only the part after the slash
df_cleaned['Name'] = df_cleaned['Name'].str.split('/').str[-1]

print(df_cleaned)


       Year              Name  Notes
0      2003  Carlos Hernandez      1
1      2003      A.J. Burnett      1
2      2003     Justin Miller      1
3      2003   Ramon Hernandez      1
4      2003                        1
...     ...               ...    ...
22857  2023  Brandon Woodruff      1
22858  2023  Jasson Dominguez      1
22859  2023  Kendall Graveman      1
22860  2023        John Means      1
22861  2023     Adolis Garcia      1

[22862 rows x 3 columns]


In [32]:
# After dropping Name with NA, white space still appears
df2 = df_cleaned[df_cleaned['Name'].notna()]
df2

Unnamed: 0,Year,Name,Notes
0,2003,Carlos Hernandez,1
1,2003,A.J. Burnett,1
2,2003,Justin Miller,1
3,2003,Ramon Hernandez,1
4,2003,,1
...,...,...,...
22857,2023,Brandon Woodruff,1
22858,2023,Jasson Dominguez,1
22859,2023,Kendall Graveman,1
22860,2023,John Means,1


In [33]:
#remove white space and then dropping Name with NA
#please note that we are not dropping any informations here, Since the original dataframe we have are seperated with Acquired and Relinquished.
# Rows that Name are NAs are rows under Acquired, which we dont need in the beginning anyways.
df_cleaned = df_cleaned[df_cleaned['Name'].apply(lambda x: not (isinstance(x, str) and x.strip() == ''))]
df_cleaned = df_cleaned[df_cleaned['Name'].notna()]
df_cleaned

Unnamed: 0,Year,Name,Notes
0,2003,Carlos Hernandez,1
1,2003,A.J. Burnett,1
2,2003,Justin Miller,1
3,2003,Ramon Hernandez,1
5,2003,Mike Piazza,1
...,...,...,...
22857,2023,Brandon Woodruff,1
22858,2023,Jasson Dominguez,1
22859,2023,Kendall Graveman,1
22860,2023,John Means,1


In [34]:
df_cleaned.to_csv('injury_cleaned.csv', index=False)