#### Exercise 2 - Scrape data using read_html() and build a team winning prediction model ####

- Use the given url to scrape the tables data
- Clean the data by preprocessing (remove W-L%) and converting the required columns to numeric
- Create a label column suitable for classification (1s and 0s).
- Use a classification model to predict win or lose this season

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import sqlite3

import joblib

In [3]:
# Scrape the data from Pro-Football-Reference
# The read_html function returns a list of all tables on the page.
# The AFC and NFC tables are the first two.
url = 'https://www.pro-football-reference.com/years/2023/'
dfs = pd.read_html(url)

# The first table is AFC, the second is NFC
df_afc = dfs[0]
df_nfc = dfs[1]

print(f"NFC information:\n{df_nfc.info}\n\nDFC information{df_afc.info}")

NFC information:
<bound method DataFrame.info of                        Tm          W          L       W-L%         PF  \
0                NFC East   NFC East   NFC East   NFC East   NFC East   
1         Dallas Cowboys*         12          5       .706        509   
2    Philadelphia Eagles+         11          6       .647        433   
3         New York Giants          6         11       .353        266   
4   Washington Commanders          4         13       .235        329   
5               NFC North  NFC North  NFC North  NFC North  NFC North   
6          Detroit Lions*         12          5       .706        461   
7      Green Bay Packers+          9          8       .529        383   
8       Minnesota Vikings          7         10       .412        344   
9           Chicago Bears          7         10       .412        360   
10              NFC South  NFC South  NFC South  NFC South  NFC South   
11  Tampa Bay Buccaneers*          9          8       .529        348   
12

In [4]:
# Combine and Clean the Data
# Combine the two conference tables into one DataFrame
df = pd.concat([df_afc, df_nfc])
df = df[df['W'] != 'W'].reset_index(drop=True)
df['Tm'] = df['Tm'].str.replace('*', '').str.replace('+', '').str.strip()

# Convert stat columns to numeric types for modeling
cols_to_numeric = ['W', 'L', 'PF', 'PA', 'PD', 'MoV', 'SoS'] 
for col in cols_to_numeric:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df.dropna(inplace=True)


print(df,'\n',df.info())


<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, 1 to 39
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Tm      32 non-null     object 
 1   W       32 non-null     float64
 2   L       32 non-null     float64
 3   W-L%    32 non-null     object 
 4   PF      32 non-null     float64
 5   PA      32 non-null     float64
 6   PD      32 non-null     float64
 7   MoV     32 non-null     float64
 8   SoS     32 non-null     float64
 9   SRS     32 non-null     object 
 10  OSRS    32 non-null     object 
 11  DSRS    32 non-null     object 
dtypes: float64(7), object(5)
memory usage: 3.2+ KB
                       Tm     W     L  W-L%     PF     PA     PD   MoV  SoS  \
1           Buffalo Bills  11.0   6.0  .647  451.0  311.0  140.0   8.2 -1.8   
2          Miami Dolphins  11.0   6.0  .647  496.0  391.0  105.0   6.2 -1.8   
3           New York Jets   7.0  10.0  .412  268.0  355.0  -87.0  -5.1 -0.6   
4    New England

In [5]:
# Engineer the Target Column (Our Label)
# A winning season is when Wins > Losses
df['Winning_Season'] = (df['W'] > df['L']).astype(int)

In [13]:
# Save cleaned combined data to SQLite DB as table `stats`

conn = sqlite3.connect('NFL.db')
df.to_sql('stats', conn, if_exists='replace', index=False)
conn.commit()

print(pd.read_sql_query('SELECT COUNT(*) AS n_rows FROM stats', conn))

# Close the connection
conn.close()
print("Done. DB at", 'NFL.db')



   n_rows
0      32
Done. DB at NFL.db


In [7]:
# Prepare Data for Scikit-learn
# Select features (X) and the target (y)
# We'll use stats that describe team strength, but not the direct win/loss record.

features = ['PF', 'PA', 'PD', 'SoS']
target = 'Winning_Season'

X = df[features]
y = df[target]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42, stratify=y)

In [8]:
#  Build and Train the Model
model = LogisticRegression()
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [9]:
# Evaluate the Model
# Make predictions on the test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("--- NFL Winning Season Prediction ---")
print(f"Data scraped successfully for {len(df)} teams.")
print(f"Features used: {features}")
print(f"Target variable: '{target}'")
print("-" * 35)
print(f"Model: Logistic Regression")
print(f"Model Accuracy on Test Data: {accuracy:.2f}")

--- NFL Winning Season Prediction ---
Data scraped successfully for 32 teams.
Features used: ['PF', 'PA', 'PD', 'SoS']
Target variable: 'Winning_Season'
-----------------------------------
Model: Logistic Regression
Model Accuracy on Test Data: 0.75


In [16]:
# Save the model

joblib.dump(model,'model.pkl')

['model.pkl']

#### Using the model for prediction ####

In [10]:
# Gather new data for a hypothetical 2025 team 'Atlanta_Vipers'

new_team_data = {
    'PF': [465],   # Points For
    'PA': [380],   # Points Against
    'PD': [85],    # Point Differential
    'SoS': [1.5]   # Strength of Schedule
}

# Format the data into a DataFrame with the correct column order
future_team_df = pd.DataFrame(new_team_data, columns=['PF', 'PA', 'PD', 'SoS'])

print("--- New Team Data ---")
print(future_team_df)
print("\n")


# Scale the new data using the EXISTING scaler
# IMPORTANT: Use .transform(), not .fit_transform()!
future_team_scaled = scaler.transform(future_team_df)


# Make the prediction using the EXISTING model
prediction = model.predict(future_team_scaled)
prediction_probability = model.predict_proba(future_team_scaled)  # if you want to get the probability or the confidence of the model


# Interpret the results
print("--- Prediction Result ---")
if prediction[0] == 1:
    print("Prediction: The model predicts a Winning Season (1).")
else:
    print("Prediction: The model predicts a Non-Winning Season (0).")

# The predict_proba method gives the model's confidence
print(f"Model Confidence: {prediction_probability[0][1]*100:.2f}% probability of a winning season.")

--- New Team Data ---
    PF   PA  PD  SoS
0  465  380  85  1.5


--- Prediction Result ---
Prediction: The model predicts a Winning Season (1).
Model Confidence: 97.42% probability of a winning season.
