# RFP: Betting on the Bachelor

## Project Overview
You are invited to submit a proposal that answers the following question:

### Who will win season 29 of the Bachelor?

*All proposals must be submitted by **1/15/25 at 11:59 PM**.*

## Required Proposal Components

### 1. Data Description
In the code cell below, read in the data you plan on using to train and test your model. Call `info()` once you have read the data into a dataframe. Consider using some or all of the following sources:
- [Scrape Fandom Wikis](https://bachelor-nation.fandom.com/wiki/The_Bachelor) or [the official Bachelor website]('https://bachelornation.com/shows/the-bachelor/')
- [Ask ChatGPT to genereate it](https://chatgpt.com/)
- [Read in csv files like this](https://www.kaggle.com/datasets/brianbgonz/the-bachelor-contestants?select=contestants.csv)

*Note, a level 5 dataset contains at least 1000 rows of non-null data. A level 4 contains at least 500 rows of non-null data.*

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

# Mapping of week number to place
week_to_place = {
    1: 23, 2: 20, 3: 18, 4: 13, 5: 11, 6: 7, 7: 5, 8: 4, 9: 3, 10: 2
}

# Define a function to assign scores based on the place and outcome
def calculate_score(place, outcome):
    # If the outcome contains "Winner", the score should always be 1
    score_map = {
        1: 1,  # Winner (score 1)
        2: 2,  # Runner-up (score 2)
        3: 5,  # Highest week of season (score 5)
        4: 4,  # Highest week -1 (score 4)
        5: 3,  # Highest week -2 (score 3)
        6: 2,  # Highest week -3 (score 2)
        7: 1,  # Highest week -4 to -6 (score 1)
    }
    return score_map.get(place, 0)  # Default score if not in the map

def calculate_place(week_num, outcome):
    # Handling specific cases for "Winner" and "Runner-up" outcomes
    if "Winner" in outcome:
        return 1  # Winner
    elif "Runner-up" in outcome or "Runner Up" in outcome:
        return 2  # Runner-up or Runner Up
    elif week_num == 1:
        return 1  # Winner
    elif week_num == 2:
        return 2  # Highest week of season
    elif week_num == 3:
        return 3  # -1 from highest week
    elif week_num == 4:
        return 4  # -2 from highest week
    elif 5 <= week_num <= 8:
        return 5  # -3 from highest week (Place 5-8)
    elif 9 <= week_num <= 15:
        return 6  # -4 to -6 from highest week (Place 9-15)
    else:
        return 7  # Anything lower (Place 16-25)

def scrape_contestants_from_season(url, bachelor_age, show_type):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'class': 'wikitable'})
        if not table:
            return None
        bachelor_name = soup.find('span', {'class': 'bachelor_name'}).get_text(strip=True) if soup.find('span', {'class': 'bachelor_name'}) else 'Unknown'
        contestants = []
        for row in table.find_all('tr')[1:]:
            cols = row.find_all('td')
            if len(cols) >= 5:
                name, age, hometown, occupation, outcome = [col.get_text(strip=True) for col in cols[:5]]
                outcome = re.sub(r'\(Week \d+\)', '', re.sub(r'\(quit\)', '', outcome))
                week_num = re.search(r'Week (\d+)', outcome)
                
                # Initialize place and score with default values
                place = np.nan
                score = np.nan
                
                if week_num:
                    week_num = int(week_num.group(1))
                    place = calculate_place(week_num, outcome)  # Calculate place based on week number
                    score = calculate_score(place, outcome)  # Calculate score based on place
                
                # If "Winner" is in the outcome, force score to 1
                if "Winner" in outcome:
                    score = 1
                    place = 1  # Force place to 1 for Winner
                
                # Ensure place and score are assigned correctly
                try:
                    bachelor_age_int, contestant_age_int = int(bachelor_age), int(age)
                    age_difference = bachelor_age_int - contestant_age_int
                except ValueError:
                    age_difference = None
                
                contestants.append([name, age, hometown, occupation, outcome, place, score, age_difference, bachelor_name, show_type])
        return contestants
    except requests.RequestException:
        return None

def scrape_seasons(start_season, end_season, skip_seasons=[], bachelor_ages_df=None, bachelorette_ages_df=None, show_type="Bachelor"):
    all_contestants = []
    for season in tqdm(range(start_season, end_season + 1), desc=f"Scraping {show_type} Seasons", ncols=100):
        if season in skip_seasons:
            continue
        season_url = f"https://en.wikipedia.org/wiki/The_{show_type}_(American_TV_series)_season_{season}"
        bachelor_age = 'Unknown'
        if show_type == "Bachelor" and bachelor_ages_df is not None and not bachelor_ages_df.empty:
            bachelor_age = bachelor_ages_df.loc[bachelor_ages_df['Season'] == season, 'Age'].values[0]
        elif show_type == "Bachelorette" and bachelorette_ages_df is not None and not bachelorette_ages_df.empty:
            season_data = bachelorette_ages_df.loc[bachelorette_ages_df['Season'] == season, 'Age']
            if not season_data.empty:
                bachelor_age = season_data.values[0]
            else:
                bachelor_age = 'Unknown'
        contestants = scrape_contestants_from_season(season_url, bachelor_age, show_type)
        if contestants:
            all_contestants.extend([[season] + contestant for contestant in contestants])
    df = pd.DataFrame(all_contestants, columns=['Season', 'Name', 'Age', 'Hometown', 'Occupation', 'Outcome', 'Place', 'Score', 'Age Difference', 'Bachelor Name', 'Show'])
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
    return df

# Load the CSV files for Bachelor and Bachelorette ages
bachelor_ages_df = pd.read_csv('Bachelors.csv')
bachelorette_ages_df = pd.read_csv('Bachelorettes.csv')

# Check if the CSV files were loaded successfully
if bachelor_ages_df is None or bachelorette_ages_df is None:
    print("Error loading CSV files!")
else:
    print(bachelor_ages_df.head())
    print(bachelorette_ages_df.head())

# Scrape the seasons for Bachelor and Bachelorette
df_bachelor = scrape_seasons(1, 29, skip_seasons=[3, 4, 6, 7, 8], bachelor_ages_df=bachelor_ages_df, show_type="Bachelor")
df_bachelorette = scrape_seasons(1, 21, skip_seasons=[16, 19], bachelorette_ages_df=bachelorette_ages_df, show_type="Bachelorette")

# Combine the data for Bachelor and Bachelorette
df_combined = pd.concat([df_bachelor, df_bachelorette], ignore_index=True)

# Save the combined data to a CSV file
df_combined.to_csv('contestants_combined.csv', index=False)

# Print the first few rows of the combined data
print(df_combined.head())


   Season          Name   Age                   Hometown  \
0       1   Alex Michel  31.0  Charlottesville, Virginia   
1       2  Aaron Buerge  28.0           Butler, Missouri   
2       3           NaN   NaN                        NaN   
3       4           NaN   NaN                        NaN   
4       5  Jesse Palmer  25.0            Nepean, Ontario   

              Occupation  
0  management consultant  
1                 Banker  
2                    NaN  
3                    NaN  
4        Football Player  
   Season               Name   Age               Hometown  \
0       1        Trista Rehn  29.0  Indianapolis, Indiana   
1       2  Meredith Phillips  28.0      Beaverton, Oregon   
2       3   Jennifer Schefft  28.0           Mentor, Ohio   
3       4      DeAnna Pappas  26.0        Newnan, Georgia   
4       5     Jillian Harris  29.0   Peace River, Alberta   

                     Occupation  
0  pediatric physical therapist  
1                 Makeup artist  
2       

Scraping Bachelor Seasons: 100%|████████████████████████████████████| 29/29 [00:06<00:00,  4.55it/s]
Scraping Bachelorette Seasons: 100%|████████████████████████████████| 21/21 [00:05<00:00,  4.17it/s]

   Season             Name   Age              Hometown  \
0       1     Amanda Marsh  23.0       Chanute, Kansas   
1       1      Trista Rehn  29.0   St. Louis, Missouri   
2       1   Shannon Oliver  24.0         Dallas, Texas   
3       1  Kimberly Karels  24.0        Tempe, Arizona   
4       1     Cathy Grimes  22.0  Terre Haute, Indiana   

                        Occupation    Outcome  Place  Score  Age Difference  \
0                    Event Planner     Winner    1.0    1.0             8.0   
1                Miami Heat Dancer  Runner-up    NaN    NaN             2.0   
2  Financial Management Consultant     Week 5    5.0    3.0             7.0   
3                            Nanny     Week 4    4.0    4.0             7.0   
4                 Graduate Student     Week 3    3.0    5.0             9.0   

  Bachelor Name      Show  
0       Unknown  Bachelor  
1       Unknown  Bachelor  
2       Unknown  Bachelor  
3       Unknown  Bachelor  
4       Unknown  Bachelor  





In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df_cleaned = pd.read_csv('cleaned_contestants.csv')

# Remove any non-letter characters (except spaces) from the 'Name' field
df_cleaned['Name'] = df_cleaned['Name'].str.replace(r'[^a-zA-Z ]', '', regex=True)

label_encoder = LabelEncoder()
df_cleaned['Hometown'] = label_encoder.fit_transform(df_cleaned['Hometown'].fillna('Unknown'))
df_cleaned['Occupation'] = label_encoder.fit_transform(df_cleaned['Occupation'].fillna('Unknown'))

X = df_cleaned[['Age', 'Hometown', 'Occupation', 'Age Difference']].copy()

X['Age'] = X['Age'].fillna(X['Age'].median())
X['Age Difference'] = X['Age Difference'].fillna(X['Age Difference'].median())

X.replace([float('inf'), -float('inf')], float('nan'), inplace=True)
X.fillna(X.median(), inplace=True)

assert not X.isnull().any().any(), "There are still NaN values in the features."
assert not (X == float('inf')).any().any() and not (X == -float('inf')).any().any(), "There are still infinite values in the features."

# Save the cleaned data back to 'cleaned_contestants.csv'
df_cleaned.to_csv('cleaned_contestants.csv', index=False)

print("X is ready for training!")


X is ready for training!


### 2. Training Your Model
In the cell seen below, write the code you need to train a linear regression model. Make sure you display the equation of the plane that best fits your chosen data at the end of your program. 

*Note, level 5 work trains a model using only the standard Python library and Pandas. A level 5 model is trained with at least two features, where one of the features begins as a categorical value (e.g. occupation, hometown, etc.). A level 4 uses external libraries like scikit or numpy.*

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

df_cleaned = pd.read_csv('cleaned_contestants.csv')

df_cleaned['Place'].fillna(df_cleaned['Place'].mode()[0], inplace=True)

label_encoder = LabelEncoder()
for col in ['Hometown', 'Occupation']:
    df_cleaned[col] = label_encoder.fit_transform(df_cleaned[col].astype(str).fillna('Unknown'))

for col in ['Age', 'Age Difference']:
    df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
    df_cleaned[col].fillna(df_cleaned[col].median(), inplace=True)

class_counts = df_cleaned['Place'].value_counts()
valid_classes = class_counts[class_counts > 1].index
df_cleaned_filtered = df_cleaned[df_cleaned['Place'].isin(valid_classes)]

X = df_cleaned_filtered[['Age', 'Hometown', 'Occupation', 'Age Difference']]
y = df_cleaned_filtered['Place']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

class_weights = dict(enumerate(pd.Series(y_train).value_counts(normalize=True).to_dict()))
model = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=15, min_samples_split=5, class_weight='balanced')

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=1))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.09
Classification Report:
              precision    recall  f1-score   support

         1.0       0.29      0.33      0.31        12
         2.0       0.00      0.00      0.00         9
         3.0       0.14      0.17      0.15         6
         4.0       0.20      0.14      0.17         7
         5.0       0.25      0.12      0.17         8
         6.0       0.00      0.00      0.00         4
         7.0       0.13      0.22      0.17         9
         8.0       0.00      0.00      0.00         3
         9.0       0.00      0.00      0.00         4
        10.0       0.00      0.00      0.00         1
        11.0       0.00      0.00      0.00         9
        12.0       0.00      0.00      0.00         2
        13.0       0.00      0.00      0.00         9
        14.0       0.00      0.00      0.00         1
        15.0       0.00      0.00      0.00         2
        16.0       0.00      0.00      0.00         1
        17.0       0.00      0.00      0.00

### 3. Testing Your Model
In the cell seen below, write the code you need to test your linear regression model. 

*Note, a model is considered a level 5 if it achieves at least 60% prediction accuracy or achieves an RMSE of 2 weeks or less.*

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Load cleaned dataset
df_cleaned = pd.read_csv('cleaned_contestants.csv')

# Preprocessing as before
df_cleaned['Place'].fillna(df_cleaned['Place'].mode()[0], inplace=True)

# Initialize LabelEncoder
label_encoder = LabelEncoder()
place_encoder = LabelEncoder()

# Fit the encoder on 'Place', 'Hometown', and 'Occupation' from the training data
df_cleaned['Place'] = place_encoder.fit_transform(df_cleaned['Place'].astype(str))
df_cleaned['Hometown'] = label_encoder.fit_transform(df_cleaned['Hometown'].astype(str).fillna('Unknown'))
df_cleaned['Occupation'] = label_encoder.fit_transform(df_cleaned['Occupation'].astype(str).fillna('Unknown'))

# Handle Age and Age Difference
for col in ['Age', 'Age Difference']:
    df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
    df_cleaned[col].fillna(df_cleaned[col].median(), inplace=True)

# Filter classes based on valid counts in 'Place'
class_counts = df_cleaned['Place'].value_counts()
valid_classes = class_counts[class_counts > 1].index
df_cleaned_filtered = df_cleaned[df_cleaned['Place'].isin(valid_classes)]

# Features and target (exclude 'Place' for prediction)
X = df_cleaned_filtered[['Age', 'Hometown', 'Occupation', 'Age Difference']]
y = df_cleaned_filtered['Place']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train the model
model = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=15, min_samples_split=5, class_weight='balanced')
model.fit(X_train, y_train)

# For Season 29, load the data
season_29_data = pd.read_csv('cleaned_contestants.csv')

# Filter data for Season 29 and 'Bachelor' Show
season_29_data = season_29_data[(season_29_data['Season'] == 29) & (season_29_data['Show'] == 'Bachelor')]

# Preprocess Season 29 data (same steps as above)
# Fit the LabelEncoder on the 'Hometown' and 'Occupation' columns for the new data (using the training encoder)
season_29_data['Hometown'] = label_encoder.transform(season_29_data['Hometown'].astype(str).fillna('Unknown'))
season_29_data['Occupation'] = label_encoder.transform(season_29_data['Occupation'].astype(str).fillna('Unknown'))

# For unseen labels in Season 29, apply a default encoding
season_29_data['Hometown'] = season_29_data['Hometown'].apply(lambda x: x if x in label_encoder.classes_ else -1)  # Assigning -1 for unseen
season_29_data['Occupation'] = season_29_data['Occupation'].apply(lambda x: x if x in label_encoder.classes_ else -1)  # Assigning -1 for unseen

# Replace unseen values with a placeholder (optional)
season_29_data['Hometown'].replace(-1, label_encoder.transform(['Unknown'])[0], inplace=True)
season_29_data['Occupation'].replace(-1, label_encoder.transform(['Unknown'])[0], inplace=True)

# Handle Age and Age Difference
for col in ['Age', 'Age Difference']:
    season_29_data[col] = pd.to_numeric(season_29_data[col], errors='coerce')
    season_29_data[col].fillna(season_29_data[col].median(), inplace=True)

# Handle unseen labels in 'Place' for Season 29 data
season_29_data['Place'] = place_encoder.transform(season_29_data['Place'].astype(str))

# For unseen labels in Place, handle them gracefully
season_29_data['Place'] = season_29_data['Place'].apply(
    lambda x: x if x in place_encoder.classes_ else -1  # Or apply a custom class for unseen labels
)

# Features for prediction (exclude 'Place' as it's the target we want to predict)
X_season_29 = season_29_data[['Age', 'Hometown', 'Occupation', 'Age Difference']]

# Predict 'Place' for Season 29 contestants
season_29_data['Predicted Place'] = model.predict(X_season_29)

# Output the predictions for Season 29 contestants
print(season_29_data[['Name', 'Predicted Place']])

# Optionally compare predictions to actual labels if available:
# print(f"Accuracy: {accuracy_score(season_29_data['Place'], season_29_data['Predicted Place']):.2f}")
# print("Classification Report:")
# print(classification_report(season_29_data['Place'], season_29_data['Predicted Place'], zero_division=1))


ValueError: y contains previously unseen labels: '475'

### 4. Final Answer

In the first cell seen below, state the name of your predicted winner. 
In the second cell seen below, justify your prediction using an evaluation technique like RMSE or percent accuracy.

#### State the name of your predicted winner here.
Zoe McGrady

#### Justify your prediction here.
Using my model, she has the highest prediction, although my model only thinks she'll place third.

Attempted steps to up model prediction:
Added scoring system instead of attempting to predict the place.
Added more data
