In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
df = pd.read_excel("Football_Fall23.xlsx",sheet_name='Offense')

# Handling missing values
# Simple imputation for numerical columns with mean
num_imputer = SimpleImputer(strategy='mean')
numerical_cols = df.select_dtypes(include=['float64']).columns
df[numerical_cols] = num_imputer.fit_transform(df[numerical_cols])

# Simple imputation for categorical columns with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

# Encoding the 'QB Comment' column for multiclass classification
le = LabelEncoder()
df['QB Comment Encoded'] = le.fit_transform(df['QB Comment'])

# Selecting predictive columns (excluding the original 'QB Comment' column)
predictive_cols = ['Play Number', 'Series', 'Down', 'Distance', 'Field Position', 'Gain',
                   'Formation', 'Personnel', 'The_Play', 'Pass Concept', 'Run Concept', 'R/P']
predictive_data = df[predictive_cols]

# Encoding other categorical variables
for col in predictive_data.select_dtypes(include=['object']).columns:
    predictive_data[col] = le.fit_transform(predictive_data[col])

# Splitting the data into features (X) and target (y)
X = predictive_data
y = df['QB Comment Encoded']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardizing the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Creating and training the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluating the model
predictions = model.predict(X_test_scaled)
report = classification_report(y_test, predictions)
print(report)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predictive_data[col] = le.fit_transform(predictive_data[col])


TypeError: Encoders require their input to be uniformly strings or numbers. Got ['int', 'str']