In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np

# Load the dataset.
# NOTE(review): this is an absolute, user-specific Windows path, while the other
# cells in this notebook read '/mnt/data/finalized ml.xlsx'. Confirm which
# location is canonical before re-running on another machine.
data = pd.read_excel(r'C:\Users\ROG\Desktop\finalized ml.xlsx')

# Keep only matches whose target is actually recorded. Filling a missing
# 'HomeTeamGoals' with 0 would invent "zero goal" results, which biases the fit
# and adds zero-valued targets that distort the percentage error below.
labelled = data.dropna(subset=['HomeTeamGoals'])

# Use 'Attendance' as the single feature and 'HomeTeamGoals' as the target.
X = labelled[['Attendance']].fillna(0)  # missing attendance is still imputed with 0
y = labelled['HomeTeamGoals']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the linear regression model on the training split only.
reg = LinearRegression().fit(X_train, y_train)

# Predict on both splits so train and test error can be compared.
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)

# Calculate metrics.
# NOTE(review): MAPE divides by the true value, so per the scikit-learn docs it
# can become arbitrarily large when true values are at or near zero. If the data
# contains genuine 0-goal matches, read MAPE with caution and rely on RMSE / R2.
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Print the results
print(f'Train MSE: {mse_train}, RMSE: {rmse_train}, MAPE: {mape_train}, R2: {r2_train}')
print(f'Test MSE: {mse_test}, RMSE: {rmse_test}, MAPE: {mape_test}, R2: {r2_test}')


ModuleNotFoundError: No module named 'sklearn'

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np

# Load the dataset
file_path = '/mnt/data/finalized ml.xlsx'
data = pd.read_excel(file_path)

# Keep only matches whose target is actually recorded. Filling a missing
# 'HomeTeamGoals' with 0 would invent "zero goal" results, which biases the fit
# and adds zero-valued targets that distort the percentage error below.
labelled = data.dropna(subset=['HomeTeamGoals'])

# Use all relevant numeric features for prediction; missing feature values are
# still imputed with 0.
feature_columns = [
    'Attendance',
    'Year',
    'HomeTeam Keyplayers',
    'AwayTeam Keyplayers',
    'HomeTeam Injuries',
    'AwayTeam Injuries',
]
X_all = labelled[feature_columns].fillna(0)
y = labelled['HomeTeamGoals']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=42)

# Train the linear regression model on the training split only.
reg = LinearRegression().fit(X_train, y_train)

# Predict on both splits so train and test error can be compared.
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)

# Calculate metrics.
# NOTE(review): MAPE divides by the true value, so per the scikit-learn docs it
# can become arbitrarily large when true values are at or near zero. If the data
# contains genuine 0-goal matches, read MAPE with caution and rely on RMSE / R2.
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Print the results
print(f'Train MSE: {mse_train}, RMSE: {rmse_train}, MAPE: {mape_train}, R2: {r2_train}')
print(f'Test MSE: {mse_test}, RMSE: {rmse_test}, MAPE: {mape_test}, R2: {r2_test}')


In [None]:
import pandas as pd
from sklearn.cluster import KMeans

# Imported here because this cell's import block does not provide it.
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = '/mnt/data/finalized ml.xlsx'
data = pd.read_excel(file_path)

# Feature matrix for clustering; missing values are replaced with 0.
X_clustering = data[['Attendance', 'Year', 'HomeTeam Keyplayers', 'AwayTeam Keyplayers', 'HomeTeam Injuries', 'AwayTeam Injuries']].fillna(0)

# K-means assigns points by Euclidean distance, so a column with a much larger
# numeric range than the others dominates the result. Standardise every column
# to zero mean / unit variance so each feature contributes comparably.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clustering)

# Perform K-means clustering with k=2 on the standardised features.
# NOTE: `kmeans` now expects standardised input; apply `scaler.transform`
# before calling `kmeans.predict` on new rows.
kmeans = KMeans(n_clusters=2, random_state=42, n_init="auto").fit(X_scaled)
labels = kmeans.labels_

# Convert the centres back to the original feature units so they stay readable.
centers_original_units = scaler.inverse_transform(kmeans.cluster_centers_)

# Print the cluster labels and centers
print(f'Cluster Labels: {labels}')
print(f'Cluster Centers: {centers_original_units}')


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Imported here because this cell's import block does not provide it.
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = '/mnt/data/finalized ml.xlsx'
data = pd.read_excel(file_path)

# Feature matrix for clustering; missing values are replaced with 0.
X_clustering = data[['Attendance', 'Year', 'HomeTeam Keyplayers', 'AwayTeam Keyplayers', 'HomeTeam Injuries', 'AwayTeam Injuries']].fillna(0)

# K-means and the validity indices below are all distance-based, so a column
# with a much larger numeric range than the others would dominate them.
# Standardise every column so each feature contributes comparably.
X_scaled = StandardScaler().fit_transform(X_clustering)

# Perform K-means clustering with k=2 on the standardised features.
kmeans = KMeans(n_clusters=2, random_state=42, n_init="auto").fit(X_scaled)
labels = kmeans.labels_

# Calculate metrics in the same (standardised) space the model was fitted in.
silhouette = silhouette_score(X_scaled, labels)          # higher is better
ch_score = calinski_harabasz_score(X_scaled, labels)     # higher is better
db_index = davies_bouldin_score(X_scaled, labels)        # lower is better

# Print the results
print(f'Silhouette Score: {silhouette}')
print(f'Calinski-Harabasz Score: {ch_score}')
print(f'Davies-Bouldin Index: {db_index}')


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt

# Imported here because this cell's import block does not provide it.
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = '/mnt/data/finalized ml.xlsx'
data = pd.read_excel(file_path)

# Prepare the data; missing values are replaced with 0.
X_clustering = data[['Attendance', 'Year', 'HomeTeam Keyplayers', 'AwayTeam Keyplayers', 'HomeTeam Injuries', 'AwayTeam Injuries']].fillna(0)

# K-means and all three validity indices are distance-based, so a column with a
# much larger numeric range than the others would dominate them. Standardise
# every column so each feature contributes comparably.
X_scaled = StandardScaler().fit_transform(X_clustering)

# Candidate cluster counts, defined once so the fit loop and the plots cannot
# drift apart. The indices need at most n_samples - 1 clusters, so the range is
# capped for small datasets (unchanged: 2..19 when there are >= 20 rows).
k_values = range(2, min(20, len(X_scaled)))

# Perform K-means for different k values and evaluate metrics
silhouette_scores = []
ch_scores = []
db_indices = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto").fit(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))
    ch_scores.append(calinski_harabasz_score(X_scaled, kmeans.labels_))
    db_indices.append(davies_bouldin_score(X_scaled, kmeans.labels_))

# Plot the metrics: one panel per index, all sharing the same x-axis meaning.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
panels = [
    (silhouette_scores, 'Silhouette Score', 'Silhouette Score vs k'),
    (ch_scores, 'Calinski-Harabasz Score', 'CH Score vs k'),
    (db_indices, 'Davies-Bouldin Index', 'DB Index vs k'),
]

for ax, (scores, y_label, title) in zip(axes, panels):
    ax.plot(k_values, scores, marker='o')
    ax.set_xlabel('Number of clusters (k)')
    ax.set_ylabel(y_label)
    ax.set_title(title)

plt.show()


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Imported here because this cell's import block does not provide it.
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = '/mnt/data/finalized ml.xlsx'
data = pd.read_excel(file_path)

# Prepare the data; missing values are replaced with 0.
X_clustering = data[['Attendance', 'Year', 'HomeTeam Keyplayers', 'AwayTeam Keyplayers', 'HomeTeam Injuries', 'AwayTeam Injuries']].fillna(0)

# Inertia is a sum of squared Euclidean distances, so a column with a much
# larger numeric range than the others would dominate the elbow curve.
# Standardise every column so each feature contributes comparably.
X_scaled = StandardScaler().fit_transform(X_clustering)

# Candidate cluster counts, defined once for both the fit loop and the plot.
# K-means cannot use more clusters than samples, so the range is capped for
# small datasets (unchanged: 2..19 when there are >= 19 rows).
k_values = range(2, min(20, len(X_scaled) + 1))

# Calculate distortions (K-means inertia) for different k values
distortions = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto").fit(X_scaled)
    distortions.append(kmeans.inertia_)

# Plot the elbow plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, distortions, marker='o')
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Distortion')
ax.set_title('Elbow Plot')
plt.show()
