In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [39]:
# Read the cleaned dataset from disk and preview its first rows.
cleaned_csv_path = "Premier_data_cleaned.csv"
df_clean = pd.read_csv(cleaned_csv_path)
df_clean.head()

Unnamed: 0,Season,Team,rank,points,members,foreign_players,mean_age,salaries,spending,Mean_Player_Rating,Wins,Draws,Losses,Goals_For,Goals_Against,Goal_Difference
0,2015-2016,Leicester,1.0,81.0,36.0,18.0,24.7,0,36800000,6.1,23.0,12.0,3.0,68.0,36.0,32.0
1,2015-2016,Arsenal,2.0,71.0,44.0,30.0,22.9,0,24100000,6.5,20.0,11.0,7.0,65.0,36.0,29.0
2,2015-2016,Tottenham,3.0,70.0,34.0,16.0,22.6,0,67000000,6.1,19.0,13.0,6.0,69.0,35.0,34.0
3,2015-2016,Manchester City,4.0,66.0,56.0,41.0,23.2,582000,196320000,6.3,19.0,9.0,10.0,71.0,41.0,30.0
4,2015-2016,Manchester United,5.0,66.0,50.0,27.0,22.3,1840000,146000000,6.0,19.0,9.0,10.0,49.0,35.0,14.0


In [40]:
# Step 1: Create dummy/indicator features for categorical variables.
#
# "Unnamed: 0" holds 0, 1, 2, ... in the preview of df_clean, which looks like
# a row index written out by an earlier to_csv() call rather than a real
# measurement. Left in place it would be encoded, scaled and kept as a model
# feature, so it is removed first. errors="ignore" keeps this cell working if
# the column is not present.
df_features = df_clean.drop(columns=["Unnamed: 0"], errors="ignore")

# get_dummies expands the non-numeric columns into indicator columns and
# leaves numeric columns untouched; drop_first=True omits the first level of
# each encoded column so the indicators are not perfectly collinear.
df_processed = pd.get_dummies(df_features, drop_first=True)

In [41]:
# Step 2: Standardize the magnitude of numeric features using a scaler.
#
# The scaler is fitted on the training rows only and then applied to every
# row, so the means/variances of the held-out rows do not leak into the
# transformation (fitting on the full frame before splitting would do that).
#
# The training rows are identified by running train_test_split on the row
# positions with the same test_size / random_state as the split in Step 3:
# with identical settings and the same number of rows it selects the same rows.
# NOTE: keep test_size and random_state here in sync with the Step 3 split.
row_positions = np.arange(len(df_processed))
train_positions, held_out_positions = train_test_split(
    row_positions, test_size=0.2, random_state=42
)

scaler = StandardScaler()
scaler.fit(df_processed.iloc[train_positions])

# Every column of df_processed is transformed, including 'rank' (used as the
# target in Step 3) and the indicator columns, exactly as before; only the
# statistics used for scaling have changed.
scaled_features = scaler.transform(df_processed)
scaled_df = pd.DataFrame(scaled_features, columns=df_processed.columns)

In [42]:
# Step 3: Split into testing and training datasets.
# 'rank' is assumed to be the target variable; every other column is a feature.
target_column = 'rank'
y = scaled_df[target_column]
X = scaled_df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [43]:
# Re-attach the target to each split and tag the rows with a 'set' column so
# they remain identifiable once both splits live in a single table.
train_data = pd.concat([X_train, y_train], axis=1).assign(set='train')
test_data = pd.concat([X_test, y_test], axis=1).assign(set='test')

# Stack the training rows on top of the test rows.
combined_data = pd.concat([train_data, test_data], axis=0)

# Persist the combined table to CSV without writing the row index.
combined_data.to_csv("combined_data.csv", index=False)

# Report the dimensions of the combined table.
print(f"Combined data shape: {combined_data.shape}")

Combined data shape: (416, 85)
