In [63]:
import pandas as pd
import numpy as np
# Show every column when a DataFrame is displayed. The fully qualified key is
# required: the short form "max_columns" matches more than one option in recent
# pandas releases and raises OptionError.
pd.set_option("display.max_columns", None)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [64]:
# Load the dataset from a CSV file located in the current working directory.
data = pd.read_csv("performance-prediction.csv")
# Bare last expression: the notebook renders the DataFrame as this cell's output.
data

In [65]:
# Print a summary of the frame: column names, non-null counts and dtypes.
data.info()

In [66]:
def preprocessing_inputs(df):
    """Turn the raw frame into scaled train/test features and label series.

    Steps: drop the "Name" column, separate the "Target" column as the label,
    split 70/30 with a fixed seed, fill missing "3PointPercent" values, and
    standardize all feature columns.

    The fill value and the scaler are both derived from the training split
    only, so no statistic computed from test rows reaches the training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Name", "Target" and "3PointPercent".
        The input frame is not modified.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features, keeping the original row index and column names.
    y_train, y_test : pandas.Series
        The "Target" values for the corresponding rows.
    """
    df = df.copy()

    # Drop name column
    df = df.drop("Name", axis = 1)

    # Split data into X, y
    X = df.drop('Target', axis = 1)
    y = df["Target"]

    # Train and test split (done before imputation so the fill value below
    # cannot depend on test rows)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, shuffle = True, random_state = 1)

    # Fill missing values using the mean of the training rows only.
    # Explicit copies avoid writing into slices of X.
    X_train = X_train.copy()
    X_test = X_test.copy()
    three_point_mean = X_train['3PointPercent'].mean()
    X_train['3PointPercent'] = X_train['3PointPercent'].fillna(three_point_mean)
    X_test['3PointPercent'] = X_test['3PointPercent'].fillna(three_point_mean)

    # Scale X (scaler is fit on the training split and reused for the test split)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index = X_train.index, columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index = X_test.index, columns=X.columns)

    return X_train, X_test, y_train, y_test

In [67]:
# Build the scaled train/test feature frames and their label series from the raw data.
X_train, X_test, y_train, y_test = preprocessing_inputs(data)

In [68]:
# Train a logistic-regression classifier with default settings on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the held-out test split, reported as a percentage.
acc = model.score(X_test, y_test)
print("Test Accuracy: {:.2f}%".format(100 * acc))

In [69]:
# Project the scaled training features onto their first two principal
# components. This 2-D view is used only for plotting.
pca = PCA(n_components = 2)
X_reduced = pd.DataFrame(pca.fit_transform(X_train), index = X_train.index, columns=['PCA1', 'PCA2'])

# Cluster in the full (unreduced) feature space.
# random_state pins the centroid initialisation so a re-run yields the same
# labels; n_init is set explicitly because its default differs between
# scikit-learn versions.
kmeans = KMeans(n_clusters = 4, n_init = 10, random_state = 1)
kmeans.fit(X_train)

# One cluster label per training row, aligned on the training index.
clusters = pd.Series(kmeans.labels_, name = "Cluster", index = X_train.index)
# Map the cluster centres into the same 2-D PCA space as the points.
centroids = pca.transform(kmeans.cluster_centers_)

# Combine coordinates, true label and cluster assignment into one frame.
X_reduced = pd.concat([X_reduced, y_train, clusters], axis = 1)
X_reduced

In [70]:
# Scatter the training rows in PCA space, coloured by K-means cluster, and
# overlay the projected cluster centres as larger orange markers.
fig, ax = plt.subplots(figsize = (10, 10))
ax.scatter(X_reduced['PCA1'], X_reduced['PCA2'], c = X_reduced['Cluster'])
ax.scatter(centroids[:,0], centroids[:,1], color = 'orange', s = 200)
ax.set_xlabel("PCA1")
ax.set_ylabel("PCA2")
ax.set_title("K-means Clustering")
plt.show()