In [1]:
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.linear_model import LogisticRegression  # Logistic regression model
from sklearn.metrics import accuracy_score, classification_report  # Model evaluation
from sklearn.impute import SimpleImputer  # For handling missing values
from sklearn.preprocessing import StandardScaler  # For scaling data


In [14]:
# Load the training dataset
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")


# Display the first few rows of the training data

train_data["3P Made"]




Unnamed: 0,3P Made
0,0.5
1,0.1
2,0.0
3,0.0
4,0.4
...,...
938,0.5
939,0.0
940,0.1
941,0.0


In [3]:
# Show column information and check for missing values
train_data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         943 non-null    object 
 1   GP           943 non-null    int64  
 2   MIN          943 non-null    float64
 3   PTS          943 non-null    float64
 4   FGM          943 non-null    float64
 5   FGA          943 non-null    float64
 6   FG%          943 non-null    float64
 7   3P Made      943 non-null    float64
 8   3PA          943 non-null    float64
 9   3P%          937 non-null    float64
 10  FTM          943 non-null    float64
 11  FTA          943 non-null    float64
 12  FT%          943 non-null    float64
 13  OREB         943 non-null    float64
 14  DREB         943 non-null    float64
 15  REB          943 non-null    float64
 16  AST          943 non-null    float64
 17  STL          943 non-null    float64
 18  BLK          943 non-null    float64
 19  TOV     

In [10]:
train_data.isnull().sum()

Unnamed: 0,0
Name,0
GP,0
MIN,0
PTS,0
FGM,0
FGA,0
FG%,0
3P Made,0
3PA,0
3P%,6


In [5]:
# Drop unnecessary columns
X = train_data.drop(['Name', 'Id', 'TARGET_5Yrs'], axis=1)  # Features
y = train_data['TARGET_5Yrs']  # Target variable

# Handle missing values using median imputation
imputer = SimpleImputer(strategy="median")
X_imputed = imputer.fit_transform(X)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Print the shape of the processed data
print("Shape of Processed Data:", X_scaled.shape)


Shape of Processed Data: (943, 19)


In [17]:
# Initialize and train the logistic regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_scaled, y)

len(logistic_model.coef_[0])



19

In [7]:
y_pred = logistic_model.predict(X_scaled)
accuracy = accuracy_score(y, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 71.47%


In [8]:
data_test = test_data.drop(['Name', 'Id'], axis=1)
data_test_imputed = imputer.transform(data_test)
data_test_scaled = scaler.transform(data_test_imputed)

In [9]:
logistic_model.predict(data_test_scaled)

array([0., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1.,
       1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
       1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0.,
       0., 1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0.,
       0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1.,
       1., 1., 1., 1., 1.