In [1]:
# Necessary imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the training CSV and discard the Patient_ID column in one chained
# expression; the notebook treats that column as irrelevant for modelling.
data = pd.read_csv(
    "C:\\Users\\MASSIVE\\Downloads\\datathon\\train.csv"
).drop(columns=["Patient_ID"])

In [2]:
# Preview the first five rows of the training frame.
data.head(n=5)

Unnamed: 0,Age,Marital_Status,Year of Operation,Positive_Axillary_Nodes,Tumor_Size,Radiation_Therapy,Chemotherapy,Hormone_Therapy,Survival_Status
0,77,Married,1962,5,3.0,No,Yes,No,1
1,36,Married,1964,2,1.9,Yes,No,No,1
2,47,Married,1960,5,2.0,No,No,No,0
3,54,Married,1965,0,1.4,No,No,No,0
4,35,Single,1968,5,4.1,Yes,Yes,Yes,1


In [3]:
# Encode the two-valued text columns of the training frame as 1/0.
#
# Series.map() turns every value that is missing from the lookup into NaN, so
# running the plain mapping a second time would replace the already-encoded
# 1/0 values with NaN. Each column is therefore mapped only while it still
# contains at least one raw text label, which keeps this cell safe to re-run.
binary_codes = {
    'Marital_Status': {'Married': 1, 'Single': 0},
    'Radiation_Therapy': {'Yes': 1, 'No': 0},
    'Chemotherapy': {'Yes': 1, 'No': 0},
    'Hormone_Therapy': {'Yes': 1, 'No': 0},
}
for column, codes in binary_codes.items():
    if data[column].isin(list(codes)).any():
        data[column] = data[column].map(codes)

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

# Target is Survival_Status; every other column of `data` is a feature.
y = data["Survival_Status"]
X = data.drop(columns=["Survival_Status"])

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column groups routed to the scaler and to the one-hot encoder respectively.
numeric_features = [
    "Age",
    "Tumor_Size",
    "Positive_Axillary_Nodes",
    "Year of Operation",
]
categorical_features = [
    "Marital_Status",
    "Radiation_Therapy",
    "Chemotherapy",
    "Hormone_Therapy",
]

# Standardize the numeric group and one-hot encode the categorical group
# (first level dropped). Columns in neither group are not passed on.
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(drop='first'), categorical_features),
])

# Preprocessing followed by a k-nearest-neighbours classifier.
knn_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", KNeighborsClassifier()),
])

# Candidate settings: neighbourhood size K and the vote weighting scheme.
param_grid = dict(
    classifier__n_neighbors=[3, 5, 7, 9, 11],
    classifier__weights=['uniform', 'distance'],
)

# 5-fold cross-validated search over the grid, fitted on the training split.
grid_search = GridSearchCV(estimator=knn_model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")

# Score the selected configuration on the held-out split.
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Best Parameters: {'classifier__n_neighbors': 3, 'classifier__weights': 'distance'}
Accuracy: 56.33%


I tried many models, but this one gave me the best results.

In [5]:
# Read the competition test file into `new_data`.
new_data = pd.read_csv(
    'C:\\Users\\MASSIVE\\Downloads\\datathon\\test.csv'
)

# Print the first five rows as a quick look at the columns and values.
print(new_data.head(n=5))


   Patient_ID  Age Marital_Status  Year of Operation  Positive_Axillary_Nodes  \
0        1501   62         Single               1966                        4   
1        1502   33        Married               1960                        4   
2        1503   52        Married               1963                        7   
3        1504   56        Married               1968                        7   
4        1505   70        Married               1968                        1   

   Tumor_Size Radiation_Therapy Chemotherapy Hormone_Therapy  
0         1.4               Yes           No              No  
1         3.8               Yes           No              No  
2         2.1               Yes           No              No  
3         1.6               Yes           No              No  
4         4.1               Yes           No             Yes  


In [15]:
# Remove the identifier column before prediction. errors="ignore" turns the
# drop into a no-op when Patient_ID is already gone, so running this cell
# again does not raise KeyError.
new_data = new_data.drop(columns=["Patient_ID"], errors="ignore")

In [16]:
# Prepare the test frame for prediction: remove the identifier column and
# encode the two-valued text columns as 1/0.
#
# Patient_ID may already have been dropped by an earlier cell; a plain drop
# would then raise KeyError, so errors="ignore" is used to tolerate that.
new_data = new_data.drop(columns=["Patient_ID"], errors="ignore")

# Series.map() turns values missing from the lookup into NaN, so a column is
# mapped only while it still contains raw text labels. This keeps the cell
# safe to re-run on an already-encoded frame.
binary_codes = {
    'Marital_Status': {'Married': 1, 'Single': 0},
    'Radiation_Therapy': {'Yes': 1, 'No': 0},
    'Chemotherapy': {'Yes': 1, 'No': 0},
    'Hormone_Therapy': {'Yes': 1, 'No': 0},
}
for column, codes in binary_codes.items():
    if new_data[column].isin(list(codes)).any():
        new_data[column] = new_data[column].map(codes)

# Verify the mappings
print(new_data.head())


   Age  Marital_Status  Year of Operation  Positive_Axillary_Nodes  \
0   62               0               1966                        4   
1   33               1               1960                        4   
2   52               1               1963                        7   
3   56               1               1968                        7   
4   70               1               1968                        1   

   Tumor_Size  Radiation_Therapy  Chemotherapy  Hormone_Therapy  
0         1.4                  1             0                0  
1         3.8                  1             0                0  
2         2.1                  1             0                0  
3         1.6                  1             0                0  
4         4.1                  1             0                1  


In [8]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,Age,Marital_Status,Year of Operation,Positive_Axillary_Nodes,Tumor_Size,Radiation_Therapy,Chemotherapy,Hormone_Therapy
0,77,1,1962,5,3.0,0,1,0
1,36,1,1964,2,1.9,1,0,0
2,47,1,1960,5,2.0,0,0,0
3,54,1,1965,0,1.4,0,0,0
4,35,0,1968,5,4.1,1,1,1


In [9]:
# Score the prepared test rows with the fitted grid search.
predictions = grid_search.predict(new_data)

# Attach the predicted labels as an extra column next to the features.
new_data = new_data.assign(Predicted_Survival_Status=predictions)

# Print the frame, which now includes the prediction column.
print(new_data)

     Age  Marital_Status  Year of Operation  Positive_Axillary_Nodes  \
0     62               0               1966                        4   
1     33               1               1960                        4   
2     52               1               1963                        7   
3     56               1               1968                        7   
4     70               1               1968                        1   
..   ...             ...                ...                      ...   
495   50               0               1967                        4   
496   53               1               1961                        3   
497   74               1               1963                       11   
498   37               0               1960                       23   
499   76               1               1969                        7   

     Tumor_Size  Radiation_Therapy  Chemotherapy  Hormone_Therapy  \
0           1.4                  1             0                0 

In [10]:
# Write features plus predictions to disk; index=False leaves the row index
# out of the file.
new_data.to_csv(
    path_or_buf="C:\\Users\\MASSIVE\\Downloads\\datathon\\test predicted.csv",
    index=False,
)

print("Predictions saved to CSV file.")

Predictions saved to CSV file.


In [11]:
import pandas as pd

# Reload the test CSV from disk into its own frame.
test_data = pd.read_csv("C:\\Users\\MASSIVE\\Downloads\\datathon\\test.csv")

# The prediction column is attached by row index, so refuse to continue when
# the two frames differ in length.
if len(new_data.index) != len(test_data.index):
    raise ValueError("The number of rows in test_data and new_data do not match.")

# Copy the predictions across as a new column.
test_data = test_data.assign(
    Predicted_Survival_Status=new_data['Predicted_Survival_Status']
)

# NOTE(review): this writes back to the same path that was just read, so the
# original test.csv on disk is replaced. Confirm that is intended.
test_data.to_csv(
    "C:\\Users\\MASSIVE\\Downloads\\datathon\\test.csv",
    index=False,
)

print("Predictions added to test.csv.")


Predictions added to test.csv.


In [12]:


# Reload test.csv; it must already contain the Predicted_Survival_Status
# column, otherwise the selection below raises KeyError.
test_data = pd.read_csv("C:\\Users\\MASSIVE\\Downloads\\datathon\\test.csv")

# Keep only the identifier and the predicted label.
filtered_test_data = test_data.loc[:, ['Patient_ID', 'Predicted_Survival_Status']]

# NOTE(review): this overwrites the file that was just read, leaving only the
# two selected columns on disk. Confirm that is intended.
filtered_test_data.to_csv(
    "C:\\Users\\MASSIVE\\Downloads\\datathon\\test.csv",
    index=False,
)

print("Dropped all columns except patientID and predicted survival status.")


Dropped all columns except patientID and predicted survival status.


In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE  # Optional for handling class imbalance

# Second experiment: KNN on engineered features (age bands plus pairwise
# interaction terms). Expects `data` to be the encoded training frame.

# Separate features and target
X = data.drop("Survival_Status", axis=1)
y = data["Survival_Status"]

# Feature engineering: bucket Age into three labelled bands,
# (0, 30], (30, 60] and (60, 90]. Ages outside these bins become NaN.
X['Age_Category'] = pd.cut(X['Age'], bins=[0, 30, 60, 90], labels=['Young', 'Middle_Aged', 'Old'])

# Columns that are one-hot encoded by the preprocessor further down.
categorical_features = ['Marital_Status', 'Radiation_Therapy', 'Chemotherapy', 'Hormone_Therapy', 'Age_Category']

# Pairwise interaction terms (no squares, no bias column) over every numeric
# column. `poly` is kept as a global so new data can be transformed with it later.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_numeric = X.select_dtypes(include=[np.number])
X_poly = poly.fit_transform(X_numeric)
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(X_numeric.columns))

# Put the categorical columns next to the polynomial features. Columns that
# already appear in the polynomial output show up twice after the concat, so
# only the first occurrence of each column name is kept.
X = pd.concat([X_poly_df, X[categorical_features].reset_index(drop=True)], axis=1)
X = X.loc[:, ~X.columns.duplicated()]

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numeric features are the numeric columns that are NOT handled as categorical.
# Selecting by dtype alone would also pick up the 0/1-coded categorical
# columns, sending them through both the scaler and the one-hot encoder and
# so counting them twice in the KNN distance.
numeric_features = [
    column
    for column in X.select_dtypes(include=[np.number]).columns
    if column not in categorical_features
]

# Preprocessor: mean-impute and standardize numeric columns, one-hot encode
# the categorical ones (first level dropped).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ]), numeric_features),
        ("cat", OneHotEncoder(drop='first'), categorical_features),
    ]
)

# Build a model pipeline
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", KNeighborsClassifier())
])

# Hyperparameter tuning: K from 1 to 20 and both vote weighting schemes,
# selected by 5-fold cross-validated accuracy on the training split.
param_grid = {
    "classifier__n_neighbors": np.arange(1, 21),
    "classifier__weights": ['uniform', 'distance']
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and the corresponding pipeline
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Cross-validate the selected pipeline.
# NOTE(review): this scores on all of X and y, which includes the rows held
# out in X_test. Confirm that is intended.
cv_scores = cross_val_score(best_model, X, y, cv=5)
print(f"Mean CV Accuracy: {np.mean(cv_scores) * 100:.2f}%")

# Optionally, use SMOTE for oversampling (uncomment if needed)
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Fit the selected pipeline on the training split before the final evaluation.
best_model.fit(X_train, y_train)

# Evaluate on test set
test_accuracy = best_model.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.2s
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.1s
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.0s
[CV] END classifier__n_neighbors=2, classifier__weights=uniform; total time=   0.1s
[CV] END 

In [18]:
# Read the competition test file into `new_data` again.
new_data = pd.read_csv(
    'C:\\Users\\MASSIVE\\Downloads\\datathon\\test.csv'
)

# Print the first five rows as a quick look at the columns and values.
print(new_data.head(n=5))

   Patient_ID  Age Marital_Status  Year of Operation  Positive_Axillary_Nodes  \
0        1501   62         Single               1966                        4   
1        1502   33        Married               1960                        4   
2        1503   52        Married               1963                        7   
3        1504   56        Married               1968                        7   
4        1505   70        Married               1968                        1   

   Tumor_Size Radiation_Therapy Chemotherapy Hormone_Therapy  
0         1.4               Yes           No              No  
1         3.8               Yes           No              No  
2         2.1               Yes           No              No  
3         1.6               Yes           No              No  
4         4.1               Yes           No             Yes  


In [19]:
# Prepare the freshly loaded test frame: remove the identifier column and
# encode the two-valued text columns as 1/0.
#
# errors="ignore" makes the drop a no-op when Patient_ID is already gone, so
# running this cell again does not raise KeyError.
new_data = new_data.drop(columns=["Patient_ID"], errors="ignore")

# Series.map() turns values missing from the lookup into NaN, so a column is
# mapped only while it still contains raw text labels. Running this cell again
# therefore leaves the already-encoded values intact.
binary_codes = {
    'Marital_Status': {'Married': 1, 'Single': 0},
    'Radiation_Therapy': {'Yes': 1, 'No': 0},
    'Chemotherapy': {'Yes': 1, 'No': 0},
    'Hormone_Therapy': {'Yes': 1, 'No': 0},
}
for column, codes in binary_codes.items():
    if new_data[column].isin(list(codes)).any():
        new_data[column] = new_data[column].map(codes)

# Verify the mappings
print(new_data.head())

   Age  Marital_Status  Year of Operation  Positive_Axillary_Nodes  \
0   62               0               1966                        4   
1   33               1               1960                        4   
2   52               1               1963                        7   
3   56               1               1968                        7   
4   70               1               1968                        1   

   Tumor_Size  Radiation_Therapy  Chemotherapy  Hormone_Therapy  
0         1.4                  1             0                0  
1         3.8                  1             0                0  
2         2.1                  1             0                0  
3         1.6                  1             0                0  
4         4.1                  1             0                1  


In [21]:
import pandas as pd
import numpy as np

# Score the test frame with the engineered-feature model.
# Relies on `best_model` and `poly` fitted in an earlier cell, and on
# `new_data` holding the encoded test rows.

# Feature engineering on new_data: same age bands as used for training.
new_data['Age_Category'] = pd.cut(new_data['Age'], bins=[0, 30, 60, 90], labels=['Young', 'Middle_Aged', 'Old'])

# Select exactly the columns the polynomial transformer was fitted on, in the
# fitted order. Picking "all numeric columns" instead would break as soon as
# new_data carries any extra numeric column, such as the prediction column
# this cell adds, which is present when the cell is run a second time.
X_new_numeric = new_data[list(poly.feature_names_in_)]
X_new_poly = poly.transform(X_new_numeric)  # transform only; poly is already fitted

# Create a DataFrame for the polynomial features
X_new_poly_df = pd.DataFrame(X_new_poly, columns=poly.get_feature_names_out(X_new_numeric.columns))

# Concatenate with the original categorical features
new_data = pd.concat([X_new_poly_df, new_data[['Marital_Status', 'Radiation_Therapy', 'Chemotherapy', 'Hormone_Therapy', 'Age_Category']].reset_index(drop=True)], axis=1)

# Columns present in both parts appear twice after the concat; keep the first.
new_data = new_data.loc[:, ~new_data.columns.duplicated()]

# Make predictions on new data
predictions = best_model.predict(new_data)

# Add predictions to new_data
new_data['Predicted_Survival_Status'] = predictions

# Save the results to a new CSV file
new_data.to_csv('path_to_save_predictions.csv', index=False)  # Adjust the path as needed

print("Predictions saved successfully!")


Predictions saved successfully!


In [23]:
import pandas as pd

# Reload the test CSV from disk into its own frame.
test_data = pd.read_csv("C:\\Users\\MASSIVE\\Downloads\\datathon\\test.csv")

# The prediction column is attached by row index, so refuse to continue when
# the two frames differ in length.
if len(new_data.index) != len(test_data.index):
    raise ValueError("The number of rows in test_data and new_data do not match.")

# Copy the predictions across as a new column.
test_data = test_data.assign(
    Predicted_Survival_Status=new_data['Predicted_Survival_Status']
)

# NOTE(review): this writes back to the same path that was just read, so the
# original test.csv on disk is replaced. Confirm that is intended.
test_data.to_csv(
    "C:\\Users\\MASSIVE\\Downloads\\datathon\\test.csv",
    index=False,
)

print("Predictions added to test.csv.")

Predictions added to test.csv.


In [24]:
# Reload test.csv; it must already contain the Predicted_Survival_Status
# column, otherwise the selection below raises KeyError.
test_data = pd.read_csv("C:\\Users\\MASSIVE\\Downloads\\datathon\\test.csv")

# Keep only the identifier and the predicted label.
filtered_test_data = test_data.loc[:, ['Patient_ID', 'Predicted_Survival_Status']]

# NOTE(review): this overwrites the file that was just read, leaving only the
# two selected columns on disk. Confirm that is intended.
filtered_test_data.to_csv(
    "C:\\Users\\MASSIVE\\Downloads\\datathon\\test.csv",
    index=False,
)

print("Dropped all columns except patientID and predicted survival status.")

Dropped all columns except patientID and predicted survival status.
