In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
file_path1 = r"D:\.vscode\Datasets\train.csv"
file_path2=r"D:\.vscode\Datasets\test.csv"

# Load the data
if os.path.exists(file_path1):
    data_train = pd.read_csv(file_path1)
    data_test=pd.read_csv(file_path2)
    print(data_test.head(10))
else:
    print("File not found!")


      id  N_Days             Drug      Age Sex Ascites Hepatomegaly Spiders  \
0  15000  3492.0              NaN  21185.0   F     NaN          NaN     NaN   
1  15001  1654.0              NaN  19724.0   M     NaN          NaN     NaN   
2  15002   890.0          Placebo  24621.0   M       N            Y       N   
3  15003  1086.0              NaN  18628.0   F     NaN          NaN     NaN   
4  15004  4453.0          Placebo  20449.0   F       N            Y       N   
5  15005  3086.0  D-penicillamine  15712.0   F       N            N       N   
6  15006   611.0          Placebo  26259.0   F       N            Y       Y   
7  15007   904.0              NaN  25568.0   F     NaN          NaN     NaN   
8  15008  1690.0          Placebo  15574.0   F       N            N       Y   
9  15009  1092.0              NaN  21915.0   F     NaN          NaN     NaN   

  Edema  Bilirubin  Cholesterol  Albumin  Copper  Alk_Phos    SGOT  \
0     N        0.7          NaN     3.14     NaN       NaN  

In [3]:
data_train.dropna(subset=["N_Days"], inplace=True)
data_test.dropna(subset=["N_Days"], inplace=True)


In [4]:
data_test.columns

Index(['id', 'N_Days', 'Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly',
       'Spiders', 'Edema', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
       'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
       'Stage'],
      dtype='object')

In [5]:
for i in range(len(data_train.columns)):
    col = data_train.columns[i]
    if len(data_train[col].unique())<=4:
        print(col)


Drug
Sex
Ascites
Hepatomegaly
Spiders
Edema
Stage
Status


In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
def preprocess_and_impute(data, categorical_columns, target_column='Status', id_column=None):
    categorical_columns = [col for col in categorical_columns if col in data.columns]
    if target_column in categorical_columns:
        categorical_columns.remove(target_column)

    # Continue with your preprocessing logic
    # Keep the target column (Status) separate and don't apply transformations to it
    numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
    for col in numeric_cols:
        if col in categorical_columns or col == target_column:
            continue
        if data[col].isnull().sum() > 0:
            data[col].fillna(data[col].mean(), inplace=True)

    # Apply One-Hot Encoding for categorical columns, excluding the target column
    data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

    return data


In [8]:
# List of categorical columns including the target column
categorical_columns = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema','Status']

# Iterate through each categorical column for imputation
preprocessed_data = preprocess_and_impute(data_train, categorical_columns, id_column='id')
preprocessed_data_test=preprocess_and_impute(data_test, categorical_columns, id_column='id')



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


In [9]:
print(preprocessed_data.columns)  # Check after preprocessing the training data
print(preprocessed_data_test.columns)  # Check after preprocessing the test data


Index(['id', 'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
       'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
       'Stage', 'Status', 'Drug_Placebo', 'Sex_M', 'Ascites_Y',
       'Hepatomegaly_Y', 'Spiders_Y', 'Edema_S', 'Edema_Y'],
      dtype='object')
Index(['id', 'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
       'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
       'Stage', 'Drug_Placebo', 'Drug_Y', 'Sex_M', 'Ascites_Y',
       'Hepatomegaly_Y', 'Spiders_Y', 'Edema_S', 'Edema_Y'],
      dtype='object')


In [10]:
preprocessed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14999 entries, 0 to 14999
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              14999 non-null  int64  
 1   N_Days          14999 non-null  float64
 2   Age             14999 non-null  float64
 3   Bilirubin       14999 non-null  float64
 4   Cholesterol     14999 non-null  float64
 5   Albumin         14999 non-null  float64
 6   Copper          14999 non-null  float64
 7   Alk_Phos        14999 non-null  float64
 8   SGOT            14999 non-null  float64
 9   Tryglicerides   14999 non-null  float64
 10  Platelets       14999 non-null  float64
 11  Prothrombin     14999 non-null  float64
 12  Stage           14999 non-null  float64
 13  Status          14999 non-null  object 
 14  Drug_Placebo    14999 non-null  bool   
 15  Sex_M           14999 non-null  bool   
 16  Ascites_Y       14999 non-null  bool   
 17  Hepatomegaly_Y  14999 non-null  bool

In [11]:
preprocessed_data['Age_in_Years'] = preprocessed_data['Age'] / 365.25
preprocessed_data_test['Age_in_Years'] = preprocessed_data_test['Age'] / 365.25


In [12]:
preprocessed_data.drop(columns=["id","Age"],axis=1,inplace=True)
preprocessed_data_test.drop(columns=["id","Age"],axis=1,inplace=True)

In [13]:
preprocessed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14999 entries, 0 to 14999
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   N_Days          14999 non-null  float64
 1   Bilirubin       14999 non-null  float64
 2   Cholesterol     14999 non-null  float64
 3   Albumin         14999 non-null  float64
 4   Copper          14999 non-null  float64
 5   Alk_Phos        14999 non-null  float64
 6   SGOT            14999 non-null  float64
 7   Tryglicerides   14999 non-null  float64
 8   Platelets       14999 non-null  float64
 9   Prothrombin     14999 non-null  float64
 10  Stage           14999 non-null  float64
 11  Status          14999 non-null  object 
 12  Drug_Placebo    14999 non-null  bool   
 13  Sex_M           14999 non-null  bool   
 14  Ascites_Y       14999 non-null  bool   
 15  Hepatomegaly_Y  14999 non-null  bool   
 16  Spiders_Y       14999 non-null  bool   
 17  Edema_S         14999 non-null  bool

In [14]:
preprocessed_data["Status"].unique()

array(['C', 'D', 'CL'], dtype=object)

In [15]:
preprocessed_data.columns

Index(['N_Days', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos',
       'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage', 'Status',
       'Drug_Placebo', 'Sex_M', 'Ascites_Y', 'Hepatomegaly_Y', 'Spiders_Y',
       'Edema_S', 'Edema_Y', 'Age_in_Years'],
      dtype='object')

In [16]:
features = ['Age_in_Years', 'Cholesterol', 'Bilirubin', 'Prothrombin', 'Copper', 'SGOT']
X_train = preprocessed_data[features]
y_train = preprocessed_data['Status']
X_test = preprocessed_data_test[features]

In [17]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import shuffle

In [18]:
def standardize_data(data, target_column='Status'):
    # Separate target column and features
    X = data.drop(columns=[target_column])
    y = data[target_column]
    
    # Standardize features (excluding the target column)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X_scaled, y

In [19]:
def hyperparameter_tuning(X_train, y_train):
    # Define the models and hyperparameters to tune
    models = {
        'RandomForest': RandomForestClassifier(),
        'SVM': SVC(probability=True),
        'LogisticRegression': LogisticRegression(max_iter=1000),
        'KNN': KNeighborsClassifier(),
        'DecisionTree': DecisionTreeClassifier(),
        'NaiveBayes': GaussianNB()
    }
    
    param_grid = {
        'RandomForest': {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15]},
        'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
        'LogisticRegression': {'C': [0.1, 1, 10]},
        'KNN': {'n_neighbors': [3, 5, 7]},
        'DecisionTree': {'max_depth': [5, 10, 15]},
        'NaiveBayes': {}
    }
    
    best_model = None
    best_score = 0
    
    # Perform GridSearchCV for each model and find the best one
    for model_name, model in models.items():
        print(f"Tuning {model_name}...")
        grid = GridSearchCV(model, param_grid[model_name], cv=3, scoring='accuracy')
        grid.fit(X_train, y_train)
        
        # Update the best model if current one has a better score
        if grid.best_score_ > best_score:
            best_score = grid.best_score_
            best_model = grid.best_estimator_
    
    return best_model

In [20]:
def evaluate_model(model, X_test, y_test):
    # Evaluate the model on the test set
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n{cm}")
    
    return accuracy

In [21]:
preprocessed_data['Status'] = preprocessed_data['Status'].map({'C': 0, 'CL': 1, 'D': 2})


In [None]:
def create_submission_file(model, data):
    X = data.drop(columns=['Status'])
    # Standardize the features
    X_scaled, _ = standardize_data(preprocessed_data, target_column='Status')
    
    # Get the predicted probabilities
    probabilities = model.predict_proba(X_scaled)
    
    # Create a submission DataFrame
    submission = pd.DataFrame(probabilities, columns=['Statusc', 'Statuscl', 'Statusd'])
    submission['id'] = data['id']
    
    # Shuffle the DataFrame if needed
    submission = shuffle(submission)
    
    # Save to CSV
    submission.to_csv('submission_file.csv', index=False)
    print("Submission file saved as 'submission_file.csv'")

# Main Execution Flow
if __name__ == "__main__":
    # Step 1: Hyperparameter Tuning and Model Training
    best_model = hyperparameter_tuning(X_train, y_train)

    # Step 2: Get predicted probabilities for the test set
    probabilities = evaluate_model(best_model, X_test)
    
    # Step 3: Create and Save Submission File
    create_submission_file(best_model, preprocessed_data_test)

Tuning RandomForest...


  _data = np.array(data, dtype=dtype, copy=copy,


Tuning SVM...


In [None]:
print(preprocessed_data_test.columns)
